%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "3.17", %%% date = "02 April 2012", %%% time = "16:12:51 MDT", %%% filename = "multithreading.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "http://www.math.utah.edu/~beebe", %%% checksum = "43933 32089 149448 1511762", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "multithreading; OpenMP; POSIX; pthreads; %%% threads; UNIX; Win32; Windows NT", %%% license = "public domain", %%% supported = "no", %%% docstring = "This bibliography covers publications about %%% multithreaded programming. %%% %%% At version 3.17, the year coverage looked %%% like this: %%% %%% 1973 ( 1) 1987 ( 1) 2001 ( 33) %%% 1974 ( 0) 1988 ( 2) 2002 ( 54) %%% 1975 ( 0) 1989 ( 10) 2003 ( 52) %%% 1976 ( 0) 1990 ( 15) 2004 ( 30) %%% 1977 ( 0) 1991 ( 30) 2005 ( 25) %%% 1978 ( 0) 1992 ( 37) 2006 ( 31) %%% 1979 ( 0) 1993 ( 40) 2007 ( 40) %%% 1980 ( 1) 1994 ( 57) 2008 ( 49) %%% 1981 ( 0) 1995 ( 79) 2009 ( 48) %%% 1982 ( 0) 1996 ( 68) 2010 ( 49) %%% 1983 ( 0) 1997 ( 68) 2011 ( 23) %%% 1984 ( 0) 1998 ( 64) 2012 ( 5) %%% 1985 ( 0) 1999 ( 58) %%% 1986 ( 1) 2000 ( 55) %%% %%% Article: 783 %%% Book: 44 %%% InBook: 1 %%% InCollection: 1 %%% InProceedings: 56 %%% Manual: 4 %%% MastersThesis: 37 %%% PhdThesis: 22 %%% Proceedings: 43 %%% TechReport: 35 %%% %%% Total entries: 1026 %%% %%% OpenMP is an ``Application Program Interface %%% (API) supports multi-platform shared-memory %%% parallel programming in C/C++ and Fortran on %%% all architectures, including Unix platforms %%% and Windows NT platforms. 
Jointly defined by %%% a group of major computer hardware and %%% software vendors, OpenMP is a portable, %%% scalable model that gives shared-memory %%% parallel programmers a simple and flexible %%% interface for developing parallel %%% applications for platforms ranging from the %%% desktop to the supercomputer.'' [from the %%% OpenMP Web site]. For details, visit %%% %%% http://www.openmp.org/ %%% %%% At least two vendors, Kuck & Associates (KAI), %%% %%% http://www.kai.com/parallel/openmp.html %%% %%% and the Portland Group, Inc. (PGI) %%% %%% http://www.pgroup.com/ppro_docs/pgiws_ug/pgi31u11.htm %%% http://www.pgroup.com/ppro_docs/pgiws_ug/pgi31u12.htm %%% %%% provide extensive support of OpenMP. %%% %%% BibTeX citation tags are uniformly chosen as %%% name:year:abbrev, where name is the family %%% name of the first author or editor, year is a %%% 4-digit number, and abbrev is a 3-letter %%% condensation of important title words. %%% Citation tags were automatically generated by %%% software developed for the BibNet Project. %%% %%% In this bibliography, entries are sorted %%% first by ascending year, and within each %%% year, alphabetically by author or editor, %%% and then, if necessary, by the 3-letter %%% abbreviation at the end of the BibTeX %%% citation tag, using the bibsort -byyear %%% utility. Year order has been chosen to %%% make it easier to identify the most recent %%% work. %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility.", %%% } %%% ==================================================================== %%% ==================================================================== %%% Acknowledgement abbreviations: @String{ack-nhfb = "Nelson H. F. 
Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|http://www.math.utah.edu/~beebe/|"} %%% ==================================================================== %%% Institution abbreviations: @String{inst-CSU = "Colorado State University"} @String{inst-CSU:adr = "Fort Collins, CO, USA"} @String{inst-NLRC = "NASA Langley Research Center"} @String{inst-NLRC:adr = "Hampton, VA, USA"} @String{inst-SRC-IDA = "Supercomputing Research Center: IDA"} @String{inst-SRC-IDA:adr = "Lanham, MD, USA"} @String{inst-U-MARYLAND = "University of Maryland"} @String{inst-U-MARYLAND:adr = "College Park, MD, USA"} @String{inst-UCB-EECS = "Department of Electrical Engineering and Computer Science, University of California, Berkeley"} @String{inst-UCB-EECS:adr = "Berkeley, CA, USA"} @String{inst-UIUC-CSRD = "University of Illinois at Urbana-Champaign, Center for Supercomputing Research and Development"} @String{inst-UIUC-CSRD:adr = "Urbana, IL 61801, USA"} @String{inst-UT-CS = "Department of Computer Science, University of Tennessee, Knoxville"} @String{inst-UT-CS:adr = "Knoxville, TN 37996, USA"} %%% ==================================================================== %%% Journal abbreviations: @String{j-ACM-COMM-COMP-ALGEBRA = "ACM Communications in Computer Algebra"} @String{j-ACM-J-EXP-ALGORITHMICS = "ACM Journal of Experimental Algorithmics"} @String{j-ACTA-INFO = "Acta Informatica"} @String{j-ADA-USER = "Ada User"} @String{j-ALGORITHMICA = "Algorithmica"} @String{j-APPL-MATH-COMP = "Applied Mathematics and Computation"} @String{j-BYTE = "Byte Magazine"} @String{j-C-PLUS-PLUS-REPORT = "C++ Report"} @String{j-CACM = "Communications of the ACM"} @String{j-CCCUJ = "C/C++ Users Journal"} @String{j-CCPE = "Concurrency and Computation: Prac\-tice and Experience"} 
%%% Journal abbreviations, COMP-ARCH-NEWS through J-SUPERCOMPUTING.
%%% One string macro per line, names padded so the values line up.
@String{j-COMP-ARCH-NEWS        = "ACM SIGARCH Computer Architecture News"}
@String{j-COMP-J                = "The Computer Journal"}
@String{j-COMP-NET-AMSTERDAM    = "Computer Networks (Amsterdam, Netherlands: 1999)"}
@String{j-COMP-PHYS-COMM        = "Computer Physics Communications"}
@String{j-COMP-SURV             = "ACM Computing Surveys"}
@String{j-COMPUTER              = "Computer"}
@String{j-COMPUTERS-AND-GRAPHICS = "Computers and Graphics"}
@String{j-CPE                   = "Concurrency: Prac\-tice and Experience"}
@String{j-CUJ                   = "C Users Journal"}
@String{j-DDJ                   = "Dr. Dobb's Journal of Software Tools"}
@String{j-DEC-TECH-J            = "Digital Technical Journal"}
@String{j-ELECTRONIK            = "Elektronik"}
@String{j-FUT-GEN-COMP-SYS      = "Future Generation Computer Systems"}
@String{j-HIGHER-ORDER-SYMB-COMPUT = "Higher-Order and Symbolic Computation"}
@String{j-IBM-JRD               = "IBM Journal of Research and Development"}
@String{j-IBM-SYS-J             = "IBM Systems Journal"}
@String{j-IEEE-COMPUT-SCI-ENG   = "IEEE Computational Science \& Engineering"}
@String{j-IEEE-DISTRIB-SYST-ONLINE = "IEEE Distributed Systems Online"}
@String{j-IEEE-MICRO            = "IEEE Micro"}
@String{j-IEEE-TRANS-COMPUT     = "IEEE Transactions on Computers"}
@String{j-IEEE-TRANS-PAR-DIST-SYS = "IEEE Transactions on Parallel and Distributed Systems"}
@String{j-IJHPCA                = "The International Journal of High Performance Computing Applications"}
@String{j-INFO-PROC-LETT        = "Information Processing Letters"}
@String{j-INT-J-PAR-EMER-DIST-SYS = "International Journal of Parallel, Emergent and Distributed Systems: IJPEDS"}
@String{j-INT-J-PARALLEL-PROG   = "International Journal of Parallel Programming"}
@String{j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER = "International Journal on Software Tools for Technology Transfer (STTT)"}
@String{j-INTEL-TECH-J          = "Intel Technology Journal"}
@String{j-J-ACM                 = "Journal of the ACM"}
@String{j-J-GRAPHICS-TOOLS      = "Journal of Graphics Tools: JGT"}
@String{j-J-PAR-DIST-COMP       = "Journal of Parallel and Distributed Computing"}
@String{j-J-SUPERCOMPUTING      = "The Journal of Supercomputing"}
%%% Journal abbreviations, J-SYMBOLIC-COMP through TOCHI.
%%% One string macro per line.  The dash in j-SPE is written as a plain
%%% TeX em-dash ligature (---), as j-PROC-REAL-TIME-SYS-SYMP already
%%% does, so the value typesets without any externally defined macro.
@String{j-J-SYMBOLIC-COMP       = "Journal of Symbolic Computation"}
@String{j-J-SYST-SOFTW          = "The Journal of Systems and Software"}
@String{j-JAVA-REPORT           = "{Java} Report: The Source for {Java} Development"}
@String{j-JAVAWORLD             = "JavaWorld: IDG's magazine for the Java community"}
@String{j-JERIC                 = "ACM Journal on Educational Resources in Computing (JERIC)"}
@String{j-JETC                  = "ACM Journal on Emerging Technologies in Computing Systems (JETC)"}
@String{j-LECT-NOTES-COMP-SCI   = "Lecture Notes in Computer Science"}
@String{j-LINUX-J               = "Linux Journal"}
@String{j-MICROPROC-MICROSYS    = "Microprocessors and Microsystems"}
@String{j-OPEN-SYSTEMS-TODAY    = "Open Systems Today"}
@String{j-OPER-SYS-REV          = "Operating Systems Review"}
@String{j-PARALLEL-COMPUTING    = "Parallel Computing"}
@String{j-PARALLEL-DIST-COMP-PRACT = "Parallel and Distributed Computing Practices"}
@String{j-PARALLEL-PROCESS-LETT = "Parallel Processing Letters"}
@String{j-PROC-REAL-TIME-SYS-SYMP = "Proceedings --- Real-Time Systems Symposium"}
@String{j-QUEUE                 = "ACM Queue: Tomorrow's Computing Today"}
@String{j-SCI-COMPUT-PROGRAM    = "Science of Computer Programming"}
@String{j-SCI-PROG              = "Scientific Programming"}
@String{j-SIAM-J-COMPUT         = "SIAM Journal on Computing"}
@String{j-SIGADA-LETTERS        = "ACM SIGADA Ada Letters"}
@String{j-SIGMETRICS            = "ACM SIGMETRICS Performance Evaluation Review"}
@String{j-SIGMOD                = "SIGMOD Record (ACM Special Interest Group on Management of Data)"}
@String{j-SIGPLAN               = "ACM SIG{\-}PLAN Notices"}
@String{j-SPE                   = "Soft{\-}ware---Prac{\-}tice and Experience"}
@String{j-SUPERCOMPUTER         = "Supercomputer"}
@String{j-TACO                  = "ACM Transactions on Architecture and Code Optimization"}
@String{j-TCBB                  = "IEEE/ACM Transactions on Computational Biology and Bioinformatics"}
@String{j-TECS                  = "ACM Transactions on Embedded Computing Systems"}
@String{j-THEOR-COMP-SCI        = "Theoretical Computer Science"}
@String{j-TISSEC                = "ACM Transactions on Information and System Security"}
@String{j-TOCHI                 = "ACM Transactions
on Computer-Human Interaction"}
%%% Journal names carry no trailing punctuation: the bibliography style
%%% supplies the separators, so a period stored in the name would be
%%% doubled in the typeset output.
@String{j-TOCS                  = "ACM Transactions on Computer Systems"}
@String{j-TODAES                = "ACM Transactions on Design Automation of Electronic Systems"}
@String{j-TODS                  = "ACM Transactions on Database Systems"}
@String{j-TOIS                  = "ACM Transactions on Information Systems"}
@String{j-TOMS                  = "ACM Transactions on Mathematical Software"}
@String{j-TOPLAS                = "ACM Transactions on Programming Languages and Systems"}
@String{j-TOSEM                 = "ACM Transactions on Software Engineering and Methodology"}
@String{j-UNIX-REVIEW           = "UNIX review"}
@String{j-UNIXWORLD-OPEN-COMP   = "UnixWorld's Open Computing"}
@String{j-VLDB-J                = "VLDB Journal: Very Large Data Bases"}
@String{j-WEB-TECHNIQUES        = "Web Techniques"}

%%% ====================================================================
%%% Publisher abbreviations:
%%% Each publisher macro pub-X is paired with an address macro pub-X:adr.
@String{pub-ACM                 = "ACM Press"}
@String{pub-ACM:adr             = "New York, NY 10036, USA"}
@String{pub-AP                  = "Academic Press"}
@String{pub-AP:adr              = "New York, USA"}
@String{pub-APRESS              = "Apress"}
@String{pub-APRESS:adr          = "Berkeley, CA, USA"}
@String{pub-AW                  = "Ad{\-d}i{\-s}on-Wes{\-l}ey"}
@String{pub-AW:adr              = "Reading, MA, USA"}
@String{pub-AWDP                = "Ad{\-d}i{\-s}on-Wes{\-l}ey Developers Press"}
@String{pub-AWDP:adr            = "Reading, MA, USA"}
@String{pub-EYROLLES            = "Editions Eyrolles"}
@String{pub-EYROLLES:adr        = "Paris, France"}
@String{pub-HERMES              = "Hermes"}
@String{pub-HERMES:adr          = "Paris, France"}
@String{pub-IEEE                = "IEEE Computer Society Press"}
@String{pub-IEEE:adr            = "1109 Spring Street, Suite 300, Silver Spring, MD 20910, USA"}
@String{pub-KLUWER              = "Kluwer Academic Publishers"}
@String{pub-KLUWER:adr          = "Dordrecht, The Netherlands; Boston, MA, USA"}
@String{pub-MCGRAW-HILL         = "Mc{\-}Graw-Hill"}
@String{pub-MCGRAW-HILL:adr     = "New York, NY, USA"}
@String{pub-MIT                 = "MIT Press"}
@String{pub-MIT:adr             = "Cambridge, MA, USA"}
@String{pub-MORGAN-KAUFMANN     = "Morgan Kaufmann Publishers"}
@String{pub-MORGAN-KAUFMANN:adr = "Los Altos, CA 94022, USA"}
@String{pub-MORGAN-KAUFMANN:adrnew = "2929
Campus Drive, Suite 260, San Mateo, CA 94403, USA"}
%%% Publisher abbreviations, NTIS through WILEY.  The SunSoft address
%%% spells the city "Mountain View", matching pub-SUN:adr.
@String{pub-NTIS                = "National Technical Information Service"}
@String{pub-NTIS:adr            = "Washington, DC, USA"}
@String{pub-ORA                 = "O'Reilly \& Associates, Inc."}
@String{pub-ORA:adr             = "981 Chestnut Street, Newton, MA 02164, USA"}
@String{pub-PH                  = "Pren{\-}tice-Hall"}
@String{pub-PH:adr              = "Englewood Cliffs, NJ 07632, USA"}
@String{pub-PHI                 = "Pren{\-}tice-Hall International"}
@String{pub-PHI:adr             = "Englewood Cliffs, NJ 07632, USA"}
@String{pub-PHPTR               = "P T R Pren{\-}tice-Hall"}
@String{pub-PHPTR:adr           = "Englewood Cliffs, NJ 07632, USA"}
@String{pub-SAMS                = "Howard W. Sams"}
@String{pub-SAMS:adr            = "Indianapolis, IN 46268, USA"}
@String{pub-SUN                 = "Sun Microsystems"}
@String{pub-SUN:adr             = "2550 Garcia Avenue, Mountain View, CA 94043, USA"}
@String{pub-SUN-MICROSYSTEMS-PRESS = "Sun Microsystems Press"}
@String{pub-SUN-MICROSYSTEMS-PRESS:adr = "Palo Alto, CA, USA"}
@String{pub-SUNSOFT             = "SunSoft Press"}
@String{pub-SUNSOFT:adr         = "Mountain View, CA, USA"}
@String{pub-SV                  = "Spring{\-}er-Ver{\-}lag"}
@String{pub-SV:adr              = "Berlin, Germany~/ Heidelberg, Germany~/ London, UK~/ etc."}
@String{pub-UKUUG               = "UK Unix Users Group"}
@String{pub-UKUUG:adr           = "Buntingford, Herts, UK"}
@String{pub-USENIX              = "USENIX Association"}
@String{pub-USENIX:adr          = "Berkeley, CA, USA"}
@String{pub-WILEY               = "John Wiley and Sons"}
@String{pub-WILEY:adr           = "New York, NY, USA; London, UK; Sydney, Australia"}

%%% ====================================================================
%%% Series abbreviations:
@String{ser-LNCS                = "Lecture Notes in Computer Science"}

%%% ====================================================================
%%% Bibliography entries:
@Article{Bettcher:1973:TSR,
  author =       "C. W.
Bettcher",
  title =        "Thread standardization and relative cost",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "2",
  number =       "1",
  pages =        "9--9",
  month =        jan,
  year =         "1973",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:28 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  remark =       "This is a reprint of an article published in the {\em
                 Journal of the Society of Automotive Engineers}, Volume
                 XVIII, Number 2, p. 131, February 1926, about the cost
                 of the lack of standardization of screw threads. {\em
                 Computer Architecture News\/} Editor-in-Chief Caxton C.
                 Foster has added a hand-written note ``of course, there
                 is no message here for {\em us}.''",
}

%%% The DOI field of the next entry holds the bare identifier only (no
%%% resolver URL), so that a style which prepends its own resolver
%%% prefix produces a working link.
@Article{Smith:1980:ASD,
  author =       "Connie Smith and J. C. Browne",
  title =        "Aspects of software design analysis: {Concurrency} and
                 blocking",
  journal =      j-SIGMETRICS,
  volume =       "9",
  number =       "2",
  pages =        "245--253",
  month =        "Summer",
  year =         "1980",
  CODEN =        "????",
  DOI =          "10.1145/1009375.806169",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 10:54:53 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper extends previous work on development of a
                 methodology for the prediction of the performance of
                 computer software systems from design level
                 specifications and continuing through implementation.
                 The effects of synchronized behavior, such as results
                 from data reservation in multi-thread executions of
                 data base systems, and competition for host system
                 resources are incorporated. The previous methodology
                 uses hierarchical graphs to represent the execution of
                 software on some host computer system (or on some
                 abstract machine).
Performance metrics such as response time were obtained from analysis of these graphs assuming execution of a single copy on a dedicated host. This paper discusses the mapping of these execution graphs upon queueing network models of the host computing environment to yield performance metric estimates for more complex and realistic processing environments.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", } @Article{Jonak:1986:EFL, author = "J. E. Jonak", title = "Experience with a {FORTH}-like language", journal = j-SIGPLAN, volume = "21", number = "2", pages = "27--36", month = feb, year = "1986", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:14:55 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110 (Systems analysis and programming); C6140D (High level languages)", corpsource = "Sperry Network Syst., London, UK", fjournal = "ACM SIGPLAN Notices", keywords = "FORTH; languages; programming; threaded code language", pubcountry = "USA A03", subject = "D.3.2 Software, PROGRAMMING LANGUAGES, Language Classifications, FORTH", treatment = "P Practical", } @Book{McJones:1987:EUS, author = "Paul R. 
McJones and Garret Frederick Swart", title = "Evolving the {UNIX} system interface to support multithreaded programs: The {Topaz Operating System} programmer's manual", volume = "21", publisher = "Digital Systems Research Center", address = "Palo Alto, CA, USA", pages = "100", day = "28", month = sep, year = "1987", LCCN = "QA76.76.O63M42 1987", bibdate = "Fri Aug 7 08:29:38 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Systems Research Center", acknowledgement = ack-nhfb, keywords = "computer networks; Computer networks; electronic data processing -- distributed processing; Electronic data processing -- Distributed processing; multithreaded operating system interface -- Topaz operating; Operating systems (Computers); operating systems (computers); system; UNIX (computer file); UNIX (Computer operating system)", } @Article{Gilbert:1988:DVN, author = "P. D. Gilbert", title = "Development of the {VAX NOTES} system", journal = j-DEC-TECH-J, volume = "1", number = "6", pages = "117--124", month = feb, year = "1988", CODEN = "DTJOEL", ISSN = "0898-901X", bibdate = "Thu Mar 20 18:15:43 MST 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6110B (Software engineering techniques); C7410F (Communications)", corpsource = "Digital Equipment Corp., Hudson, MA, USA", fjournal = "Digital Technical Journal", keywords = "callable interface; communications tool; computer conferencing; DEC; DEC computers; discussions; human factors; human-factors engineering; interfaces; medium; multiprogramming; multitasking; multithreaded server; online; program; program testing; software engineering; storage; technical writer; teleconferencing; testing; user; user interface; VAX NOTES", treatment = "P Practical", } @Article{Halstead:1988:MMP, author = "R. H. {Halstead, Jr.} and T. 
Fujita",
  title =        "{MASA}: a multithreaded processor architecture for
                 parallel symbolic computing",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "16",
  number =       "2",
  pages =        "443--451",
  month =        may,
  year =         "1988",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@TechReport{Agarwal:1989:PTM,
  author =       "Anant Agarwal",
  title =        "Performance tradeoffs in multithreaded processors",
  number =       "89-566",
  institution =  "Massachusetts Institute of Technology, Microsystems
                 Program Office",
  address =      "Cambridge, MA, USA",
  pages =        "30",
  year =         "1989",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "VLSI memo",
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): the keyword list below read "toke flow" in the
%%% imported record; it is taken to be a typo for "token flow" (the
%%% dataflow term) -- confirm against the source bibliography.
%%% The "??" and "????" values are placeholders for data not yet known.
@Article{Amamiya:1989:DFC,
  author =       "M. Amamiya",
  title =        "Data Flow Computing and Parallel Reduction Machine",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "4",
  number =       "??",
  pages =        "53--67",
  month =        "????",
  year =         "1989",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Wed Feb 27 18:37:19 2002",
  bibsource =    "ftp://ftp.ira.uka.de/bibliography/Compiler/Functional.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  fjournal =     "Future Generation Computer Systems",
  keywords =     "functional cell token flow multi-thread control flow
                 architecture",
}

@TechReport{Birrell:1989:IPT,
  author =       "Andrew D.
Birrell", title = "An introduction to programming with threads", type = "SRC reports", number = "35", institution = "Digital Systems Research Center", address = "Palo Alto, CA, USA", pages = "35", day = "6", month = jan, year = "1989", LCCN = "QA76.6.B5729 1989", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "parallel programming (computer science); synchronization", } @Article{Briot:1989:OAS, author = "Jean-Pierre Briot", title = "From objects to actors: study of a limited symbiosis in {Smalltalk-80}", journal = j-SIGPLAN, volume = "24", number = "4", pages = "69--72", month = apr, year = "1989", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:15:37 MST 2003", bibsource = "Compendex database; http://portal.acm.org/; http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/67386/p69-briot/", abstract = "In this paper we describe an implementation of actors in Smalltalk-80, named Actalk. This attempt is designed as a minimal extension preserving the Smalltalk-80 language. Actors are active and autonomous objects, as opposed to standard passive Smalltalk-80 objects. An actor is built from a standard Smalltalk-80 object by associating a process with it and by serializing the messages it could receive into a queue. We will study the cohabitation and synergy between the two models of computations: transfer of active messages (message and thread of activity) between passive objects, and exchange of passive messages between active objects. 
We propose a sketch of methodology in order to have a safe combination between these two programming paradigms.", acknowledgement = ack-nhfb, affiliation = "Univ Paris VI", affiliationaddress = "Paris, Fr", classification = "723", conference = "Proceedings of the ACM SIGPLAN Workshop on Object-Based Concurrent Programming", confname = "Proceedings of the ACM SIGPLAN workshop on Object-based concurrent programming, September 26--27 1988, San Diego, CA", fjournal = "ACM SIGPLAN Notices", journalabr = "SIGPLAN Not", keywords = "Actor Based Systems; Computer Metatheory--Programming Theory; Computer Programming Languages; Concurrent Programming; Design; design; languages; Object-Based Programming; Smalltalk-80", meetingaddress = "San Diego, CA, USA", meetingdate = "Sep 26--27 1988", meetingdate2 = "09/26--27/88", subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Smalltalk-80. {\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Concurrency.", } @Article{Caromel:1989:GMC, author = "Denis Caromel", title = "A general model for concurrent and distributed object-oriented programming", journal = j-SIGPLAN, volume = "24", number = "4", pages = "102--104", month = apr, year = "1989", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:15:37 MST 2003", bibsource = "Compendex database; http://portal.acm.org/; http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/67386/p102-caromel/", abstract = "This paper presents a general model supporting object-oriented programming in concurrent as well as distributed environments. The model combines the advantages of remote procedure calls with those of message passing. 
It relies on the following concepts: All objects are not active but the active entities are objects, Asynchronous Message Passing with Data-driven synchronization, and Service mechanism allowing an explicit thread of control.", acknowledgement = ack-nhfb, affiliation = "CNRS", affiliationaddress = "Vandoeuvres-les-Nancy, Fr", classification = "722; 723", conference = "Proceedings of the ACM SIGPLAN Workshop on Object-Based Concurrent Programming", confname = "Proceedings of the ACM SIGPLAN workshop on Object-based concurrent programming, September 26--27 1988, San Diego, CA", fjournal = "ACM SIGPLAN Notices", journalabr = "SIGPLAN Not", keywords = "Computer Systems Programming; Computer Systems, Digital--Distributed; Concurrent Programming; design; Multiprocessing Programs; Object-Oriented Programming", meetingaddress = "San Diego, CA, USA", meetingdate = "Sep 26--27 1988", meetingdate2 = "09/26--27/88", subject = "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming. {\bf D.1.m} Software, PROGRAMMING TECHNIQUES, Miscellaneous. {\bf D.4.7} Software, OPERATING SYSTEMS, Organization and Design, Distributed systems. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Concurrency.", } @MastersThesis{CarrerasVaquer:1989:APE, author = "Carlos {Carreras Vaquer}", title = "Architecture and performance evaluation of a multithreaded cache design", type = "Thesis (M.S. in Engineering)", school = "University of Texas at Austin", address = "Austin, TX, USA", pages = "xii + 108", year = "1989", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Cache memory; Computer architecture; Computer storage devices; Integrated circuits -- Very large scale integration; Microprocessors", } @TechReport{Caswell:1989:IMD, author = "Deborah L. Caswell and David L. 
Black", title = "Implementing a {Mach} debugger for multithreaded applications", type = "Research paper", number = "CMU-CS-89-154", institution = "Carnegie Mellon University, Computer Science Dept.", address = "Pittsburgh, PA, USA", pages = "13", month = nov, year = "1989", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "To appear in the Conference Proceedings of Winter 1990 USENIX Technical Conference and Exhibition, Washington, DC, January, 1990.", abstract = "Multiple threads of control add new challenges to the task of application debugging, and require the development of new debuggers to meet these challenges. This paper describes the design and implementation of modifications to an existing debugger (gdb) for debugging multithreaded applications under the Mach operating system. It also describes the operating system facilities that support it. Although certain implementation details are specific to Mach, the underlying design principles are applicable to other systems that support threads in a Unix compatible environment.", acknowledgement = ack-nhfb, annote = "Supported by the Space and Naval Warfare Systems Command.", keywords = "Debugging in computer science -- Computer programs", } @Article{Massalin:1989:TIO, author = "H. Massalin and C. 
Pu", title = "Threads and input\slash output in the synthesis kernel", journal = j-OPER-SYS-REV, volume = "23", number = "5", pages = "191--201", month = dec, year = "1989", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 12:47:29 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @MastersThesis{Plyler:1989:AMC, author = "Kevin Brian Plyler", title = "Adding multithreaded capabilities to the process manager of the {BIGSAM} distributed operating system", type = "Thesis (M.S.)", school = "Arizona State University", address = "Tempe, AZ, USA", pages = "x + 105 + 2", year = "1989", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Electronic data processing -- Distributed processing; Multiprocessors; Operating systems (Computers)", } @Article{Schonberg:1989:FDA, author = "Edith Schonberg", title = "On-the-fly detection of access anomalies", journal = j-SIGPLAN, volume = "24", number = "7", pages = "285--297", month = jul, year = "1989", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:15:41 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/contents/proceedings/pldi/73141/index.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/73141/p285-schonberg/", abstract = "Access anomalies are a common class of bugs in shared-memory parallel programs. An access anomaly occurs when two concurrent execution threads both write (or one thread reads and the other writes) the same shared memory location without coordination. Approaches to the detection of access anomalies include static analysis, post-mortem trace analysis, and on-the-fly monitoring. 
A general on-the-fly algorithm for access anomaly
                 detection is presented, which can be applied to
                 programs with both nested fork-join and synchronization
                 operations. The advantage of on-the-fly detection over
                 post-mortem analysis is that the amount of storage used
                 can be greatly reduced by data compression techniques
                 and by discarding information as soon as it becomes
                 obsolete. In the algorithm presented, the amount of
                 storage required at any time depends only on the number
                 V of shared variables being monitored and the number N
                 of threads, not on the number of synchronizations. Data
                 compression is achieved by the use of two techniques
                 called merging and subtraction. Upper bounds on storage
                 are shown to be $V \times N^2$ for merging and $V
                 \times N$ for subtraction.",
  acknowledgement = ack-nhfb,
  affiliationaddress = "New York, NY, USA",
  annote =       "Published as part of the Proceedings of PLDI'89.",
  classification = "722; 723",
  conference =   "Proceedings of the SIGPLAN '89 Conference on
                 Programming Language Design and Implementation",
  fjournal =     "ACM SIGPLAN Notices",
  journalabr =   "SIGPLAN Not",
  keywords =     "Access Anomalies; algorithms; Computer Operating
                 Systems; Computer Programming Languages--Design;
                 Computer Systems, Digital--Parallel Processing;
                 languages; Parallel Programs; Program Processors",
  meetingaddress = "Portland, OR, USA",
  meetingdate =  "Jun 21--23 1989",
  meetingdate2 = "06/21--23/89",
  sponsor =      "ACM, Special Interest Group on Programming Languages,
                 New York; SS NY, USA",
  subject =      "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming. {\bf D.3.2} Software,
                 PROGRAMMING LANGUAGES, Language Classifications, Ada.
                 {\bf D.2.2} Software, SOFTWARE ENGINEERING, Design
                 Tools and Techniques, Flow charts.",
}

%%% In the abstract of the entry that closes just above, the two storage
%%% bounds are written as TeX math (V times N squared, and V times N)
%%% in place of the abstracting database's literal MUL placeholder.
@InProceedings{Caswell:1990:IMD,
  author =       "D. Caswell and D.
Black", title = "Implementing a {Mach} debugger for multithreaded applications", crossref = "Anonymous:1990:PWU", pages = "25--39", year = "1990", bibdate = "Sat Sep 28 20:03:34 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Hewlett Packard Labs., Palo Alto, CA, USA", classification = "C6150G (Diagnostic, testing, debugging and evaluating systems); C6150J (Operating systems)", keywords = "Application debugging; Mach debugger; Mach operating system; Multithreaded applications; Operating system facilities; Underlying design principles; Unix compatible environment", thesaurus = "Operating systems [computers]; Program debugging; Unix", } @Article{Colvin:1990:CTS, author = "Gregory Colvin", title = "{CUG306} Thread and Synapsys", journal = j-CUJ, volume = "8", type = "CUG New Release", number = "3", pages = "131--??", month = mar, year = "1990", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Colvin:1990:MLT, author = "Gregory Colvin", title = "Multitasking With Lightweight Threads", journal = j-CUJ, volume = "8", number = "3", pages = "55--??", month = mar, year = "1990", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Eggers:1990:TEI, author = "S. J. Eggers and David R. Keppel and Eric J. Koldinger and Henry M. 
Levy", title = "Techniques for efficient inline tracing on a shared-memory multiprocessor", journal = j-SIGMETRICS, volume = "18", number = "1", pages = "37--47", month = may, year = "1990", CODEN = "????", DOI = "http://doi.acm.org/10.1145/98457.98501", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:09:08 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "While much current research concerns multiprocessor design, few traces of parallel programs are available for analyzing the effect of design trade-offs. Existing trace collection methods have serious drawbacks: trap-driven methods often slow down program execution by more than 1000 times, significantly perturbing program behavior; microcode modification is faster, but the technique is neither general nor portable. This paper describes a new tool, called MPTRACE, for collecting traces of multithreaded parallel programs executing on shared-memory multiprocessors. MPTRACE requires no hardware or microcode modification; it collects complete program traces; it is portable; and it reduces execution-time dilation to less than a factor 3. MPTRACE is based on inline tracing, in which a program is automatically modified to produce trace information as it executes. We show how the use of compiler flow analysis techniques can reduce the amount of data collected and therefore the runtime dilation of the traced program. We also discuss problematic issues concerning buffering and writing of trace data on a multiprocessor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", } @Article{Faust:1990:POO, author = "John E. Faust and Henry M. 
Levy", title = "The performance of an object-oriented threads package", journal = j-SIGPLAN, volume = "25", number = "10", pages = "278--288", month = oct, year = "1990", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:15:57 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Gonzalez:1990:MSC, author = "Dean W. Gonzalez", title = "Multitasking Software Components", journal = j-SIGADA-LETTERS, volume = "10", number = "1", pages = "92--96", month = jan # "\slash " # feb, year = "1990", CODEN = "AALEE5", ISSN = "0736-721X", ISSN-L = "0736-721X", bibdate = "Thu Sep 28 07:33:23 MDT 2000", bibsource = "ftp://ftp.uu.net/library/bibliography; http://www.adahome.com/Resources/Bibliography/articles.ref; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6110B (Software engineering techniques); C6120 (File organisation)", fjournal = "ACM SIGADA Ada Letters", keywords = "Ada; Ada parameter passing semantics; concurrency, tasking, reuse; concurrent forms; data integrity; data structure manipulation routines; data structures; multiple; parallel programming; reusability; semaphore calls; software; threads of control", treatment = "P Practical", } @InProceedings{Hansen:1990:EPA, author = "G. J. Hansen and C. A. Linthicum and G. Brooks", title = "Experience with a performance analyzer for multithreaded applications", crossref = "IEEE:1990:PSN", pages = "124--131", year = "1990", bibdate = "Wed Apr 15 18:34:48 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C5470 (Performance evaluation and testing); C6150E (General utility programs); C6150G (Diagnostic, testing, debugging and evaluating systems)", corpsource = "CONVEX Comput. 
Corp., Richardson, TX, USA", keywords = "CONVEX C200 series computers; Convex OS V8.0; CONVEX performance analyzer, CX/sub pa/; loops; multiprocessing systems; multithreaded applications; operating system facilities; parallel code monitoring; performance evaluation; profiling data; profiling information; time-sharing environment; time-sharing systems; Unix; UNIX based operating system", sponsororg = "IEEE; ACM; Lawrence Livermore Nat. Lab.; Los Alamos Nat. Lab.; NASA Ames Res. Center; Nat. Center Atmos. Res.; NSF; SIAM; Supercomput. Res. Center", treatment = "P Practical; X Experimental", } @Article{Nordstrom:1990:TL, author = "D. J. Nordstrom", title = "Threading {Lisp}", journal = j-SIGPLAN, volume = "25", number = "2", pages = "17--24", month = feb, year = "1990", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:15:50 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @TechReport{Polychronopoulos:1990:ASC, author = "Constantine D. Polychronopoulos", title = "Auto scheduling: control flow and data flow come together", type = "Technical Report", number = "CSRD 1058", institution = inst-UIUC-CSRD, address = inst-UIUC-CSRD:adr, pages = "28", month = dec, year = "1990", bibdate = "Fri Aug 30 08:01:51 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper presents a framework we term auto-scheduling, which brings together the control flow and data flow models by combining most of the advantages and excluding the major disadvantages of the two familiar models. Auto-scheduling can be viewed either as an abstract architectural model or as a parallel program compilation framework.
While in ordinary environments parallel task creation and scheduling is done by the operating system, or at best the run-time library, in auto-scheduling task creation and scheduling is performed by the user program itself, making parallel processing affordable at fine-granularity levels. Under auto-scheduling the compiler does not only generate object code, but it `lends' its knowledge about a program to the parallel instruction threads of that program, allowing them to manage, activate, and schedule themselves at run-time, without the need of an external monitor. This is done by means of special drive-code injected by the compiler to each schedulable unit of a program (task, thread, etc). We argue that auto-scheduling offers an optimal approach for exploiting parallelism on real parallel computer systems.", acknowledgement = ack-nhfb, annote = "Title on P. 1: Auto-scheduling: control flow and data flow come together. Supported in part by the National Science Foundation. Supported in part by the U.S. Department of Energy. Supported in part by Digital Equipment Corporation.", keywords = "Parallel processing (Electronic computers); Scheduling (Management)", } @InProceedings{Presotto:1990:MSP, author = "D. L. Presotto", booktitle = "UKUUG. UNIX - The Legend Evolves. 
Proceedings of the Summer 1990 UKUUG Conference", title = "Multiprocessor Streams for {Plan 9}", publisher = pub-UKUUG, address = pub-UKUUG:adr, pages = "11--19 (of xi + 260)", month = "????", year = "1990", ISBN = "0-9513181-7-9", ISBN-13 = "978-0-9513181-7-1", LCCN = "????", bibdate = "Sat Mar 22 15:10:17 MST 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classcodes = "C6150J (Operating systems)", conflocation = "London, UK; 9-13 July 1990", corpsource = "AT&T Bell Lab., Murray Hill, NJ, USA", keywords = "abstraction; input-output programs; kernel; multi-threaded; multiprocessing programs; multiprocessor; Plan 9 kernel; Streams; system call interface; Unix", treatment = "P Practical", } @TechReport{Saavedra-Barrera:1990:AMA, author = "Rafael H. Saavedra-Barrera and David E. Culler and Thorsten von Eicken", title = "Analysis of multithreaded architectures for parallel computing", type = "Report", number = "UCB/CSD 90/569", institution = "University of California, Berkeley, Computer Science Division", address = "Berkeley, CA, USA", pages = "10", month = apr, year = "1990", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "To appear in the 2nd Annual ACM Symposium on Parallel Algorithms and Architectures, Crete, Greece, July 1990.", abstract = "Multithreading has been proposed as an architectural strategy for tolerating latency in multiprocessors and, through limited empirical studies, shown to offer promise. This paper develops an analytical model of multithreaded processor behavior based on a small set of architectural and program parameters. The model gives rise to a large Markov chain, which is solved to obtain a formula for processor efficiency in terms of the number of threads per processor, the remote reference rate, the latency, and the cost of switching between threads.
It is shown that a multithreaded processor exhibits three operating regimes: linear (efficiency is proportional to the number of threads), transition, and saturation (efficiency depends only on the remote reference rate and switch cost). Formulae for regime boundaries are derived. The model is embellished to reflect cache degradation due to multithreading, using an analytical model of cache behavior, demonstrating that returns diminish as the number threads becomes large. Predictions from the embellished model correlate well with published empirical measurements. Prescriptive use of the model under various scenarios indicates that multithreading is effective, but the number of useful threads per processor is fairly small.", acknowledgement = ack-nhfb, annote = "Supported in part by NASA. Supported in part by the National Science Foundation through the UCB Mammoth project.", keywords = "Computer architecture; Multiprocessors", } @Article{Schmitt:1990:CEM, author = "David A. Schmitt", title = "{C} Extensions For Multi-Threading", journal = j-CUJ, volume = "8", number = "8", pages = "33--??", month = aug, year = "1990", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @MastersThesis{Stapleton:1990:DSS, author = "Joseph Francis Stapleton", title = "Dynamic server selection in a multithreaded network computing environment", type = "Thesis (M.S.)", school = "Iowa State University", address = "Ames, IA, USA", pages = "66", year = "1990", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @TechReport{Agarwal:1991:PTM, author = "Anant Agarwal", title = "Performance tradeoffs in multithreaded processors", type = "Technical report", number = "MIT/LCS/TR 501; VLSI memo no. 
89-566", institution = "Laboratory for Computer Science, Massachusetts Institute of Technology", address = "Cambridge, MA, USA", pages = "39", year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Beddow:1991:MTC, author = "A. J. M. Beddow", title = "Multi-Threaded {C} Functions", journal = j-CUJ, volume = "9", number = "1", pages = "57--??", month = jan, year = "1991", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Canetti:1991:PCP, author = "R. Canetti and L. P. Fertig and S. A. Kravitz and D. Malki and R. Y. Pinter and S. Porat and A. Teperman", title = "The parallel {C} ({pC}) programming language", journal = j-IBM-JRD, volume = "35", number = "5/6", pages = "727--741", month = sep # "\slash " # nov, year = "1991", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Tue Mar 25 14:26:59 MST 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The authors describe pC (parallel C), an extension of the ANSI C programming language to support medium- to large-grain parallel programming in both shared- and distributed-memory environments. pC aims to make programming for parallel processors accessible to the C community by enriching the C programming model with a small set of constructs supporting parallelism. pC supports shared- and distributed-memory environments via a hierarchical computational model. A pC application comprises a static collection of tasks with disjoint memory spaces. A dynamic collection of threads runs within each task, sharing the data and code of the task. Language constructs specify concurrent execution of threads within a single task. 
Additional language constructs specify the interactions between threads through the following mechanisms: initiation of threads in remote tasks by remote function call, mailbox-based message passing, and synchronization primitives. The paper introduces the computational model and language constructs of pC and describes a prototype pC compiler and run-time system for the Mach operating system. Several program examples illustrate the utility of pC constructs.", acknowledgement = ack-nhfb, affiliation = "Dept. of Comput. Sci., Technion-Israel Inst. of Technol., Haifa, Israel", classcodes = "C6140D (High level languages); C6110P (Parallel programming); C6150C (Compilers, interpreters and other processors)", classification = "C6110P (Parallel programming); C6140D (High level languages); C6150C (Compilers, interpreters and other processors)", corpsource = "Dept. of Comput. Sci., Technion-Israel Inst. of Technol., Haifa, Israel", fjournal = "IBM Journal of Research and Development", keywords = "ANSI C programming language; C language; C programming; C programming model; Disjoint memory spaces; disjoint memory spaces; Distributed-memory; distributed-memory; function call; Hierarchical computational model; hierarchical computational model; Language constructs; language constructs; Mach; Mach operating system; Mailbox-based message passing; mailbox-based message passing; model; operating system; Parallel C; parallel C; parallel languages; Parallel programming; parallel programming; Parallelism; parallelism; PC; pC; PC compiler; pC compiler; program compilers; remote; Remote function call; Run-time system; run-time system; Shared memory; shared memory; Synchronization; synchronization; Tasks; tasks; Threads; threads", thesaurus = "C language; Parallel languages; Program compilers", treatment = "P Practical", } @Article{Ching:1991:EAP, author = "W.-M. Ching and D. 
Ju", title = "Execution of automatically parallelized {APL} programs on {RP3}", journal = j-IBM-JRD, volume = "35", number = "5/6", pages = "767--777", month = sep # "\slash " # nov, year = "1991", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Tue Mar 25 14:26:59 MST 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The authors have implemented an experimental APL/C compiler, which accepts ordinary APL programs and produces C programs. They have also implemented a run-time environment that supports the parallel execution of these C programs on the RP3 computer, a shared-memory, 64-way MIMD machine built at the IBM Thomas J. Watson Research Center. The APL/C compiler uses the front end of the APL/370 compiler and imposes the same restrictions, but requires no parallelization directives from the user. The run-time environment is based on simple synchronization primitives and is implemented using Mach threads. They report the speedups of several compiled programs running on RP3 under the Mach operating system. The current implementation exploits only data parallelism. They discuss the relationship between the style of an APL program and its expected benefit from the automatic parallel execution provided by the compiler.", acknowledgement = ack-nhfb, affiliation = "IBM Thomas J. Watson Res. Center, Yorktown Heights, NY, USA", classcodes = "C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems); C6140D (High level languages)", classification = "C6140D (High level languages); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems)", corpsource = "IBM Thomas J. Watson Res. 
Center, Yorktown Heights, NY, USA", fjournal = "IBM Journal of Research and Development", keywords = "APL; APL/370 compiler; APL/C; APL/C compiler; Automatically parallelized APL programs; automatically parallelized APL programs; C language; C programs; compiler; compilers; Data parallelism; data parallelism; Mach operating; Mach operating system; Mach threads; multiprocessing programs; program; RP3; Shared-memory; shared-memory; synchronisation; Synchronization primitives; synchronization primitives; system", thesaurus = "APL; C language; Multiprocessing programs; Program compilers; Synchronisation", treatment = "P Practical", } @Article{Chiueh:1991:MTV, author = "Tzi-cker Chiueh", title = "Multi-threaded vectorization", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "352--361", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Culler:1991:FGPa, author = "David E. Culler and Anurag Sah and Klaus E. Schauser and Thorsten von Eicken and John Wawrzynek", title = "Fine-grain parallelism with minimal hardware support: a compiler-controlled threaded abstract machine", journal = j-COMP-ARCH-NEWS, volume = "19", number = "2", pages = "164--175", month = apr, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Culler:1991:FGPb, author = "David E. Culler and Anurag Sah and Klaus E. 
Schauser and Thorsten von Eicken and John Wawrzynek", title = "Fine-Grain Parallelism with Minimal Hardware Support: {A} Compiler-Controlled Threaded Abstract Machine", journal = j-SIGPLAN, volume = "26", number = "4", pages = "164--175", month = apr, year = "1991", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sat May 01 18:50:04 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Culler:1991:FGPc, author = "David E. Culler and Anurag Sah and Klaus E. Schauser and Thorsten von Eicken and John Wawrzynek", title = "Fine-grain parallelism with minimal hardware support: a compiler-controlled threaded abstract machine", journal = j-OPER-SYS-REV, volume = "25", number = "3S", pages = "164--175", month = apr, year = "1991", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 15:24:15 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Draves:1991:UCI, author = "Richard P. Draves and Brian N. Bershad and Richard F. Rashid and Randall W. Dean", title = "Using continuations to implement thread management and communication in operating systems", journal = j-OPER-SYS-REV, volume = "25", number = "5", pages = "122--136", month = oct, year = "1991", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:57 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Gallmeister:1991:EEP, author = "Bill O. 
Gallmeister and Chris Lanier", title = "Early experience with {POSIX} 1003.4 and {POSIX} 1003.4 {A}", journal = j-PROC-REAL-TIME-SYS-SYMP, pages = "190--198 (of ix + 307)", year = "1991", CODEN = "PRSYEA", ISBN = "0-8186-2450-7", ISBN-13 = "978-0-8186-2450-6", LCCN = "QA 76.54 R43 1991", bibdate = "Mon Dec 22 09:06:02 1997", bibsource = "Compendex database; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "IEEE catalog number 91CH3090-8.", abstract = "Two proposed IEEE standards for real-time operating systems support, POSIX.4 and POSIX.4a, are proceeding towards IEEE approval and will eventually become international standards. The authors provide a brief overview of the facilities of POSIX.4 and POSIX.4a. They concentrate on a few of the critical features that POSIX.4 and POSIX.4a provide and describe the POSIX.4 scheduling interface. The POSIX.4a support for multiple threads of control is also described. The features found in POSIX.4 and POSIX.4a for synchronization of multiple threads, are discussed, and the POSIX.4 interprocess communication facility is presented. The performance numbers are given to allow comparisons of the facilities of traditional UNIX systems, the facilities of a representative hard real-time system (LynxOS), and the facilities of POSIX.4 and POSIX.4a.", acknowledgement = ack-nhfb, classification = "722; 723; 902", conference = "Proceedings of the 12th Real-Time Systems Symposium", conferenceyear = "1991", fjournal = "Proceedings --- Real-Time Systems Symposium", journalabr = "Proc Real Time Syst Symp", keywords = "Computer Operating Systems--Standards; Computer Systems, Digital; POSIX.4a Standards; Real Time Operation; Real-Time Operating Systems", meetingaddress = "San Antonio, TX, USA", meetingdate = "Dec 4--6 1991", meetingdate2 = "12/04--06/91", publisherinfo = "IEEE Service Center", sponsor = "IEEE Computer Soc", } @TechReport{Glenn:1991:CMH, author = "Ray R. 
Glenn", title = "Characterizing memory hot spots in a shared memory {MIMD} machine", type = "Technical report", number = "SRC-TR-91-039", institution = inst-SRC-IDA, address = inst-SRC-IDA:adr, pages = "24", day = "15", month = oct, year = "1991", bibdate = "Fri Aug 30 08:01:51 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper analyzes two memory hot spot problems associated with massively parallel MIMD computers. The first is the memory stride problem, which is similar to stride problems found in existing supercomputers. The second hot spot problem occurs in designs that use two separate memory accesses to lock and unlock critical sections (split transaction) and employ a first come/first serve queuing mechanism for shared memory locations. A bistability in throughput brought about by these conditions is analyzed and experimentally demonstrated. Simple equations are presented which predict the throughput at a critical section of code as a function of the number of applied threads. In particular, the mean size of the work items that can be executed in parallel without the possibility of stalling is proportional to the square of the number of threads applied.", acknowledgement = ack-nhfb, keywords = "Multiprocessors", } @InProceedings{Hirata:1991:MPA, author = "H. Hirata and Y. Mochizuki and A. Nishimura and Y. Nakase", title = "A Multithreaded Processor Architecture with Simultaneous Instruction Issuing", crossref = "Anonymous:1991:PIS", pages = "87--96", year = "1991", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @InProceedings{Hironaka:1991:SVP, author = "T. Hironaka and T. Hashimoto and K. Okazaki and K. 
Murakami", title = "A Single-Chip Vector-Processor Prototype Based on Multithreaded Streaming\slash {FIFO} ({MSFV}) Architecture", crossref = "Anonymous:1991:PIS", pages = "77--86", year = "1991", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Horiguchi:1991:PEP, author = "Susumu Horiguchi and Takeo Nakada", title = "Performance Evaluation of Parallel Fast {Fourier} Transform on a Multiprocessor Workstation", journal = j-J-PAR-DIST-COMP, volume = "12", number = "2", pages = "158--163", month = jun, year = "1991", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Sat Apr 12 17:13:17 MDT 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C4190 (Other numerical methods); C4240 (Programming and algorithm theory); C5440 (Multiprocessor systems and techniques)", corpsource = "Dept. of Inf. Sci., Tohoku Univ., Sendai, Japan", fjournal = "Journal of Parallel and Distributed Computing", keywords = "algorithms; cache protocols; fast Fourier transform; fast Fourier transforms; FFT; floating-; multiprocess operating system; multiprocessing systems; multiprocessor workstation; multithread operating system; operating systems; parallel; parallel FFT; performance; performance evaluation; point coprocessors", treatment = "P Practical", } @Article{Jolitz:1991:PUB, author = "W. F. Jolitz and L. G. Jolitz", title = "Porting {UNIX} to the 386. The basic kernel Multiprogramming and multitasking. 
{II}", journal = j-DDJ, volume = "16", number = "10", pages = "62, 64, 66, 68, 70, 72, 118--120", month = oct, year = "1991", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 09:11:02 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110 (Systems analysis and programming); C6150J (Operating systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "386BSD kernel; Multiple simultaneous process execution; Multiprogramming; Multitasking; Multithread operations; Operating systems; Porting; Sleep( ); Swch( ); Switching mechanisms; UNIX; Wakeup( )", thesaurus = "C listings; Microprocessor chips; Multiprogramming; Software portability; Unix", } @InProceedings{Kuchlin:1991:MCI, author = "Wolfgang K{\"u}chlin", title = "On the multi-threaded computation of integral polynomial greatest common divisors", crossref = "Watt:1991:IPI", pages = "333--342", year = "1991", bibdate = "Thu Mar 12 08:38:03 MST 1998", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/issac/120694/p333-kuchlin/", abstract = "Reports experiences and practical results from parallelizing the Brown--Collins polynomial g.c.d. algorithm, starting from Collins' SAC-2 implementation IPGCDC. The parallelization environment is PARSAC-2, a multi-threaded version of SAC-2 programmed in C with the parallelization constructs of the C Threads library. IPGCDC computes the g.c.d. and its co-factors of two polynomials in $Z(x_1,\ldots{},x_r)$, by first reducing the problem to multiple calculations of modular polynomial g.c.d.'s in $Z_p(x_1,\ldots{},x_r)$, and then recovering the result by Chinese remaindering. 
After studying timings of the SAC-2 algorithm, the author first parallelizes the Chinese remainder algorithm, and then parallelizes the main loop of IPGCDC by executing the modular g.c.d. computations concurrently. Finally, he determines speed-up's and speed-up efficiencies of our parallel algorithms over a wide range of polynomials. The experiments were conducted on a 12 processor Encore Multimax under Mach.", acknowledgement = ack-nhfb, affiliation = "Dept. of Comput. and Inf. Sci., Ohio State Univ., Columbus, OH, USA", classification = "C4240 (Programming and algorithm theory); C7310 (Mathematics)", keywords = "algorithms; Brown--Collins polynomial g.c.d. algorithm; Chinese remaindering; Encore Multimax; Multi-threaded computation; PARSAC-2; Polynomial greatest common divisors", subject = "{\bf G.1.0} Mathematics of Computing, NUMERICAL ANALYSIS, General, Parallel algorithms. {\bf F.2.1} Theory of Computation, ANALYSIS OF ALGORITHMS AND PROBLEM COMPLEXITY, Numerical Algorithms and Problems, Computations on polynomials. {\bf I.1.0} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, General. {\bf I.1.3} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, Languages and Systems. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, C.", thesaurus = "Mathematics computing; Parallel algorithms; Symbol manipulation", } @Article{Man:1991:MLC, author = "Richard F. Man", title = "A Multithreading Library In {C} For Subsumption Architecture", journal = j-CUJ, volume = "9", number = "11", pages = "42--??", month = nov, year = "1991", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Marsh:1991:FCU, author = "Brian D. Marsh and Michael L. Scott and Thomas J. LeBlanc and Evangelos P. 
Markatos", title = "First-class user-level threads", journal = j-OPER-SYS-REV, volume = "25", number = "5", pages = "110--121", month = oct, year = "1991", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:57 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @PhdThesis{Mennemeier:1991:HMS, author = "Lawrence Mennemeier", title = "Hardware mechanisms to support concurrent threads on {RISC} and superscalar multiprocessors", type = "Thesis ({M.S.})", school = "University of California, Santa Cruz", address = "Santa Cruz, CA, USA", pages = "vii + 39", year = "1991", LCCN = "QA76.5.M44 1991", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Masters theses -- University of California, Santa Cruz -- 1991; multiprocessors; parallel processing (electronic computers)", } @Article{Papadopoulos:1991:MRV, author = "Gregory M. Papadopoulos and Kenneth R.
Traub", title = "Multithreading: a revisionist view of dataflow architectures", journal = j-COMP-ARCH-NEWS, volume = "19", number = "3", pages = "342--351", month = may, year = "1991", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:01 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @PhdThesis{Park:1991:PTM, author = "Won Woo Park", title = "Performance-area trade-offs in multithreaded processing unit", type = "Thesis (Ph.D.)", school = "University of Texas at Austin", address = "Austin, TX, USA", pages = "xvii + 165", year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Multiprocessors; Parallel processing (Electronic computers)", } @MastersThesis{Pham:1991:EMD, author = "Thuan Quang Pham", title = "The experimental migration of a distributed application to a multithreaded environment", type = "Thesis (M.S.)", school = "Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "51", year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Richman:1991:EHC, author = "Scott Richman", title = "Examining the {Hamilton C} shell ({Unix} power for {OS/2})", journal = j-DDJ, volume = "16", number = "1", pages = "98, 100, 102, 104--106", month = jan, year = "1991", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 09:11:02 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Doug Hamilton's C Shell helps you create more powerful OS/2 
programs.", acknowledgement = ack-nhfb, classification = "C6115 (Programming support); C6150E (General utility programs); C6150J (Operating systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "C shell environment; C++ programs; High-performance file system; Large command lines; Long filenames; OS/2 features; Pipes; Presentation Manager; Script language; Script program; Shell scripts; Text windows; Threads; Utilities", thesaurus = "C listings; Software packages; Software tools; Utility programs", } @TechReport{Saavedra-Barrera:1991:ASM, author = "Rafael H. Saavedra-Barrera and David E. Culler", title = "An analytical solution for a {Markov} chain modeling multithreaded execution", type = "Report", number = "UCB/CSD 91/623", institution = "University of California, Berkeley, Computer Science Division", address = "Berkeley, CA, USA", pages = "24", month = apr, year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreading is an architectural technique aimed at maintaining high processor utilization in the presence of large memory or interprocessor communication latency. While waiting for a remote reference to complete, the processor switches to another execution thread. Several realizations of this concept have been proposed, but little data is available on the actual costs and benefits. This paper presents an analytical model of multithreaded execution, which may serve to guide and explain empirical studies. The model is based on three key parameters: thread run-length, switch cost, and latency. A closed-form expression for processor utilization is obtained for deterministic and stochastic run-lengths. The derivation involves identifying specific patterns in the very large set of equations forming the Markov chain. 
Using this result, three operating regimes are identified for a multithreaded processor subject to long latencies: linear, where utilization is proportional to the number of threads per processor, saturation, where utilization is determined only by the run-length and switch cost, and transition between the other regimes. The model can be used to estimate the effects of several architectural variations.", acknowledgement = ack-nhfb, annote = "Supported in part by NASA under consortium agreement NCA2-128 and cooperative agreement NCC2-550. Supported in part by the National Science Foundation.", keywords = "Computer architecture; Markov chains", } @MastersThesis{Schauser:1991:CDT, author = "Klaus Erik Schauser", title = "Compiling dataflow into threads: efficient compiler-controlled multithreading for lenient parallel languages", type = "Thesis (M.S.)", school = "University of California, Berkeley, Computer Science Division", address = "Berkeley, CA, USA", pages = "71", day = "2", month = jul, year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Also available as Report UCB/CSD 91/644", abstract = "Powerful non-strict parallel languages require fast dynamic scheduling. This thesis explores how the need for multithreaded execution can be addressed as a compilation problem, to achieve switching rates approaching what hardware mechanisms might provide. Compiler-controlled multithreading is examined through compilation of a lenient parallel language, ID90, for a threaded abstract machine, TAM. A key feature of TAM is that synchronization is explicit and occurs only at the start of a thread, so that a simple cost model can be applied. A scheduling hierarchy allows the compiler to schedule logically related threads closely together in time and to use registers across threads. Remote communication is via message sends and split-phase memory accesses. 
Messages and memory replies are received by compiler-generated message handlers which rapidly integrate these events with thread scheduling. To compile ID90 for TAM, we employ a new parallel intermediate form, dual-graphs, with distinct control and data arcs. This provides a clean framework for partitioning the program into threads, scheduling threads, and managing registers under asynchronous execution. The compilation process is described and preliminary measurements of the effectiveness of the approach are discussed. Previous to this work, execution of Id90 programs was limited to specialized architectures or dataflow graph interpreters. By compiling via TAM, we have achieved more than two orders of magnitude performance improvement over graph interpreters on conventional machines, making this Id90 implementation competitive with machines supporting dynamic instruction scheduling in hardware. Timing measurements show that our Id90 implementation on a standard RISC can achieve a performance close to Id90 on one processor of the recent dataflow machine Monsoon. It can be seen that the TAM partitioning presented in this thesis reduces the control overhead substantially and that more aggressive partitioning would yield modest additional benefit. There is, however, considerable room for improvement in scheduling and register management.", acknowledgement = ack-nhfb, annote = "Supported in part by the National Science Foundation. Supported in part by Motorola Inc., the TRW Foundation, and the International Computer Science Institute", keywords = "Compilers (Computer programs); Parallel programming (Computer science)", } @TechReport{Schauser:1991:CML, author = "Klaus Erik Schauser and David E. 
Culler and Thorsten {von Eicken}", title = "Compiler-controlled multithreading for lenient parallel languages", type = "Report", number = "UCB/CSD 91/640", institution = "University of California, Berkeley, Computer Science Division", address = "Berkeley, CA, USA", pages = "21", day = "30", month = jul, year = "1991", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "A version of this report is to appear in the Proceedings of FPCA '91 Conference on Functional Programming Languages and Computer Architecture, Aug. 1991, Springer-Verlag", abstract = "Tolerance to communication latency and inexpensive synchronization are critical for general-purpose computing on large multiprocessors. Fast dynamic scheduling is required for powerful nonstrict parallel languages. However, machines that support rapid switching between multiple execution threads remain a design challenge. This paper explores how multithreaded execution can be addressed as a compilation problem, to achieve switching rates approaching what hardware mechanisms might provide. Compiler-controlled multithreading is examined through compilation of a lenient parallel language, Id90, for a threaded abstract machine, TAM. A key feature of TAM is that synchronization is explicit and occurs only at the start of a thread, so that a simple cost model can be applied. A scheduling hierarchy allows the compiler to schedule logically related threads closely together in time and to use registers across threads. Remote communication is via message sends and split-phase memory accesses. Messages and memory replies are received [sic] by compiler-generated message handlers which rapidly integrate these events with thread scheduling. To compile Id90 for TAM, we employ a new parallel intermediate form, dual-graphs, with distinct control and data arcs. 
This provides a clean framework for partitioning the program into threads, scheduling threads, and managing registers under asynchronous execution. The compilation process is described and preliminary measurements of its effectiveness are discussed. Dynamic execution measurements are obtained via a second compilation step, which translates TAM into native code for existing machines with instrumentation incorporated. These measurements show that the cost of compiler-controlled multithreading is within a small factor of the cost of control flow in sequential languages.", acknowledgement = ack-nhfb, annote = "Supported in part by the National Science Foundation PYI Award. Supported in part by Motorola Inc., the TRW Foundation and the Semiconductor Research Corporation. Supported in part by J. Wawrzynek's PYI Award. Supported in part by NSF Infrastructure Grant.", keywords = "Compilers (Computer programs); Parallel programming (Computer science)", } @Article{Schwan:1991:RTT, author = "Karsten Schwan and Hongyi Zhou and Ahmed Gheith", title = "Real-time threads", journal = j-OPER-SYS-REV, volume = "25", number = "4", pages = "35--46", month = oct, year = "1991", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:51 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Speer:1991:DTP, author = "Thomas G. Speer and Mark W. 
Storm", title = "{Digital}'s Transaction Processing Monitors", journal = j-DEC-TECH-J, volume = "3", number = "1", pages = "18--32", month = "Winter", year = "1991", CODEN = "DTJOEL", ISSN = "0898-901X", bibdate = "Thu Mar 20 18:15:43 MST 1997", bibsource = "/usr/local/src/bib/bibliography/Database/Graefe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "ftp://ftp.digital.com/pub/Digital/info/DTJ/v3n1/Digitals_Transaction_Processi_01oct1991DTJ102P8.ps; http://www.digital.com:80/info/DTJ102/DTJ102SC.TXT", abstract = "Digital provides two transaction processing (TP) monitor products --- ACMS (Application Control and Management System) and DECintact (Integrated Application Control). Each monitor is a unified set of transaction processing services for the application environment. These services are layered on the VMS operating system. Although there is a large functional overlap between the two, both products achieve similar goals by means of some significantly different implementation strategies. Flow control and multithreading in the ACMS monitor is managed by means of a fourth-generation language (4GL) task definition language. Flow control and multithreading in the DECintact monitor is managed at the application level by third-generation language (3GL) calls to a library of services. The ACMS monitor supports a deferred task model of queuing, and the DECintact monitor supports a message-based model. 
Over time, the persistent distinguishing feature between the two monitors will be their different application programming interfaces.", acknowledgement = ack-nhfb, affiliation = "Digital Equipment Corp., Maynard, MA, USA", classcodes = "C6150J (Operating systems)", classification = "C6150J (Operating systems)", corpsource = "Digital Equipment Corp., Maynard, MA, USA", fjournal = "Digital Technical Journal", keywords = "ACMS; Application; Application Control; Application Control and Management System; Application programming interfaces; application programming interfaces; Control and Management System; DECintact; Digital; Integrated; Integrated Application Control; message-based model; Message-based model; monitors; Monitors; Multithreading; multithreading; Queuing; queuing; supervisory programs; task definition language; Task definition language; transaction processing; Transaction processing; transaction processing; VMS operating system", thesaurus = "Supervisory programs; Transaction processing", treatment = "P Practical", } @Article{Agarwal:1992:PTM, author = "Anant Agarwal", title = "Performance tradeoffs in multithreaded processors", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "3", number = "5", pages = "525--539", month = sep, year = "1992", CODEN = "ITDSEO", DOI = "http://dx.doi.org/10.1109/71.159037", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Apr 11 15:20:39 MDT 1997", bibsource = "Compendex database; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Lab for Comput Sci, MIT, Cambridge, MA, USA", classification = "722.1; 722.4; C4230M (Multiprocessor interconnection); C4240P (Parallel programming and algorithm theory); C5220P (Parallel architecture); C5320G (Semiconductor storage); C5440 (Multiprocessor systems and techniques); C5470 (Performance evaluation and testing); C6120 (File organisation)", corpsource = "Lab. for Comput. 
Sci., MIT, Cambridge, MA, USA", fjournal = "IEEE Transactions on Parallel and Distributed Systems", journalabr = "IEEE Trans Parallel Distrib Syst", keywords = "buffer storage; cache interference; Cache memories; caches; contention; context-switching overhead; data-sharing; Digital storage; interconnection networks; Interconnection networks; multiprocessing systems; multiprocessor; multithreaded processors; network; network bandwidth; parallel; parallel algorithms; Parallel processing systems; Performance; Performance analysis; performance evaluation; Pipeline processing systems; programming; storage management; switching theory", treatment = "P Practical; T Theoretical or Mathematical", } @InProceedings{Alverson:1992:EHP, author = "G. A. Alverson and R. Alverson and D. Callahan and B. Koblenz", title = "Exploiting Heterogeneous Parallelism on a Multi-threaded Multiprocessor", crossref = "ACM:1992:CPI", pages = "188--197", year = "1992", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Anderson:1992:SAE, author = "Thomas E. Anderson and Brian N. Bershad and Edward D. Lazowska and Henry M. Levy", title = "Scheduler Activations: Effective Kernel Support for the User-Level Management of Parallelism", journal = j-TOCS, volume = "10", number = "1", pages = "53--79", month = feb, year = "1992", CODEN = "ACSYEC", ISSN = "0734-2071", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1992-10-1/p53-anderson/", abstract = "{\em Threads\/} are the vehicle for concurrency in many approaches to parallel programming. Threads can be supported either by the operating system kernel or by user-level library code in the application address space, but neither approach has been fully satisfactory. 
This paper addresses this dilemma. First, we argue that the performance of kernel threads is {\em inherently\/} worse than that of user-level threads, rather than this being an artifact of existing implementations; managing parallelism at the user level is essential to high-performance parallel computing. Next, we argue that the problems encountered in integrating user-level threads with other system services is a consequence of the lack of kernel support for user-level threads provided by contemporary multiprocessor operating systems; kernel threads are the {\em wrong abstraction\/} on which to support user-level management of parallelism. Finally, we describe the design, implementation, and performance of a new kernel interface and user-level thread package that together provide the same functionality as kernel threads without compromising the performance and flexibility advantages of user-level management of parallelism.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", keywords = "design; measurement; performance", subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Scheduling. {\bf D.4.4} Software, OPERATING SYSTEMS, Communications Management, Input/output. {\bf D.4.7} Software, OPERATING SYSTEMS, Organization and Design. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance.", } @MastersThesis{Arunachalam:1992:EMM, author = "Prakash Arunachalam", title = "Evaluation of a multithreaded microprocessor with {MIPS R3000} instruction set", type = "Thesis (M.S. 
in Engineering)", school = "University of Texas at Austin", address = "Austin, TX, USA", pages = "vii + 45", year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; MIPS R3000 series microprocessors; Parallel processing (Electronic computers); Reduced instruction set computers; RISC microprocessors", } @Article{Bauer:1992:PCE, author = "Barr E. Bauer", title = "Parallel {C} extensions", journal = j-DDJ, volume = "17", number = "8", pages = "110, 112--114, 124, 127", month = aug, year = "1992", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 10:06:23 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Schering-Plough Res. Inst., Bloomfield, NJ, USA", classification = "C6110P (Parallel programming); C6140D (High level languages); C6150C (Compilers, interpreters and other processors)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "C extensions; C programs; Parallel execution regions; Parallel execution threads; Parallelized program; Serial program; Silicon Graphics IRIS Power C compiler", thesaurus = "C language; C listings; Parallel languages; Program compilers", } @Article{Bershad:1992:FME, author = "Brian N. Bershad and David D. Redell and John R. 
Ellis", title = "Fast mutual exclusion for uniprocessors", journal = j-SIGPLAN, volume = "27", number = "9", pages = "223--233", month = sep, year = "1992", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:26 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/143365/p223-bershad/", abstract = "In this paper we describe restartable atomic sequences, an {\em optimistic\/} mechanism for implementing simple atomic operations (such as {\em Test-And-Set\/}) on a uniprocessor. A thread that is suspended within a restartable atomic sequence is resumed by the operating system at the beginning of the sequence, rather than at the point of suspension. This guarantees that the thread eventually executes the sequence {\em atomically\/}. A restartable atomic sequence has significantly less overhead than other software-based synchronization mechanisms, such as kernel emulation or software reservation. Consequently, it is an attractive alternative for use on uniprocessors that do not support atomic operations. Even on processors that do support atomic operations in hardware, restartable atomic sequences can have lower overhead. We describe different implementations of restartable atomic sequences for the Mach 3.0 and Taos operating systems. These systems' thread management packages rely on atomic operations to implement higher-level mutual exclusion facilities. 
We show that improving the performance of low-level atomic operations, and therefore mutual exclusion mechanisms, improves application performance.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "design; languages; measurement; performance", subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Mutual exclusion.", } @MastersThesis{Blumofe:1992:MSM, author = "Robert D. (Robert David) Blumofe", title = "Managing storage for multithreaded computations", type = "Thesis (M.S.)", school = "Massachusetts Institute of Technology, Laboratory for Computer Science, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "83", year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Also available as Report MIT/LCS/TR 552.", acknowledgement = ack-nhfb, } @Article{Boothe:1992:IMT, author = "Bob Boothe and Abhiram Ranade", title = "Improved multithreading techniques for hiding communication latency in multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "214--223", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Cattaneo:1992:ACT, author = "G. Cattaneo and G. Di Giore and M. 
Ruotolo", title = "Another {C} Threads Library", journal = j-SIGPLAN, volume = "27", number = "12", pages = "81--90", month = dec, year = "1992", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:30 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @MastersThesis{Chowdhury:1992:PEA, author = "Indranil Chowdhury", title = "Performance evaluation and architecture of an instruction cache for multithreaded {RISC} processor", type = "Thesis (M.S. in Engineering)", school = "University of Texas at Austin", address = "Austin, TX, USA", pages = "x + 93", year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Cache memory -- Evaluation -- Simulation methods; Computer architecture; Microprocessors; Reduced instruction set computers", } @Article{Culler:1992:AMM, author = "David E. Culler and Michial Gunter and James C. Lee", title = "Analysis of multithreaded microprocessors under multiprogramming", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "438--438", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @TechReport{Culler:1992:AMMa, author = "David E. Culler and Michial Gunter and James C. 
Lee", title = "Analysis of multithreaded microprocessors under multiprogramming", type = "Report", number = "UCB/CSD 92/687", institution = "University of California, Berkeley, Computer Science Division", address = "Berkeley, CA, USA", pages = "17", month = may, year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreading has been proposed as a means of tolerating long memory latencies in multiprocessor systems. Fundamentally, it allows multiple concurrent subsystems (cpu, network, and memory) to be utilized simultaneously. This is advantageous on uniprocessor systems as well, since the processor is utilized while the memory system services misses. We examine multithreading on high-performance uniprocessors as a means of achieving better cost/performance on multiple processes. Processor utilization and cache behavior are studied both analytically and through simulation of timesharing and multithreading using interleaved reference traces. Multithreading is advantageous when one has large on-chip caches (32 kilobytes), associativity of two, and a memory access cost of roughly 50 instruction times. At this point, a small number of threads (2-4) is sufficient, the thread switch need not be extraordinarily fast, and the memory system need support only one or two outstanding misses. The increase in processor real-estate to support multithreading is modest, given the size of the cache and floating-point units. A surprising observation is that miss ratios may be lower with multithreading than with timesharing under a steady-state load. This occurs because switch-on-miss multithreading introduces unfair thread scheduling, giving more CPU cycles to processes with better cache behavior.", acknowledgement = ack-nhfb, annote = "Supported in part by the National Science Foundation. Supported in part by Motorola Inc. 
and the TRW Foundation", keywords = "Microprocessors; Multiprogramming (Electronic computers)", } @Article{Culler:1992:AMMb, author = "David E. Culler and Michial Gunter and James C. Lee", title = "Analysis of multithreaded microprocessors under multiprogramming", journal = j-COMP-ARCH-NEWS, volume = "20", number = "2", pages = "438--438", month = may, year = "1992", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:43 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Day:1992:INB, author = "Michael Day", title = "Implementing {NLM-Based} Client\slash Server Architectures", journal = j-DDJ, volume = "17", number = "10", pages = "78--84", month = oct, year = "1992", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:34 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "NetWare NLMs take full advantage of the multitasking, multithreaded architecture of the operating system. Michael presents a distributed file manager made up of two modules: ENGINE.NLM, an NLM running on a NetWare 3.x server, and CLIENT.EXE, a DOS-based front end running on the client.", acknowledgement = ack-nhfb, classification = "C6150N (Distributed systems)", fjournal = "Dr. 
Dobb's Journal of Software Tools", keywords = "32-Bit protected-mode programs; Client/server architectures; Distributed file manager; DOS-based front end; Multitasking; Multithreaded architecture; NetWare 3.x operating system; Netware Loadable Modules; Networked system", thesaurus = "Distributed processing; File servers", } @Article{Day:1992:INC, author = "Michael Day", title = "Implementing {NLM-Based} Client\slash Server Architectures", journal = j-DDJ, volume = "17", number = "10", pages = "78--84", month = oct, year = "1992", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:34 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "NetWare NLMs take full advantage of the multitasking, multithreaded architecture of the operating system. Michael presents a distributed file manager made up of two modules: ENGINE.NLM, an NLM running on a NetWare 3.x server, and CLIENT.EXE, a DOS-based front end running on the client.", acknowledgement = ack-nhfb, classification = "C6150N (Distributed systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "32-Bit protected-mode programs; Client/server architectures; Distributed file manager; DOS-based front end; Multitasking; Multithreaded architecture; NetWare 3.x operating system; Netware Loadable Modules; Networked system", thesaurus = "Distributed processing; File servers", } @Article{DHollander:1992:PLL, author = "Erik H. 
D'Hollander", title = "Partitioning and labeling of loops by unimodular transformations", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "3", number = "4", pages = "465--476", month = jul, year = "1992", CODEN = "ITDSEO", DOI = "http://dx.doi.org/10.1109/71.149964", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", MRclass = "68Q10 (68Q22)", MRnumber = "93f:68030", bibdate = "Mon Apr 14 07:37:07 1997", bibsource = "Compendex database; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Dept of Electr Eng, State Univ of Ghent, Belgium", classification = "722; 723; C4240P (Parallel programming and algorithm theory); C6110P (Parallel programming); C6150C (Compilers, interpreters and other processors)", corpsource = "Dept. of Electr. Eng., State Univ. of Ghent, Belgium", fjournal = "IEEE Transactions on Parallel and Distributed Systems", journalabr = "IEEE Trans Parallel Distrib Syst", keywords = "computational complexity; Computer Programming --- Algorithms; Computer Systems Programming; constant dependence vectors; dependence matrix; dependent iterations; do-loops; independent subsets; invariant dependence; join; labelling algorithm; loop labelling; loop partitioning; Multiprocessing Programs; multithreaded dynamic scheduling; n-fold nested loop; parallel; parallel algorithms; parallel DO-ALL loops; partitioning algorithm; Partitioning Algorithms; primitive; program compilers; Program Transformations; programming; programming theory; relation; scheduling; serial loop; transformation; unimodular; Unimodular Transformations; unimodular transformations", treatment = "T Theoretical or Mathematical", } @MastersThesis{Donalson:1992:DDP, author = "Douglas Dale Donalson", title = "{DISC}: a dynamic performance evaluation of a multi-thread architecture", type = "Thesis ({M.S.})", school = "Electrical and Computer Engineering Department, University of California, Santa Barbara", address = "Santa Barbara, 
CA, USA", pages = "ix + 88", year = "1992", LCCN = "TK174.C2 S25 DOND 1992", bibdate = "Sat Apr 20 11:18:53 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @TechReport{Felten:1992:IPM, author = "Edward W. Felten and Dylan James McNamee", title = "Improving the performance of message-passing applications by multithreading", type = "Technical report", number = "92-09-07", institution = "University of Washington, Dept. of Computer Science and Engineering", address = "Seattle, WA, USA", pages = "6", year = "1992", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Achieving maximum performance in message-passing programs requires that calculation and communication be overlapped. However, the program transformations required to achieve this overlap are error-prone and add significant complexity to the application program. We argue that calculation/communication overlap can be achieved easily and consistently by executing multiple threads of control on each processor, and that this approach is practical on message-passing architectures without any special hardware support. We present timing data for a typical message-passing application, to demonstrate the advantages of our scheme.", acknowledgement = ack-nhfb, annote = "Supported in part by the National Science Foundation. Supported in part by the Washington Technology Center, Digital Equipment Corporation, Apple Computer Company, a Mercury Seven Fellowship and an AT\&T Ph.D. Scholarship", keywords = "Operating systems", } @TechReport{Gokhale:1992:ICI, author = "Maya B. Gokhale and William W. 
Carlson", title = "An introduction to compilation issues for parallel machines", type = "Technical report", number = "SRC-TR-92-062", institution = inst-SRC-IDA, address = inst-SRC-IDA:adr, pages = "38", day = "8", month = sep, year = "1992", bibdate = "Fri Aug 30 08:01:51 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The exploitation of today's high-performance computer systems requires the effective use of parallelism in many forms and at numerous levels. This survey article discusses program analysis and restructuring techniques that target parallel architectures. We first describe various categories of architectures that are oriented toward parallel computation models: vector architectures, shared memory multiprocessors, massively parallel machines, message-passing architectures, VLIWs, and multithreaded architectures. We then describe a variety of optimization techniques that can be applied to sequential programs to effectively utilize the vector and parallel processing units. After an overview of basic dependence analysis, we present restructuring transformations on DO loops targeted both to vectorization and to concurrent execution, interprocedural and pointer analysis, task scheduling, instruction level parallelization, and compiler-assisted data placement. We conclude that although tremendous advances have been made in dependence theory and in the development of a `toolkit' of transformations, parallel systems are used most effectively when the programmer interacts in the optimization process.", acknowledgement = ack-nhfb, keywords = "Compilers (Computer programs); Computer architecture; Parallel processing (Electronic computers)", } @TechReport{Haines:1992:SMC, author = "Matt Haines and Anton Pedro Willem Bohm", title = "Software multithreading in a conventional distributed memory multiprocessor", type = "Technical report", number = "CS-92-126", institution = "Colorado State University, Dept. 
of Computer Science",
  address = "Fort Collins, CO, USA",
  pages = "25",
  day = "25",
  month = sep,
  year = "1992",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "Today's most powerful computers are distributed memory multiprocessors. Although they possess massive amounts of available resources, it is often difficult to exploit these resources efficiently. Compilers that can cope with the complexities of these systems are being constructed, but their scope of effect is often limited due to the complexity of the analysis and the lack of runtime information. Novel architectures that can better tolerate latencies are under construction, but their effectiveness is unproven, and they do little to ease the burden on current commercial machines. Therefore we are designing a runtime system, called VISA, that attempts to avoid and tolerate latencies on conventional distributed memory multiprocessors, as well as provide a single addressing space to ease the burden of programming or code generation. The goal of our runtime system is to serve as a tool for studying the effects of latency avoidance and latency tolerance on programs running on these conventional architectures. In this paper we describe the design and implementation of multithreading in the VISA runtime system for the purpose of latency tolerance. In particular, we examine machine-independent designs for thread representation, thread switching, and split-phased transactions.
We quantify the cost of multithreading for our environment, present a test program for which multithreading degrades performance, and present a program for which multithreading enhances performance.",
  acknowledgement = ack-nhfb,
  annote = "Supported in part by a grant from Sandia National Laboratories",
  keywords = "Multiprocessors",
}

@Article{Halladay:1992:PUM,
  author = "Steve Halladay and Michael Wiebel",
  title = "A Practical Use For Multiple Threads",
  journal = j-CUJ,
  volume = "10",
  number = "1",
  pages = "73--??",
  month = jan,
  year = "1992",
  ISSN = "0898-9788",
  bibdate = "Fri Aug 30 16:52:23 MDT 1996",
  bibsource = "http://www.cuj.com/cbklist.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "C Users Journal",
}

@Article{Hirata:1992:EPA,
  author = "Hiroaki Hirata and Kozo Kimura and Satoshi Nagamine and Yoshiyuki Mochizuki and Akio Nishimura and Yoshimori Nakase and Teiji Nishizawa",
  title = "An elementary processor architecture with simultaneous instruction issuing from multiple threads",
  journal = j-COMP-ARCH-NEWS,
  volume = "20",
  number = "2",
  pages = "136--145",
  month = may,
  year = "1992",
  CODEN = "CANED2",
  ISSN = "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:43 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

%%% Third author's family name is spelled Nishimura, consistent with
%%% the author list of Hirata:1992:EPA (previously misspelled Nishmura).
@Article{Hirata:1992:MPA,
  author = "H. Hirata and Y. Mochizuki and A. Nishimura and Y. Nakase and T. Nishizawa",
  title = "A multithreaded processor architecture with simultaneous instruction issuing",
  journal = j-SUPERCOMPUTER,
  volume = "9",
  number = "3",
  pages = "23--39",
  month = may,
  year = "1992",
  CODEN = "SPCOEL",
  ISSN = "0168-7875",
  bibdate = "Wed Mar 18 08:37:01 MST 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation = "Media Res.
Lab., Matsushita Electr. Ind. Co., Osaka, Japan",
  classification = "C5220P (Parallel architecture); C6110P (Parallel programming); C6150J (Operating systems)",
  corpsource = "Media Res. Lab., Matsushita Electr. Ind. Co., Osaka, Japan",
  fjournal = "Supercomputer",
  keywords = "functional unit; independent instruction streams; multiprogramming; multithreaded processor architecture; parallel processing; scheduling; simultaneous instruction issuing; vector machines; VLIW machines",
  pubcountry = "Netherlands",
  treatment = "P Practical",
}

%%% Hirata:1992:MPA keywords: the last keyword now reads VLIW machines
%%% (Very Long Instruction Word); it was previously misspelled VLW.

@InProceedings{Hironaka:1992:BVP,
  author = "T. Hironaka and T. Hashimoto and K. Okazaki and K. Murakami",
  title = "Benchmarking a Vector-Processor Prototype Based on Multithreaded Streaming\slash {FIFO} Vector ({MSFV}) Architecture",
  crossref = "ACM:1992:CPI",
  pages = "272--281",
  year = "1992",
  bibdate = "Mon Aug 26 10:38:41 MDT 1996",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Hum:1992:HSM,
  author = "Herbert H. J. Hum and Guang R. Gao",
  title = "A high-speed memory organization for hybrid dataflow\slash {von Neumann} computing",
  journal = j-FUT-GEN-COMP-SYS,
  volume = "8",
  number = "4",
  pages = "287--301",
  month = sep,
  year = "1992",
  CODEN = "FGSEVI",
  ISSN = "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L = "0167-739X",
  bibdate = "Fri Jul 15 09:06:02 MDT 2005",
  bibsource = "ftp://ftp.ira.uka.de/bibliography/Os/threads.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/0167739X",
  abstract = "The paper proposes a novel organization of high-speed memories, known as the register-cache, for a multi-threaded architecture. Viewed from the execution unit, its contents are addressable as ordinary CPU registers using relatively short addresses. From the main memory perspective, it is content addressable.
In this register-cache organization, a number of registers are grouped into a block of registers where a register in a block is accessed using an offset from the address of the block, an offset value which is embedded in the compiler generated code. The binding of register block locations to register-cache line addresses is adaptively performed at runtime, thus resulting in a dynamically allocated register file. In this execution model, a program is compiled into a number of instruction threads called super-actors. A super-actor becomes ready for execution only when its input data are physically residing in the register-cache and space is reserved in the register-cache to store its result.",
  acknowledgement = ack-nhfb,
  fjournal = "Future Generation Computer Systems",
}

@Article{Jagannathan:1992:CSC,
  author = "Suresh Jagannathan and Jim Philbin",
  title = "A customizable substrate for concurrent languages",
  journal = j-SIGPLAN,
  volume = "27",
  number = "7",
  pages = "55--67",
  month = jul,
  year = "1992",
  CODEN = "SINODQ",
  ISBN = "0-89791-475-9",
  ISBN-13 = "978-0-89791-475-8",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  LCCN = "QA76.7.S53 1992",
  bibdate = "Sun Dec 14 09:16:22 MST 2003",
  bibsource = "Compendex database; http://www.acm.org/pubs/contents/proceedings/pldi/143095/index.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/143095/p55-jagannathan/",
  abstract = "We describe an approach to implementing a wide-range of concurrency paradigms in high-level (symbolic) programming languages. The focus of our discussion is STING, a dialect of Scheme, that supports lightweight threads of control and virtual processors as first-class objects.
Given the significant degree to which the behavior of these objects may be customized, we can easily express a variety of concurrency paradigms and linguistic structures within a common framework without loss of efficiency. Unlike parallel systems that rely on operating system services for managing concurrency, STING implements concurrency management entirely in terms of Scheme objects and procedures. It, therefore, permits users to optimize the runtime behavior of their applications without requiring knowledge of the underlying runtime system. This paper concentrates on (a) the implications of the design for building asynchronous concurrency structures, (b) organizing large-scale concurrent computations, and (c) implementing robust programming environments for symbolic computing.",
  acknowledgement = ack-nhfb,
  affiliation = "NEC Research Inst",
  affiliationaddress = "Princeton, NJ, USA",
  annote = "Published as part of the Proceedings of PLDI'92.",
  classification = "723.1",
  conference = "Proceedings of the ACM SIGPLAN '92 Conference on Programming Language Design and Implementation",
  conferenceyear = "1992",
  fjournal = "ACM SIGPLAN Notices",
  journalabr = "SIGPLAN Not",
  keywords = "algorithms; Computer programming languages; Concurrency paradigms; Concurrency structures; design; languages; Parallel processing systems; performance; Robust programming; Symbolic programming languages",
  meetingaddress = "San Francisco, CA, USA",
  meetingdate = "Jun 17--19 1992",
  meetingdate2 = "06/17--19/92",
  sponsor = "ACM",
  subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Concurrent, distributed, and parallel languages. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, SCHEME. {\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming, Parallel programming.",
}

@Article{Koopman:1992:CBC,
  author = "Philip J. {Koopman, Jr.} and Peter Lee and Daniel P.
Siewiorek",
  title = "Cache Behavior of Combinator Graph Reduction",
  journal = j-TOPLAS,
  volume = "14",
  number = "2",
  pages = "265--297",
  month = apr,
  year = "1992",
  CODEN = "ATPSDT",
  ISSN = "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L = "0164-0925",
  bibdate = "Sat Jan 06 14:28:31 1996",
  bibsource = "Compiler/Compiler.Lins.bib; Compiler/garbage.collection.bib; Compiler/Heaps.bib; Compiler/TOPLAS.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; Theory/CLiCS.bib",
  note = "Also see~\cite{Koopman:1992:CBC}.",
  URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/128867.html",
  abstract = "The results of cache-simulation experiments with an abstract machine for reducing combinator graphs are presented. The abstract machine, called TIGRE, exhibits reduction rates that, for similar kinds of combinator graphs on similar kinds of hardware, compare favorably with previously reported techniques. Furthermore, TIGRE maps easily and efficiently onto standard computer architectures, particularly those that allow a restricted form of self-modifying code. This provides some indication that the conventional ``stored program'' organization of computer systems is not necessarily an inappropriate one for functional programming language implementations.\par This is not to say, however, that present day computer systems are well equipped to reduce combinator graphs. In particular, the behavior of the cache memory has a significant effect on performance. In order to study and quantify this effect, trace-driven cache simulations of a TIGRE graph reducer running on a reduced instruction-set computer are conducted. The results of these simulations are presented with the following hardware-cache parameters varied: cache size, block size, associativity, memory update policy, and write-allocation policy.
To begin with, the cache organization of a commercially available system is used and then the performance sensitivity with respect to variations of each parameter are measured. From the results of the simulation study, a conclusion is made that combinator-graph reduction using TIGRE runs most efficiently when using a cache memory with an allocate-on-write-miss strategy, moderately large block size (preferably with subblock placement), and copy-back memory updates.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM Transactions on Programming Languages and Systems",
  keywords = "algorithms; languages; performance; theory; threading",
  sjb = "In amongst all the cache stuff is a description of how subroutine threading can form the basis for a relatively efficient method of performing combinator graph reduction.",
  subject = "{\bf B.3.2}: Hardware, MEMORY STRUCTURES, Design Styles, Cache memories. {\bf B.3.3}: Hardware, MEMORY STRUCTURES, Performance Analysis and Design Aids, Simulation. {\bf D.1.1}: Software, PROGRAMMING TECHNIQUES, Applicative (Functional) Programming. {\bf D.3.2}: Software, PROGRAMMING LANGUAGES, Language Classifications, Applicative languages. {\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Interpreters. {\bf G.2.1}: Mathematics of Computing, DISCRETE MATHEMATICS, Combinatorics.",
}

@Article{Nikhil:1992:MMP,
  author = "R. S. Nikhil and G. M.
Papadopoulos and Arvind",
  title = "{T}: a multithreaded massively parallel architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "20",
  number = "2",
  pages = "156--167",
  month = may,
  year = "1992",
  CODEN = "CANED2",
  ISSN = "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:43 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

@Article{Ogata:1992:DIH,
  author = "Kazuhiro Ogata and Satoshi Kurihara and Mikio Inari and Norihisa Doi",
  title = "The design and implementation of {HoME}",
  journal = j-SIGPLAN,
  volume = "27",
  number = "7",
  pages = "44--54",
  month = jul,
  year = "1992",
  CODEN = "SINODQ",
  ISBN = "0-89791-475-9",
  ISBN-13 = "978-0-89791-475-8",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  LCCN = "QA76.7.S53 1992",
  bibdate = "Sun Dec 14 09:16:22 MST 2003",
  bibsource = "Compendex database; http://www.acm.org/pubs/contents/proceedings/pldi/143095/index.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/143095/p44-ogata/",
  abstract = "HoME is a version of Smalltalk which can be efficiently executed on a multiprocessor and can be executed in parallel by combining a Smalltalk process with a Mach thread and executing the process on the thread. HoME is nearly the same as ordinary Smalltalk except that multiple processes may execute in parallel. Thus, almost all applications running on ordinary Smalltalk can be executed on HoME without changes in their code. HoME was designed and implemented based on the following fundamental policies: (1) theoretically, an infinite number of processes can become active; (2) the moment a process is scheduled, it becomes active; (3) no process switching occurs; (4) HoME is equivalent to ordinary Smalltalk except for the previous three policies.
The performance of the current implementation of HoME running on OMRON LUNA-88K, which had four processors, was measured by benchmarks which execute in parallel with multiple processes. In all benchmarks, the results showed that HoME's performance is much better than HPS on the same workstation.",
  acknowledgement = ack-nhfb,
  affiliation = "Keio Univ",
  affiliationaddress = "Yokohama, Jpn",
  annote = "Published as part of the Proceedings of PLDI'92.",
  classification = "723.1",
  conference = "Proceedings of the ACM SIGPLAN '92 Conference on Programming Language Design and Implementation",
  conferenceyear = "1992",
  fjournal = "ACM SIGPLAN Notices",
  journalabr = "SIGPLAN Not",
  keywords = "Computer programming; design; HPS on Mach environment; languages; measurement; Object oriented programming; performance; Smalltalk",
  meetingaddress = "San Francisco, CA, USA",
  meetingdate = "Jun 17--19 1992",
  meetingdate2 = "06/17--19/92",
  sponsor = "ACM",
  subject = "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming. {\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Smalltalk. {\bf D.2.8} Software, SOFTWARE ENGINEERING, Metrics, Performance measures.",
}

@InProceedings{Papadopoulos:1992:MCS,
  author = "G. M. Papadopoulos and A. P. W. Bohm and A. T. Dahbura and R. R. Oldehoeft",
  title = "Multithreaded computer systems",
  crossref = "IEEE:1992:PSM",
  pages = "772--775",
  year = "1992",
  bibdate = "Wed Apr 15 15:37:20 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture)",
  corpsource = "Lab. for Comput.
Sci., MIT, Cambridge, MA, USA",
  keywords = "architectural principles; data matching; multithreaded computer systems; parallel architectures; parallel machines; split-phase memory accesses",
  sponsororg = "IEEE; ACM",
  treatment = "P Practical",
}

@Article{Sato:1992:TBP,
  author = "Mitsuhisa Sato and Yuetsu Kodama and Shuichi Sakai and Yoshinori Yamaguchi and Yasuhito Koumura",
  title = "Thread-based programming for the {EM-4} hybrid dataflow machine",
  journal = j-COMP-ARCH-NEWS,
  volume = "20",
  number = "2",
  pages = "146--155",
  month = may,
  year = "1992",
  CODEN = "CANED2",
  ISSN = "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:43 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

@Article{Schwan:1992:MRT,
  author = "Karsten Schwan and Hongyi Zhou",
  title = "Multiprocessor real-time threads",
  journal = j-OPER-SYS-REV,
  volume = "26",
  number = "1",
  pages = "54--65",
  month = jan,
  year = "1992",
  CODEN = "OSRED8",
  ISSN = "0163-5980",
  ISSN-L = "0163-5980",
  bibdate = "Sat Aug 26 08:55:36 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "Operating Systems Review",
}

@Article{Singh:1992:DRS,
  author = "Gurjot Singh and Moses Joseph and Dave Barnett",
  title = "Debugging real-time systems",
  journal = j-DDJ,
  volume = "17",
  number = "9",
  pages = "70, 72, 74, 76--77, 116--117",
  month = sep,
  year = "1992",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Tue Sep 10 10:06:23 MDT 1996",
  bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database",
  abstract = "Modular and incremental development and debugging lead to reliable real-time systems that perform the functions they're designed to.
Our authors use this approach when building a simulated data-acquisition system.",
  acknowledgement = ack-nhfb,
  affiliation = "Lynx Real-Time Syst., Los Gatos, CA, USA",
  classification = "C6150G (Diagnostic, testing, debugging and evaluating systems)",
  fjournal = "Dr. Dobb's Journal of Software Tools",
  keywords = "Correctness; Debugging cycle; Ldb; POSIX; Real-time systems; User-friendly multithreaded debugger; Worst-case performance",
  thesaurus = "C listings; Program debugging; Real-time systems",
}

@Article{Singh:1992:DRT,
  author = "Gurjot Singh and Moses Joseph and Dave Barnett",
  title = "Debugging real-time systems",
  journal = j-DDJ,
  volume = "17",
  number = "9",
  pages = "70, 72, 74, 76--77, 116--117",
  month = sep,
  year = "1992",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Tue Sep 10 10:06:23 MDT 1996",
  bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database",
  abstract = "Modular and incremental development and debugging lead to reliable real-time systems that perform the functions they're designed to. Our authors use this approach when building a simulated data-acquisition system.",
  acknowledgement = ack-nhfb,
  affiliation = "Lynx Real-Time Syst., Los Gatos, CA, USA",
  classification = "C6150G (Diagnostic, testing, debugging and evaluating systems)",
  fjournal = "Dr.
Dobb's Journal of Software Tools",
  keywords = "Correctness; Debugging cycle; Ldb; POSIX; Real-time systems; User-friendly multithreaded debugger; Worst-case performance",
  thesaurus = "C listings; Program debugging; Real-time systems",
}

@PhdThesis{Young-Myers:1992:DTC,
  author = "Helene Wen-Hsin Young-Myers",
  title = "Database transitive closure: a performance study of multithreaded algorithms",
  type = "Thesis (Ph.D.)",
  school = "College of Business and Management, University of Maryland at College Park",
  address = "College Park, MD, USA",
  pages = "ix + 198",
  year = "1992",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Bic:1993:EUI,
  author = "Lubomir Bic and Mayez Al-Mouhamed",
  title = "The {EM-4} under Implicit Parallelism",
  journal = j-J-PAR-DIST-COMP,
  volume = "19",
  number = "3",
  pages = "255--261",
  month = nov,
  year = "1993",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1993.1109",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:18:53 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1109/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1109/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C6110P (Parallel programming)",
  corpsource = "Dept. of Inf. and Comput.
Sci., California Univ., Irvine, CA, USA",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "analysis; benchmark programs; data distribution; data-dependency; Data-Distributed Execution; DDE; EM-4; implicit parallelism; interprocessor communication; iteration-level parallelism; loops; multithreading; parallel architectures; parallel programming; parallelization",
  treatment = "P Practical; T Theoretical or Mathematical",
}

@InProceedings{Blumofe:1993:SES,
  author = "Robert D. Blumofe and Charles E. Leiserson",
  title = "Space-efficient scheduling of multithreaded computations",
  crossref = "ACM:1993:PTF",
  pages = "362--371",
  year = "1993",
  bibdate = "Wed Feb 20 18:34:01 MST 2002",
  bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/contents/proceedings/series/stoc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org/pubs/articles/proceedings/stoc/167088/p362-blumofe/p362-blumofe.pdf; http://www.acm.org/pubs/citations/proceedings/stoc/167088/p362-blumofe/",
  acknowledgement = ack-nhfb,
}

@PhdThesis{Boothe:1993:EMC,
  author = "Bob Boothe",
  title = "Evaluation of multithreading and caching in large shared memory parallel computers",
  type = "Thesis (Ph.D.)",
  school = "University of California, Berkeley, Computer Science Division",
  address = "Berkeley, CA, USA",
  pages = "ix + 169",
  month = jul,
  year = "1993",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note = "Also available as Report UCB/CSD 93/766.",
  acknowledgement = ack-nhfb,
  annote = "Supported in part by the Air Force Office of Scientific Research (AFOSR/JSEP), by the NSF, and by an NSF Infrastructure Grant.",
  keywords = "Multiprocessors",
}

@MastersThesis{Chong:1993:EMC,
  author = "Yong-Kim Chong",
  title = "Effects of memory consistency models on multithreaded multiprocessor performance",
  type = "Thesis (M.S.)",
  school = "University of Southern California",
  address = "Los Angeles, CA, USA",
pages = "viii + 89",
  year = "1993",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Culler:1993:TCC,
  author = "David E. Culler and Seth Copen Goldstein and Klaus Erik Schauser and Thorsten {Von Eicken}",
  title = "{TAM} -- {A} Compiler Controlled {Threaded Abstract Machine}",
  journal = j-J-PAR-DIST-COMP,
  volume = "18",
  number = "3",
  pages = "347--370",
  month = jul,
  year = "1993",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1993.1070",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:18:52 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1070/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1070/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture)",
  corpsource = "Div. of Comput. Sci., California Univ., Berkeley, CA, USA",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "dataflow execution models; parallel architectures; parallel programming; parallel threads; self-scheduled machine language; Threaded Abstract Machine",
  treatment = "P Practical",
}

@Article{Dillon:1993:VEM,
  author = "Laura K.
Dillon",
  title = "A visual execution model for {Ada} tasking",
  journal = j-TOSEM,
  volume = "2",
  number = "4",
  pages = "311--345",
  month = oct,
  year = "1993",
  CODEN = "ATSMER",
  ISSN = "1049-331X (print), 1557-7392 (electronic)",
  ISSN-L = "1049-331X",
  bibdate = "Fri Apr 20 08:21:35 MDT 2001",
  bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org/pubs/articles/journals/tosem/1993-2-4/p311-dillon/p311-dillon.pdf; http://www.acm.org/pubs/citations/journals/tosem/1993-2-4/p311-dillon/",
  abstract = "A visual execution model for Ada tasking can help programmers attain a deeper understanding of the tasking semantics. It can illustrate subtleties in semantic definitions that are not apparent in natural language design. We describe a contour model of Ada tasking that depicts asynchronous tasks (threads of control), relationships between the environments in which tasks execute, and the manner in which tasks interact. The use of this high-level execution model makes it possible to see what happens during execution of a program.
The paper provides an introduction to the contour model of Ada tasking and demonstrates its use.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM Transactions on Software Engineering and Methodology",
  generalterms = "Algorithms; Design; Languages",
  keywords = "contour model; visual execution model",
  subject = "Software --- Software Engineering --- Design Tools and Techniques (D.2.2); Software --- Software Engineering --- Programming Environments (D.2.6); Software --- Programming Languages --- Formal Definitions and Theory (D.3.1): {\bf Semantics}; Software --- Programming Languages --- Language Classifications (D.3.2): {\bf Ada}; Software --- Programming Languages --- Language Constructs and Features (D.3.3): {\bf Concurrent programming structures}; Software --- Programming Techniques --- Concurrent Programming (D.1.3); Theory of Computation --- Logics and Meanings of Programs --- Semantics of Programming Languages (F.3.2): {\bf Operational semantics}; Software --- Programming Languages --- Processors (D.3.4): {\bf Interpreters}",
}

@InProceedings{Doligez:1993:CGG,
  author = "Damien Doligez and Xavier Leroy",
  title = "A concurrent, generational garbage collector for a multithreaded implementation of {ML}",
  crossref = "ACM:1993:CRT",
  pages = "113--123",
  year = "1993",
  bibdate = "Mon May 3 12:45:53 MDT 1999",
  bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p113-doligez/",
  abstract = "This paper presents the design and implementation of a ``quasi real-time'' garbage collector for Concurrent Caml Light, an implementation of ML with threads. This two-generation system combines a fast, asynchronous copying collector on the young generation with a non-disruptive concurrent marking collector on the old generation.
This design crucially relies on the ML compile-time distinction between mutable and immutable objects.",
  acknowledgement = ack-nhfb,
  keywords = "algorithms; design; experimentation; languages; performance",
  subject = "{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Concurrent programming structures. {\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, LML.",
}

%%% Second author's family name is Zahorjan; it was previously
%%% misspelled with a leading J.
@Article{Eager:1993:CER,
  author = "Derek L. Eager and John Zahorjan",
  title = "Chores: Enhanced Run-Time Support for Shared-Memory Parallel Computing",
  journal = j-TOCS,
  volume = "11",
  number = "1",
  pages = "1--32",
  month = feb,
  year = "1993",
  CODEN = "ACSYEC",
  ISSN = "0734-2071",
  ISSN-L = "0734-2071",
  bibdate = "Wed Jan 13 18:36:53 MST 1999",
  bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-1/p1-eager/",
  abstract = "Parallel computing is increasingly important in the solution of large-scale numerical problems. The difficulty of efficiently hand-coding parallelism, and the limitations of parallelizing compilers, have nonetheless restricted its use by scientific programmers. In this paper we propose a new paradigm, {\em chores}, for the run-time support of parallel computing on shared-memory multiprocessors. We consider specifically uniform memory access shared-memory environments, although the chore paradigm should also be appropriate for use within the clusters of a large-scale nonuniform memory access machine. We argue that chore systems attain both the high efficiency of compiler approaches for the common case of data parallelism, and the flexibility and performance of user-level thread approaches for functional parallelism.
These benefits are achieved within a single, simple conceptual model that almost entirely relieves the programmer and compiler from concerns of granularity, scheduling, and enforcement of synchronization constraints. Measurements of a prototype implementation demonstrate that the chore model can be supported more efficiently than can traditional approaches to either data or functional parallelism alone.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM Transactions on Computer Systems",
  keywords = "design; measurement; performance",
  subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management. {\bf D.4.9} Software, OPERATING SYSTEMS, Systems Programs and Utilities. {\bf D.4.7} Software, OPERATING SYSTEMS, Organization and Design, Distributed systems. {\bf C.3} Computer Systems Organization, SPECIAL-PURPOSE AND APPLICATION-BASED SYSTEMS. {\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS.",
}

@MastersThesis{Estep:1993:LMM,
  author = "James L. Estep",
  title = "Lightweight multithreaded multimedia conference server",
  type = "Thesis (M.S.)",
  school = "West Virginia University",
  address = "Morgantown, WV, USA",
  pages = "vi + 57",
  year = "1993",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "Electronic data processing -- Distributed processing; Multimedia systems",
}

@PhdThesis{Fan:1993:LMC,
  author = "Xiaoming Fan",
  title = "Latency-directed multithreaded computation and its architectural support",
  type = "Thesis (Ph.D.)",
  school = "Universit{\"a}t Hamburg",
  address = "Aachen, Germany",
  pages = "xi + 174 + 22 + 11",
  year = "1993",
  ISBN = "3-8265-0021-0",
  ISBN-13 = "978-3-8265-0021-3",
  ISSN = "0945-0807",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note = "Summary in German.",
  series = "Berichte aus der Informatik",
  acknowledgement = ack-nhfb,
  keywords = "Computer architecture;
Parallel processing (Electronic computers)",
}

@Article{Gao:1993:DMA,
  author = "Guang Gao and Jean-Luc Gaudiot and Lubomir Bic",
  title = "Dataflow and Multithreaded Architectures: {Guest Editors}' Introduction",
  journal = j-J-PAR-DIST-COMP,
  volume = "18",
  number = "3",
  pages = "271--??",
  month = jul,
  year = "1993",
  CODEN = "JPDCER",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Sat Apr 12 16:10:59 MDT 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "Journal of Parallel and Distributed Computing",
  xxnote = "Issue missing from UofUtah Marriott Library??",
}

%%% classification: the semicolon separator that was missing between the
%%% C6110P and C6150N codes has been restored, so every code is
%%% delimited the same way as in the other entries.
@Article{Gao:1993:EHD,
  author = "G. R. Gao",
  title = "An Efficient Hybrid Dataflow Architecture Model",
  journal = j-J-PAR-DIST-COMP,
  volume = "19",
  number = "4",
  pages = "293--307",
  month = dec,
  year = "1993",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1993.1113",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:18:53 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1113/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1113/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C6110P (Parallel programming); C6150N (Distributed systems); C6150C (Compilers, interpreters and other processors)",
  corpsource = "Adv. Comput. Archit.
and Program Structures Group, Montreal Univ., Que., Canada", fjournal = "Journal of Parallel and Distributed Computing", keywords = "architecture technique; compiling paradigm; concurrent operation; conventional; data-driven instruction; data-driven scheduling scheme; dataflow computers; dataflow software pipelining; efficient hybrid dataflow architecture model; execution; fast pipelined instruction; fine-grain parallelism; hybrid; limited balancing; loop parallelism; multiple instruction; parallel architectures; parallel programming; pipeline; processing; program compilers; scheduling; simple greedy runtime; space efficiency; threads", treatment = "P Practical", } @Book{Gao:1993:SID, author = "Guang R. Gao and Jean-Luc Gaudiot and Lubomir Bic", title = "Special issue on dataflow and multithreaded architectures", publisher = pub-AP, address = pub-AP:adr, pages = "271--389", year = "1993", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Journal of parallel and distributed computing; v. 18, no. 3", acknowledgement = ack-nhfb, } @InProceedings{Giering:1993:IAF, author = "E. W. Giering and F. Mueller and T. P. 
Baker", title = "Implementing {Ada 9X} Features using {POSIX} Threads: Design Issues", crossref = "ACM:1993:TCS", pages = "214--228", year = "1993", bibdate = "Sat Jul 05 17:12:34 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Hauser:1993:UTI, author = "Carl Hauser and Christian Jacobi and Marvin Theimer and Brent Welch and Mark Weiser", title = "Using threads in interactive systems: a case study", journal = j-OPER-SYS-REV, volume = "27", number = "5", pages = "94--105", month = dec, year = "1993", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:54 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Hayden:1993:BIC, author = "Charles Hayden", title = "A brief introduction to {Concurrent Pascal}", journal = j-SIGPLAN, volume = "28", number = "3", pages = "353--354", month = mar, year = "1993", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:34 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/154766/p353-hayden/", abstract = "Concurrent Pascal is designed for writing concurrent programs such as operating systems and real-time monitoring systems on shared-memory computers. A separate language, Sequential Pascal, is used as the language for applications programs run by operating systems written in Concurrent Pascal. Both languages are extensions of Wirth's Pascal, and share a common threaded code interpreter. 
The article describes how Concurrent Pascal differs from Wirth's Pascal.", acknowledgement = ack-nhfb, affiliation = "AT and T Bell Labs., Middletown, NJ, USA", classification = "C6110P (Parallel programming); C6140D (High level languages)", confdate = "20-23 April 1993", conflocation = "Cambridge, MA, USA", confname = "HOPL-II. The second ACM SIGPLAN conference on History of programming languages, April 20--23, 1993, Cambridge, MA", confsponsor = "ACM", fjournal = "ACM SIGPLAN Notices", keywords = "Concurrent Pascal; languages; Operating systems; Real-time monitoring systems; Sequential Pascal; Shared-memory computers; Threaded code interpreter", subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Concurrent Pascal. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Pascal. {\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Procedures, functions, and subroutines.", thesaurus = "Parallel languages; Pascal", } @Article{Hidaka:1993:MTC, author = "Yasuo Hidaka and Hanpei Koike and Hidehiko Tanaka", title = "Multiple threads in cyclic register windows", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "131--142", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Hsieh:1993:CME, author = "Wilson C. Hsieh and Paul Wang and William E. 
Weihl", title = "Computation migration: enhancing locality for distributed-memory parallel systems", journal = j-SIGPLAN, volume = "28", number = "7", pages = "239--248", month = jul, year = "1993", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:39 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Computation migration is a technique that is based on compile-time program transformation, for accessing remote data in a distributed-memory parallel system. In contrast with RPC-style access, where the access is performed remotely, and with data migration, where the data is moved so that it is local, computation migration moves part of the current thread to the processor where the data resides. The access is performed at the remote processor, and the migrated thread portion continues to run on that same processor; this makes subsequent accesses in the thread portion local. The authors describe an implementation of computation migration that consists of two parts: an implementation that migrates single activation frames, and a high-level language annotation that allows a programmer to express when migration is desired. They performed experiments using two applications; these experiments demonstrate that computation migration is a valuable alternative to RPC and data migration.", acknowledgement = ack-nhfb, affiliation = "Lab. of Comput. 
Sci., MIT, Cambridge, MA, USA", classification = "C6110P (Parallel programming); C6120 (File organisation); C6150C (Compilers, interpreters and other processors)", confdate = "19-22 May 1993", conflocation = "San Diego, CA, USA", confsponsor = "ACM", fjournal = "ACM SIGPLAN Notices", keywords = "Compile-time program transformation; Computation migration; Current thread; Distributed-memory parallel system; High-level language annotation; Remote data; Remote processor; Single activation frames", thesaurus = "Distributed memory systems; Parallel programming; Program compilers; Storage management", } @Article{Huelsbergen:1993:CCG, author = "Lorenz Huelsbergen and James R. Larus", title = "A concurrent copying garbage collector for languages that distinguish (im)mutable data", journal = j-SIGPLAN, volume = "28", number = "7", pages = "73--82", month = jul, year = "1993", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:39 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Dept. of Comput. Sci., Wisconsin-Madison Univ., WI, USA", classification = "C6110P (Parallel programming); C6120 (File organisation); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems)", confdate = "19-22 May 1993", conflocation = "San Diego, CA, USA", confsponsor = "ACM", fjournal = "ACM SIGPLAN Notices", keywords = "Concurrent collection; Concurrent compacting garbage collector; Garbage-collection pauses; Immutable data; Minimal mutator/collector synchronization; Multiple mutator threads; Mutable data; Pure functional languages; Shared-memory parallel computers; Standard ML compiler", thesaurus = "Parallel programming; Program compilers; Shared memory systems; Storage allocation; Storage management", } @InProceedings{Klarlund:1993:GT, author = "Nils Klarlund and Michael I. 
Schwartzbach", title = "Graph types", crossref = "ACM:1993:CRT", pages = "196--205", year = "1993", bibdate = "Mon May 3 12:45:53 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p196-klarlund/", abstract = "Recursive data structures are abstractions of simple records and pointers. They impose a shape invariant, which is verified at compile-time and exploited to automatically generate code for building, copying, comparing, and traversing values without loss of efficiency. However, such values are always tree shaped, which is a major obstacle to practical use. We propose a notion of graph types, which allow common shapes, such as doubly-linked lists or threaded trees, to be expressed concisely and efficiently. We define regular languages of routing expressions to specify relative addresses of extra pointers in a canonical spanning tree. An efficient algorithm for computing such addresses is developed. We employ a second-order monadic logic to decide well-formedness of graph type specifications. This logic can also be used for automated reasoning about pointer structures.", acknowledgement = ack-nhfb, keywords = "algorithms; languages; theory", subject = "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Type structure. {\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Data types and structures. {\bf F.2.2} Theory of Computation, ANALYSIS OF ALGORITHMS AND PROBLEM COMPLEXITY, Nonnumerical Algorithms and Problems, Computations on discrete structures. 
{\bf G.2.2} Mathematics of Computing, DISCRETE MATHEMATICS, Graph Theory, Trees.", } @Article{Lee:1993:TW, author = "David Lee", title = "Threads for {Windows} 3", journal = j-DDJ, volume = "18", number = "10", pages = "84--??", month = "Fall", year = "1993", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:44 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", note = "Special Issue: Windows Sourcebook.", abstract = "Unlike NT, Windows 3 doesn't provide direct support for threads. With the techniques David illustrates here, you can implement non-preemptive threads in Windows 3.", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Lim:1993:WAS, author = "Beng-Hong Lim and Anant Agarwal", title = "Waiting Algorithms for Synchronization in Large-Scale Multiprocessors", journal = j-TOCS, volume = "11", number = "3", pages = "253--294", month = aug, year = "1993", CODEN = "ACSYEC", ISSN = "0734-2071", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-3/p253-lim/", abstract = "Through analysis and experiments, this paper investigates two-phase waiting algorithms to minimize the cost of waiting for synchronization in large-scale multiprocessors. In a two-phase algorithm, a thread first waits by polling a synchronization variable. If the cost of polling reaches a limit {\em Lpoll\/} and further waiting is necessary, the thread is blocked, incurring an additional fixed cost, {\em B}. The choice of {\em Lpoll\/} is a critical determinant of the performance of two-phase algorithms. 
We focus on methods for statically determining {\em Lpoll\/} because the run-time overhead of dynamically determining {\em Lpoll\/} can be comparable to the cost of blocking in large-scale multiprocessor systems with lightweight threads. Our experiments show that {\em always-block\/} ({\em Lpoll\/} = 0) is a good waiting algorithm with performance that is usually close to the best of the algorithms compared. We show that even better performance can be achieved with a static choice of {\em Lpoll\/} based on knowledge of likely wait-time distributions. Motivated by the observation that different synchronization types exhibit different wait-time distributions, we prove that a static choice of {\em Lpoll\/} can yield close to optimal on-line performance against an adversary that is restricted to choosing wait times from a fixed family of probability distributions. This result allows us to make an optimal static choice of {\em Lpoll\/} based on synchronization type. For exponentially distributed wait times, we prove that setting {\em Lpoll\/} = ln(e-1){\em B\/} results in a waiting cost that is no more than {\em e/(e-1)\/} times the cost of an optimal off-line algorithm. For uniformly distributed wait times, we prove that setting {\em Lpoll\/} = 1/2(square root of 5 - 1){\em B\/} results in a waiting cost that is no more than (square root of 5 + 1)/2 (the golden ratio) times the cost of an optimal off-line algorithm. Experimental measurements of several parallel applications on the Alewife multiprocessor simulator corroborate our theoretical findings.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", keywords = "algorithms; experimentation; performance; theory", subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Synchronization. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Mutual exclusion. {\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS. 
{\bf C.1.2} Computer Systems Organization, PROCESSOR ARCHITECTURES, Multiple Data Stream Architectures (Multiprocessors), Parallel processors**. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance, Measurements. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance, Stochastic analysis.", } @Article{McCann:1993:DPA, author = "Cathy McCann and Raj Vaswani and John Zahorjan", title = "A Dynamic Processor Allocation Policy for Multiprogrammed Shared-Memory Multiprocessors", journal = j-TOCS, volume = "11", number = "2", pages = "146--178", month = may, year = "1993", CODEN = "ACSYEC", ISSN = "0734-2071", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1993-11-2/p146-mccann/", abstract = "We propose and evaluate empirically the performance of a dynamic processor-scheduling policy for multiprogrammed shared-memory multiprocessors. The policy is dynamic in that it reallocates processors from one parallel job to another based on the currently realized parallelism of those jobs. The policy is suitable for implementation in production systems in that: ---It interacts well with very efficient user-level thread packages, leaving to them many low-level thread operations that do not require kernel intervention. ---It deals with thread blocking due to user I/O and page faults. ---It ensures fairness in delivering resources to jobs. ---Its performance, measured in terms of average job response time, is superior to that of previously proposed schedulers, including those implemented in existing systems. It provides good performance to very short, sequential (e.g., interactive) requests. We have evaluated our scheduler and compared it to alternatives using a set of prototype implementations running on a Sequent Symmetry multiprocessor. 
Using a number of parallel applications with distinct qualitative behaviors, we have both evaluated the policies according to the major criterion of overall performance and examined a number of more general policy issues, including the advantage of ``space sharing'' over ``time sharing'' the processors of a multiprocessor, and the importance of cooperation between the kernel and the application in reallocating processors between jobs. We have also compared the policies according to other criteria important in real implementations, in particular, fairness and response time to short, sequential requests. We conclude that a combination of performance and implementation considerations makes a compelling case for our dynamic scheduling policy.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", keywords = "design; measurement; performance", subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Scheduling. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Multiprocessing/multiprogramming/multitasking. {\bf C.1.2} Computer Systems Organization, PROCESSOR ARCHITECTURES, Multiple Data Stream Architectures (Multiprocessors).", } @Article{Morrisett:1993:PLP, author = "J. Gregory Morrisett and Andrew P. Tolmach", title = "Procs and locks: a portable multiprocessing platform for {Standard ML} of {New Jersey}", journal = j-SIGPLAN, volume = "28", number = "7", pages = "198--207", month = jul, year = "1993", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:39 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A portable platform has been built for running Standard ML of New Jersey programs on multiprocessors. It can be used to implement user-level thread packages for multiprocessors within the ML language with first-class continuations. 
The platform supports experimentation with different thread scheduling policies and synchronization constructs. It has been used to construct a Modula-3 style thread package and a version of Concurrent ML, and has been ported to three different multiprocessors running variants of Unix. The authors describe the platform's design, implementation, and performance.", acknowledgement = ack-nhfb, affiliation = "Carnegie Mellon Univ., Pittsburgh, PA, USA", classification = "C6110P (Parallel programming); C6140D (High level languages); C6150C (Compilers, interpreters and other processors)", confdate = "19-22 May 1993", conflocation = "San Diego, CA, USA", confsponsor = "ACM", fjournal = "ACM SIGPLAN Notices", keywords = "Concurrent ML; First-class continuations; Functional language; Modula-3 style thread package; New Jersey programs; Portable multiprocessing platform; Portable platform; Standard ML; Synchronization constructs; Thread scheduling policies; User-level thread packages", thesaurus = "Multiprocessing systems; Parallel languages; Parallel programming; Scheduling", } @Article{Najjar:1993:QAD, author = "Walid A. Najjar and A. P. Wim Bohm and W. 
Marcus Miller", title = "A Quantitative Analysis of Dataflow Program Execution --- Preliminaries to a Hybrid Design", journal = j-J-PAR-DIST-COMP, volume = "18", number = "3", pages = "314--326", month = jul, year = "1993", CODEN = "JPDCER", DOI = "http://dx.doi.org/10.1006/jpdc.1993.1067", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:52 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1067/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1067/production/pdf", acknowledgement = ack-nhfb, classification = "C6110B (Software engineering techniques); C6110P (Parallel programming)", corpsource = "Dept. of Comput. Sci., Colorado State Univ., Fort Collins, CO, USA", fjournal = "Journal of Parallel and Distributed Computing", keywords = "benchmarks; dataflow program execution; dynamic measure; fine grain intrathread locality; instruction level locality; parallel programming; software metrics", treatment = "T Theoretical or Mathematical", } @Article{Natarajan:1993:PVM, author = "Venkat Natarajan and Derek Chiou and Boon Seong Ang", title = "Performance visualization on {Monsoon}", journal = j-J-PAR-DIST-COMP, volume = "18", number = "2", pages = "169--180", month = jun, year = "1993", CODEN = "JPDCER", DOI = "http://dx.doi.org/10.1006/jpdc.1993.1054", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:52 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1054/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1054/production/pdf", acknowledgement = ack-nhfb, classification = "C5440 
(Multiprocessor systems and techniques); C5470 (Performance evaluation and testing); C7430 (Computer engineering)", corpsource = "Motorola Cambridge Res. Center, MA, USA", fjournal = "Journal of Parallel and Distributed Computing", keywords = "algorithm; application program; compiler; computer evaluation; data analysis; data collection; data visualisation; MIT; Monsoon; Motorola; multiprocessor machine; multithreaded; operating system; parallel machine; parallel machines; performance evaluation; performance evaluation tool; programming language; visualization", treatment = "P Practical", } @InProceedings{Odersky:1993:CNA, author = "Martin Odersky and Dan Rabin and Paul Hudak", title = "Call by name, assignment, and the lambda calculus", crossref = "ACM:1993:CRT", pages = "43--56", year = "1993", bibdate = "Mon May 3 12:45:53 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/158511/p43-odersky/", abstract = "We define an extension of the call-by-name lambda calculus with additional constructs and reduction rules that represent mutable variables and assignments. The extended calculus has neither a concept of an explicit store nor a concept of evaluation order; nevertheless, we show that programs in the calculus can be implemented using a single-threaded store. We also show that the new calculus has the Church--Rosser property and that it is a conservative extension of classical lambda calculus with respect to operational equivalence; that is, all algebraic laws of the functional subset are preserved.", acknowledgement = ack-nhfb, keywords = "languages; theory", subject = "{\bf F.4.1} Theory of Computation, MATHEMATICAL LOGIC AND FORMAL LANGUAGES, Mathematical Logic, Lambda calculus and related systems. 
{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Type structure.", } @Article{Plauger:1993:MCS, author = "Dave Plauger", title = "Making {C++} Safe for Threads", journal = j-CUJ, volume = "11", number = "2", pages = "58--??", month = feb, year = "1993", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @InProceedings{Raghunath:1993:DIN, author = "M. T. Raghunath and Abhiram Ranade", title = "Designing Interconnection Networks for Multi-Level Packaging", crossref = "IEEE:1993:PSP", pages = "772--781", year = "1993", bibdate = "Wed Apr 15 12:04:03 MDT 1998", bibsource = "Compendex database; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Univ of California", affiliationaddress = "Berkeley, CA, USA", classification = "723; C5220P (Parallel architecture); C5440 (Multiprocessing systems)", corpsource = "Comput. Sci. 
Div., California Univ., Berkeley, CA, USA", keywords = "communication bandwidth; complete graphs; Computer networks; generic set; global communication performance; high bandwidth channels; high degree deBruijn graphs; Interconnection network design; interconnection networks design; Large scale parallel machines; large scale parallel machines; latencies; Multilevel packaging; multilevel packaging; multiprocessor interconnection networks; multithreading; network organizations; network topology; packaging; packaging constraints; packaging hierarchy; packaging restrictions; packaging technology; Parallel processing systems; Random traffic model; random traffic model", sponsororg = "IEEE; ACM SIGARCH", treatment = "P Practical", } @MastersThesis{Rajagopal:1993:DMI, author = "Arjun Rajagopal", title = "Design of a multithreaded instruction cache for a hyperscalar processor", type = "Thesis (M.S.)", school = "Department of Electrical Engineering, Texas A\&M University", address = "College Station, TX, USA", pages = "ix + 84", year = "1993", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Major electrical engineering", } @MastersThesis{Srinivasan:1993:SDS, author = "Sumathi Srinivasan", title = "System design and simulation for the {Demus-2} multithreaded processor", type = "Thesis (M. Eng.)", school = "Department of Electrical and Computer Engineering, McMaster University", address = "Hamilton, ON, Canada", pages = "x + 109", year = "1993", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Computers, Pipeline; McMaster University. -- Dissertations; Parallel processing (Electronic computers)", } @Article{Volkman:1993:CCP, author = "Victor R. 
Volkman", title = "Convert {C} Programs into Multithreaded Applications", journal = j-CUJ, volume = "11", type = "User Report", number = "4", pages = "87--??", month = apr, year = "1993", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Volkman:1993:CDB, author = "Victor R. Volkman and John English", title = "Class {{\tt DOSThread}}: {A} Base Class for Multithreaded {DOS} Programs", journal = j-CUJ, volume = "11", type = "CUG library disk documentation", number = "12", pages = "113--??", month = dec, year = "1993", ISSN = "0898-9788", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C Users Journal", } @Article{Waldspurger:1993:RRF, author = "Carl A. Waldspurger and William E. 
Weihl", title = "Register relocation: flexible contexts for multithreading", journal = j-COMP-ARCH-NEWS, volume = "21", number = "2", pages = "120--130", month = may, year = "1993", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:46 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @TechReport{Young-Myers:1993:ESTa, author = "Helene Young-Myers and Louiqa Raschid", title = "An experimental study of three dataflow paradigms in multithreaded database transitive closure algorithms on shared memory multiprocessors", type = "Technical report", number = "CS-TR-3060; UMIACS-TR-93-33", institution = inst-U-MARYLAND, address = inst-U-MARYLAND:adr, pages = "21", month = apr, year = "1993", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "To appear in a special issue of the Journal of Parallel and Distributed Computing on Dataflow and Multithreaded Architectures, July, 1993.", abstract = "This paper describes an experimental study of three dataflow paradigms, namely, no dataflow, pipelined dataflow, and network dataflow, in multithreaded database transitive closure algorithms on shared memory multiprocessors. This study shows that dataflow paradigm directly influences performance parameters such as the amount of interthread communication, how data are partitioned among the threads, whether access to each page of data is exclusive or shared, whether locks are needed for concurrency control, and how calculation termination is detected. The algorithm designed with no dataflow outperforms the algorithms with dataflow. Approximately linear speedup is achieved by the no dataflow algorithm with sufficient workload and primary memory. 
An exclusive access working set model and a shared access working set model describe the interactions between two or more threads' working sets when access to each page of data is exclusive or shared among the threads, respectively. These models are experimentally verified.", acknowledgement = ack-nhfb, annote = "Supported in part by the National Science Foundation.", keywords = "Data flow computing; Multiprocessors", } @Article{Young-Myers:1993:ESTb, author = "Helene Young-Myers and Louiqa Raschid", title = "An Experimental Study of Three Dataflow Paradigms in Multithreaded Database Transitive Closure Algorithms on Shared Memory Multiprocessors", journal = j-J-PAR-DIST-COMP, volume = "18", number = "3", pages = "371--389", month = jul, year = "1993", CODEN = "JPDCER", DOI = "http://dx.doi.org/10.1006/jpdc.1993.1071", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:52 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1071/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1993.1071/production/pdf", acknowledgement = ack-nhfb, classification = "C5220P (Parallel architecture); C5470 (Performance evaluation and testing); C6160 (Database management systems (DBMS))", corpsource = "Maryland Univ., College Park, MD, USA", fjournal = "Journal of Parallel and Distributed Computing", keywords = "architectures; calculation termination; concurrency control; database management systems; dataflow; dataflow paradigms; exclusive access; interthread communication; linear; network; no dataflow; parallel; performance evaluation; performance parameters; pipelined dataflow; shared access; shared memory systems; speedup", treatment = "P Practical", } @InProceedings{Alfieri:1994:EKI, author = "R. A. 
Alfieri", title = "An Efficient Kernel-Based Implementation of {POSIX} Threads", crossref = "Anonymous:1994:USC", pages = "59--72", year = "1994", bibdate = "Sat May 25 07:59:58 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @InProceedings{Anonymous:1994:DCT, author = "Anonymous", title = "On the Design of {Chant}: {A} Talking Threads Package", crossref = "IEEE:1994:PSW", pages = "350--359", year = "1994", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Anonymous:1994:MDP, author = "Anonymous", title = "{Multiprocessor desktops are proliferating, even though there remains a shortage of multithreaded applications for them}", journal = j-OPEN-SYSTEMS-TODAY, volume = "165", pages = "60--??", month = dec, year = "1994", ISSN = "1061-0839", bibdate = "Fri Jan 26 17:24:01 MST 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Open Systems Today", } @Article{Anonymous:1994:SIP, author = "Anonymous", title = "Special issue: panel sessions of the {1991 Workshop on Multithreaded Computers, November 22, 1991, Albuquerque, New Mexico, in conjunction with Supercomputing '91}", journal = j-COMP-ARCH-NEWS, volume = "22", number = "1", pages = "2--33", year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @InProceedings{Baker:1994:EPP, author = "T. P. 
Baker and Frank Mueller and Viresh Rustagi",
  title = "Experience with a Prototype of the {POSIX} ``Minimal Realtime System Profile''",
  crossref = "IEEE:1994:ROS",
  pages = "12--17",
  year = "1994",
  bibdate = "Sat May 25 07:59:58 MDT 1996",
  bibsource = "Compendex database; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "This paper describes experience prototyping the proposed IEEE standard `minimal realtime system profile', whose primary component is support for real-time threads. It provides some background, describes the implementation, and reports preliminary performance measurements.",
  acknowledgement = ack-nhfb,
  affiliation = "Florida State Univ",
  affiliationaddress = "Tallahassee, FL, USA",
  classification = "722.4; 723.1; 723.1.1; 723.2",
  conference = "Proceedings of the 11th IEEE Workshop on Real-Time Operating Systems and Software",
  conferenceyear = "1994",
  journalabr = "Proc IEEE Workshop Real Time Oper Syst Software",
  keywords = "Computer operating systems; Computer software portability; Data structures; High level languages; Interfaces (computer); Mesa programming language; Minimal real time system profile; Program processors; Real time systems; Thread; Thread management; Thread priority scheduling",
  meetingaddress = "Seattle, WA, USA",
  meetingdate = "May 18--19 1994",
  meetingdate2 = "05/18--19/94",
  publisherinfo = "Computer Society Press",
  sponsor = "IEEE Computer Society",
}

@Article{Baquero:1994:CAC,
  author = "Carlos Baquero and Francisco Moura",
  title = "Concurrency Annotations in {C++}",
  journal = j-SIGPLAN,
  volume = "29",
  number = "7",
  pages = "61--67",
  month = jul,
  year = "1994",
  CODEN = "SINODQ",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Sun Dec 14 09:16:53 MST 2003",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6110P (Parallel programming); C6140D (High level languages)",
  corpsource = "DI/INESC, Minho Univ., Portugal",
  fjournal = "ACM SIGPLAN Notices",
  keywords = "access flag; C language; C++; concurrency annotations; inheritance; inheritance chain; language extension; method code; method invocations; method predicates; multiple threads; object-oriented languages; parallel languages; shared-memory multiprocessor system; synchronisation; synchronization code; synchronization mechanisms",
  treatment = "P Practical",
}

@InProceedings{Blumofe:1994:SMC,
  author = "R. D. Blumofe and C. E. Leiserson",
  title = "Scheduling multithreaded computations by work stealing",
  crossref = "Goldwasser:1994:PAS",
  pages = "356--368",
  year = "1994",
  bibdate = "Thu Apr 5 06:13:51 MDT 2001",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Buhr:1994:TRM,
  author = "R. J. A. Buhr and R. S. Casselman",
  title = "Timethread-Role Maps for Object-Oriented Design of Real-Time-and-Distributed Systems",
  journal = j-SIGPLAN,
  volume = "29",
  number = "10",
  pages = "301--301",
  month = oct,
  year = "1994",
  CODEN = "SINODQ",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Fri Apr 24 18:36:02 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6150N (Distributed systems)",
  conflocation = "Portland, OR, USA; 23-27 Oct. 1994",
  conftitle = "Ninth Annual Conference on Object-Oriented Programming Systems, Languages, and Applications. OOPSLA '94",
  corpsource = "Dept. of Syst. and Comput. Eng., Carleton Univ., Ottawa, Ont., Canada",
  fjournal = "ACM SIGPLAN Notices",
  keywords = "concurrency; distributed processing; distributed systems; dynamic structure; end-to-end responsibility paths; object-oriented approach; object-oriented design; object-oriented design methods; object-oriented methods; object-oriented programming; real-time systems; real-time systems oriented programming; responsibility-driven design; timethread-role maps",
  sponsororg = "ACM",
  treatment = "P Practical",
}

@InProceedings{Bundgen:1994:FPC,
  author = "Reinhard B{\"u}ndgen and Manfred G{\"o}bel and Wolfgang K{\"u}chlin",
  title = "A fine-grained parallel completion procedure",
  crossref = "ACM:1994:IPI",
  pages = "269--277",
  year = "1994",
  bibdate = "Thu Mar 12 08:41:19 MST 1998",
  bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org:80/pubs/citations/proceedings/issac/190347/p269-bundgen/",
  abstract = "We present a parallel Knuth--Bendix completion algorithm where the inner loop, deriving the consequences of adding a new rule to the system, is multithreaded. The selection of the best new rule in the outer loop, and hence the completion strategy, is exactly the same as for the sequential algorithm. Our implementation, which is within the PARSAC-2 parallel symbolic computation system, exhibits good parallel speedups on a standard multiprocessor workstation.",
  acknowledgement = ack-nhfb,
  affiliation = "Wilhelm-Schickard-Inst.
f{\"u}r Inf., T{\"u}bingen Univ., Germany",
  classification = "C4210L (Formal languages and computational linguistics); C4240P (Parallel programming and algorithm theory); C6130 (Data handling techniques); C6150N (Distributed systems software); C7310 (Mathematics computing)",
  keywords = "algorithms; Fine grained parallel completion procedure; Fine-grained parallel completion procedure; Multithreaded inner loop; Parallel Knuth--Bendix completion algorithm; Parallel speedups; PARSAC-2 parallel symbolic computation system; Standard multiprocessor workstation",
  subject = "{\bf I.1.2} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, Algorithms, Algebraic algorithms. {\bf I.1.0} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, General. {\bf I.1.3} Computing Methodologies, SYMBOLIC AND ALGEBRAIC MANIPULATION, Languages and Systems. {\bf F.4.2} Theory of Computation, MATHEMATICAL LOGIC AND FORMAL LANGUAGES, Grammars and Other Rewriting Systems, Parallel rewriting systems. {\bf F.1.2} Theory of Computation, COMPUTATION BY ABSTRACT DEVICES, Modes of Computation, Parallelism and concurrency.",
  thesaurus = "Parallel algorithms; Parallel machines; Rewriting systems; Symbol manipulation",
}

@Article{Carter:1994:HSF,
  author = "Nicholas P. Carter and Stephen W. Keckler and William J.
Dally",
  title = "Hardware support for fast capability-based addressing",
  journal = j-SIGPLAN,
  volume = "29",
  number = "11",
  pages = "319--327",
  month = nov,
  year = "1994",
  CODEN = "SINODQ",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Sun Dec 14 09:16:57 MST 2003",
  bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p319-carter/",
  abstract = "Traditional methods of providing protection in memory systems do so at the cost of increased context switch time and/or increased storage to record access permissions for processes. With the advent of computers that supported cycle-by-cycle multithreading, protection schemes that increase the time to perform a context switch are unacceptable, but protecting unrelated processes from each other is still necessary if such machines are to be used in non-trusting environments. This paper examines {\em guarded pointers\/}, a hardware technique which uses tagged 64-bit pointer objects to implement capability-based addressing. Guarded pointers encode a segment descriptor into the upper bits of every pointer, eliminating the indirection and related performance penalties associated with traditional implementations of capabilities. All processes share a single 54-bit virtual address space, and access is limited to the data that can be referenced through the pointers that a process has been issued. Only one level of address translation is required to perform a memory reference. Sharing data between processes is efficient, and protection states are defined to allow fast protected subsystem calls and create unforgeable data keys.",
  acknowledgement = ack-nhfb,
  classification = "C5310 (Storage system design); C6120 (File organisation); C6150N (Distributed systems software)",
  conflocation = "San Jose, CA, USA; 4-7 Oct. 1994",
  conftitle = "Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI)",
  corpsource = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
  fjournal = "ACM SIGPLAN Notices",
  keywords = "54-bit virtual address space; address translation; capability based addressing; cycle-by-cycle multithreading; design; fast capability-based addressing; fast protected subsystem calls; guarded pointers; hardware support; hardware technique; memory architecture; memory bit virtual address space; memory reference; memory systems; multiprocessing programs; performance; protection schemes; protection states; segment descriptor; storage allocation; tagged 64-bit pointer objects; theory; unforgeable data keys; virtual storage",
  sponsororg = "ACM; IEEE Comput. Soc",
  subject = "{\bf C.0} Computer Systems Organization, GENERAL, Instruction set design. {\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS.",
  treatment = "P Practical",
}

@Book{Catanzaro:1994:MSA,
  author = "Ben J. Catanzaro",
  title = "Multiprocessor system architectures: a technical survey of multiprocessor\slash multithreaded systems using {SPARC}, multilevel bus architectures and {Solaris} {(SunOS)}",
  publisher = pub-PHPTR,
  address = pub-PHPTR:adr,
  pages = "xxxii + 493",
  year = "1994",
  ISBN = "0-13-089137-1",
  ISBN-13 = "978-0-13-089137-2",
  LCCN = "QA76.5.C3864 1994",
  bibdate = "Fri Aug 7 08:29:38 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "computer architecture; multiprocessors; sun computers",
}

@Article{Chase:1994:SPS,
  author = "Jeffrey S. Chase and Henry M. Levy and Michael J. Feeley and Edward D.
Lazowska",
  title = "Sharing and Protection in a Single-Address-Space Operating System",
  journal = j-TOCS,
  volume = "12",
  number = "4",
  pages = "271--307",
  month = nov,
  year = "1994",
  CODEN = "ACSYEC",
  ISSN = "0734-2071",
  ISSN-L = "0734-2071",
  bibdate = "Wed Jan 13 18:36:53 MST 1999",
  bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1994-12-4/p271-chase/",
  abstract = "This article explores memory sharing and protection support in Opal, a single-address-space operating system designed for wide-address (64-bit) architectures. Opal threads execute within protection domains in a single shared virtual address space. Sharing is simplified, because addresses are context independent. There is no loss of protection, because addressability and access are independent; the right to access a segment is determined by the protection domain in which a thread executes. This model enables beneficial code-and data-sharing patterns that are currently prohibitive, due in part to the inherent restrictions of multiple address spaces, and in part to Unix programming style. We have designed and implemented an Opal prototype using the Mach 3.0 microkernel as a base. Our implementation demonstrates how a single-address-space structure can be supported alongside of other environments on a modern microkernel operating system, using modern wide-address architectures. This article justifies the Opal model and its goals for sharing and protection, presents the system and its abstractions, describes the prototype implementation, and reports experience with integrated applications.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM Transactions on Computer Systems",
  keywords = "design; experimentation; measurement; performance",
  subject = "{\bf D.4.2} Software, OPERATING SYSTEMS, Storage Management. {\bf C.1.3} Computer Systems Organization, PROCESSOR ARCHITECTURES, Other Architecture Styles, Capability architectures**. {\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Modules, packages. {\bf D.4.4} Software, OPERATING SYSTEMS, Communications Management. {\bf D.4.6} Software, OPERATING SYSTEMS, Security and Protection, Access controls. {\bf D.4.6} Software, OPERATING SYSTEMS, Security and Protection, Information flow controls. {\bf D.4.7} Software, OPERATING SYSTEMS, Organization and Design. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance, Measurements. {\bf E.1} Data, DATA STRUCTURES. {\bf E.2} Data, DATA STORAGE REPRESENTATIONS.",
}

@Article{Chaudhry:1994:CMP,
  author = "Ghulam Chaudhry and Xuechang Li",
  title = "A case for the multithreaded processor architecture",
  journal = j-COMP-ARCH-NEWS,
  volume = "22",
  number = "4",
  pages = "55--59",
  month = sep,
  year = "1994",
  CODEN = "CANED2",
  ISSN = "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:12 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

@Article{Dennis:1994:MMP,
  author = "Jack B. Dennis",
  title = "Machines and Models for Parallel Computing",
  journal = j-INT-J-PARALLEL-PROG,
  volume = "22",
  number = "1",
  pages = "47--77",
  month = feb,
  year = "1994",
  CODEN = "IJPPE5",
  ISSN = "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L = "0885-7458",
  bibdate = "Sat Apr 26 11:04:14 MDT 1997",
  bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=22&issue=1; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440 (Multiprocessor systems and techniques); C6110 (Systems analysis and programming); C6150N (Distributed systems)",
  corpsource = "Lab. for Comput.
Sci., MIT, Cambridge, MA, USA",
  fjournal = "International Journal of Parallel Programming",
  keywords = "concurrency control; dataflow principles; functional programming; general semantic model; memory latency; microprocessors; modular software construction; multithreading; parallel computation; parallel computing models; parallel machines; parallel programming; processor architecture; processor design; RISC; shared memory systems; shared-memory model; superpipelined; superscalar; synchronization",
  treatment = "P Practical",
}

@Book{Dorfman:1994:EMO,
  author = "Len Dorfman and Marc J. Neuberger",
  title = "Effective multithreading in {OS/2}",
  publisher = pub-MCGRAW-HILL,
  address = pub-MCGRAW-HILL:adr,
  pages = "xii + 288",
  year = "1994",
  ISBN = "0-07-017841-0 (paperback)",
  ISBN-13 = "978-0-07-017841-0 (paperback)",
  LCCN = "QA76.76.O63D6694 1994",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price = "US\$34.95",
  acknowledgement = ack-nhfb,
  annote = "System requirements for computer disk: IBM-compatible PC; 4MB RAM (8MB recommended); OS/2; C compiler such as IBM CSet++ or Borland C++ for OS/2; high-density floppy disk drive; hard disk with 3.1MB free space.",
  keywords = "Microcomputers -- Operating systems; Operating systems (Computers); OS/2 (Computer file)",
}

@TechReport{Dubey:1994:APM,
  author = "Pradeep Dubey and Arvind Krishna and Michael J. Flynn",
  title = "Analytical performance modeling for a spectrum of multithreaded machines",
  type = "Research report",
  number = "RC 19549 (85007)",
  institution = "IBM T. J. Watson Research Center",
  address = "Yorktown Heights, NY, USA",
  pages = "27",
  day = "3",
  month = may,
  year = "1994",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "The throughput of pipelined processors suffers due to delays associated with instruction dependencies and memory latencies. Multithreaded architectures try to tolerate such delays by sharing the pipeline with independent instruction threads. This paper proposes a comprehensive analytical framework to quantitate the performance potential of a wide spectrum of multithreaded machines ranging from those that are capable of switching threads every cycle to those that switch threads only on long inter-instruction latencies. For machines in the former category, the proposed analytic model provides an exact solution for pipeline utilization which is significantly better than lower and upper bounds obtainable from simple approximation techniques. Unlike previously published analytic models of such systems, the Markov model developed here accepts a general distribution for the interlock delays with multiple latencies. For machines in the latter category, the paper provides an approximate analytic model which is simpler than previously published analytic models. The models have been verified using previously published analytical and simulation-based results. As compared to the simulation alternative, the models provide a much quicker estimate of pipeline utilization as a function of a number of threads.",
  acknowledgement = ack-nhfb,
  keywords = "Computer architecture",
}

@MastersThesis{Gallagher:1994:PLM,
  author = "William Lynn Gallagher",
  title = "Performance limitations of the {MTS} multithreaded architecture",
  type = "Thesis (M.S. in Engineering)",
  school = "University of Texas at Austin",
  address = "Austin, TX, USA",
  pages = "xiv + 101",
  year = "1994",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Gerlhof:1994:MTA,
  author = "C. A. Gerlhof and A.
Kemper",
  title = "A Multi-Threaded Architecture for Prefetching in Object Bases",
  journal = j-LECT-NOTES-COMP-SCI,
  volume = "779",
  pages = "351--364",
  year = "1994",
  CODEN = "LNCSD9",
  ISSN = "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L = "0302-9743",
  bibdate = "Wed Sep 15 18:44:20 MDT 1999",
  bibsource = "ftp://ftp.math.utah.edu/pub/tex/bib/lncs1994.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "Lecture Notes in Computer Science",
  keywords = "database technology; EDBT; extending database technology",
}

@Article{Gibson:1994:CMC,
  author = "Ken Gibson",
  title = "A {C++} Multitasking Class Library",
  journal = j-DDJ,
  volume = "19",
  number = "5",
  pages = "28, 30, 32, 34, 96--98",
  month = may,
  year = "1994",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Tue Sep 03 09:15:49 1996",
  bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database",
  abstract = "Multithreaded applications that concurrently execute more than one section of code aren't directly supported by languages such as C++. Ken presents a C++ multitasking class library for MS-DOS that lets you implement a program as a set of concurrent threads.",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6110P (Parallel programming)",
  fjournal = "Dr. Dobb's Journal of Software Tools",
  keywords = "C++ multitasking class library; Concurrent execution; DOS; Embedded processors; Interthread communications; Locator program; Microsoft C++ 7.0; Multithreaded applications; Portability; Processor initialization; Queue class; Real-time device control; Real-time executive; ROMable image; Scheduler object; Semaphore class; Simulation; Thread class; Thread synchronization",
  thesaurus = "C listings; Multiprogramming; Object-oriented programming; Public domain software; Scheduling; Subroutines",
}

@Article{Giloi:1994:PSA,
  author = "Wolfgang K.
Giloi",
  title = "Parallel supercomputer architectures and their programming models",
  journal = j-PARALLEL-COMPUTING,
  volume = "20",
  number = "10--11",
  pages = "1443--1470",
  day = "3",
  month = nov,
  year = "1994",
  CODEN = "PACOEJ",
  ISSN = "0167-8191",
  ISSN-L = "0167-8191",
  bibdate = "Fri Aug 6 10:13:51 MDT 1999",
  bibsource = "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1994&volume=20&issue=10-11; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1994&volume=20&issue=10-11&aid=907",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440 (Multiprocessing systems); C5470 (Performance evaluation and testing)",
  corpsource = "FIRST, GMD Res. Inst. for Comput. Arch. and Software Eng., Berlin, Germany",
  fjournal = "Parallel Computing",
  keywords = "*T; abstract machine; architectures; DASH; distributed memory; distributed memory systems; distributed shared; hardware architecture; latency hiding; latency minimization; MANNA; memory architectures; message passing; message passing architectures; multi-threaded architectures; parallel; parallel supercomputer architectures; performance; performance evaluation; physically shared memory systems; programming models; scalability; shared memory architectures; shared memory systems; systems; taxonomy; virtual",
  treatment = "P Practical",
}

@Manual{Haines:1994:DCT,
  author = "Matthew Haines and David Cronk and Piyush Mehrotra",
  title = "On the design of {Chant}: a talking threads package: final report",
  number = "194903",
  publisher = pub-NTIS,
  address = pub-NTIS:adr,
  pages = "??",
  year = "1994",
  LCCN = "NAS 1.26:194903 Govt Pubs",
  bibdate = "Fri May 10 12:18:17 MDT 2002",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note = "Shipping list number 94-0861-M.",
  series = "NASA contractor report",
  acknowledgement = ack-nhfb,
  keywords = "message
processing; messages",
}

@Article{Halstead:1994:PCR,
  author = "Burt Halstead and David Callahan and Jack Dennis and R. S. Nikhil and Vivek Sarkar",
  title = "Programming, compilation, and resource management issues for multithreading (panel session {II})",
  journal = j-COMP-ARCH-NEWS,
  volume = "22",
  number = "1",
  pages = "19--33",
  month = mar,
  year = "1994",
  CODEN = "CANED2",
  ISSN = "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:34 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

@InProceedings{Holm:1994:CSP,
  author = "J. Holm and A. Lain and P. Banerjee",
  title = "Compilation of Scientific Programs into Multithreaded and Message Driven Computation",
  crossref = "IEEE:1994:PSH",
  pages = "518--525",
  year = "1994",
  bibdate = "Mon Aug 26 10:38:41 MDT 1996",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Iannucci:1994:AII,
  author = "Robert Iannucci and Anant Agarwal and Bill Dally and Anoop Gupta and Greg Papadopoulos and Burton Smith",
  title = "Architectural and implementation issues for multithreading (panel session {I})",
  journal = j-COMP-ARCH-NEWS,
  volume = "22",
  number = "1",
  pages = "3--18",
  month = mar,
  year = "1994",
  CODEN = "CANED2",
  ISSN = "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:34 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

@Book{Iannucci:1994:MCA,
  editor = "Robert A. Iannucci and others",
  title = "Multithreaded computer architecture: a summary of the state of the art",
  volume = "SECS 0281",
  publisher = pub-KLUWER,
  address = pub-KLUWER:adr,
  pages = "xvi + 400",
  year = "1994",
  ISBN = "0-7923-9477-1",
  ISBN-13 = "978-0-7923-9477-8",
  LCCN = "QA76.9.A73 M85 1994",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series = "The Kluwer international series in engineering and computer science",
  acknowledgement = ack-nhfb,
  keywords = "computer architecture; Computer architecture; Computers -- Design",
}

@Article{Kanalakis:1994:ET,
  author = "John M. {Kanalakis, Jr.}",
  title = "Examining {OS/2} 2.1 threads",
  journal = j-DDJ,
  volume = "19",
  number = "1",
  pages = "74, 76, 78--79, 96",
  month = jan,
  year = "1994",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Tue Sep 10 08:52:50 MDT 1996",
  bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database",
  abstract = "The OS/2 2.1 multitasking model is based on the execution of threads, making it possible for many sections of a single process to execute simultaneously. John examines OS/2's thread architecture, specifically, the scheduling process.",
  acknowledgement = ack-nhfb,
  classification = "C6150J (Operating systems)",
  fjournal = "Dr. Dobb's Journal of Software Tools",
  keywords = "Bias implementation; OS/2 2.1 multitasking model; Round robin scheduling; Scheduling process; Thread architecture; Threads",
  thesaurus = "Multiprogramming; Operating systems [computers]; Scheduling",
}

@Article{Kelly:1994:MBC,
  author = "Michael Kelly",
  title = "Multithreading with {OS/2} and {Borland C++}",
  journal = j-CCCUJ,
  volume = "12",
  number = "8",
  pages = "67--??",
  month = aug,
  year = "1994",
  CODEN = "CCUJEX",
  ISSN = "1075-2838",
  bibdate = "Fri Aug 30 16:52:23 MDT 1996",
  bibsource = "http://www.cuj.com/cbklist.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "C/C++ Users Journal",
}

@PhdThesis{Kim:1994:FPF,
  author = "Chinhyun Kim",
  title = "Functional programming and fine-grain multithreading for high-performance parallel computing",
  type = "Thesis (Ph.D.)",
  school = "University of Southern California",
  address = "Los Angeles, CA, USA",
  pages = "xv + 150",
  year = "1994",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Krieger:1994:ASF,
  author = "Orran Krieger and Michael Stumm and Ron Unrau",
  title = "The {Alloc Stream Facility}: {A} Redesign of Application-Level Stream {I/O}",
  journal = j-COMPUTER,
  volume = "27",
  number = "3",
  pages = "75--82",
  month = mar,
  year = "1994",
  CODEN = "CPTRB4",
  ISSN = "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L = "0018-9162",
  bibdate = "Mon Feb 3 07:28:57 MST 1997",
  bibsource = "Compendex database; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "Many stdio and even Unix I/O applications run faster when linked to the ASF application-level library. Using the Alloc Stream Interface improves performance even more.",
  acknowledgement = ack-nhfb,
  affiliation = "Dept. of Electr. and Comput.
Eng., Toronto Univ., Ont., Canada",
  affiliationaddress = "Toronto, Can",
  classification = "723; C6110J (Object-oriented programming); C6110P (Parallel programming); C6150J (Operating systems)",
  fjournal = "Computer",
  journalabr = "Computer",
  keywords = "Alloc Stream Facility; Alloc stream interface; Application-level I/O facility; Application-level library; Application-level stream I/O; ASF; C stdio library; C++ stream I/O; Computer operating systems; Concurrency; I/O-intensive applications; Input output programs; Mapped files; Multithreaded applications; Object-oriented structure; Parallel applications; Parallel systems; Performance improvements; Popular I/O interfaces; Sequential byte stream; Standard Unix systems; Stdio; System behavior; UNIX",
  thesaurus = "Input-output programs; Object-oriented methods; Parallel programming; Unix",
}

@Article{Laudon:1994:IMT,
  author = "James Laudon and Anoop Gupta and Mark Horowitz",
  title = "Interleaving: a multithreading technique targeting multiprocessors and workstations",
  journal = j-SIGPLAN,
  volume = "29",
  number = "11",
  pages = "308--318",
  month = nov,
  year = "1994",
  CODEN = "SINODQ",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Sun Dec 14 09:16:57 MST 2003",
  bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note = "Co-published in {\em Operating Systems Review}, {\bf 28}(5).",
  URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p308-laudon/",
  abstract = "There is an increasing trend to use commodity microprocessors as the compute engines in large-scale multiprocessors. However, given that the majority of the microprocessors are sold in the workstation market, not in the multiprocessor market, it is only natural that architectural features that benefit only multiprocessors are less likely to be adopted in commodity microprocessors. In this paper, we explore multiple-context processors, an architectural technique proposed to hide the large memory latency in multiprocessors. We show that while current multiple-context designs work reasonably well for multiprocessors, they are ineffective in hiding the much shorter uniprocessor latencies using the limited parallelism found in workstation environments. We propose an alternative design that combines the best features of two existing approaches, and present simulation results that show it yields better performance for both multiprogrammed workloads on a workstation and parallel applications on a multiprocessor. By addressing the needs of the workstation environment, our proposal makes multiple contexts more attractive for commodity microprocessors.",
  acknowledgement = ack-nhfb,
  classification = "C5430 (Microcomputers); C5440 (Multiprocessing systems); C6120 (File organisation); C6150J (Operating systems)",
  conflocation = "San Jose, CA, USA; 4-7 Oct. 1994",
  conftitle = "Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI)",
  corpsource = "Comput. Syst. Lab., Stanford Univ., CA, USA",
  fjournal = "ACM SIGPLAN Notices",
  keywords = "architectural features; commodity microprocessors; compute engines; design; interleaved storage; interleaving; large memory latency; large-scale multiprocessors; measurement; multiple-context designs; multiple-context processors; multiprocessing systems; multiprogrammed workloads; multiprogramming; multithreading technique; parallel applications; parallel uniprocessor latencies; performance; theory; uniprocessor latencies; workstations",
  sponsororg = "ACM; IEEE Comput. Soc",
  subject = "{\bf C.5.3} Computer Systems Organization, COMPUTER SYSTEM IMPLEMENTATION, Microcomputers. {\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS.",
  treatment = "P Practical",
}

@Article{Launchbury:1994:LFS,
  author = "John Launchbury and Simon L. {Peyton Jones}",
  title = "Lazy Functional State Threads",
  journal = j-SIGPLAN,
  volume = "29",
  number = "6",
  pages = "24--35",
  month = jun,
  year = "1994",
  CODEN = "SINODQ",
  ISBN = "0-89791-598-4",
  ISBN-13 = "978-0-89791-598-4",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Sun Dec 14 09:16:51 MST 2003",
  bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/contents/proceedings/pldi/178243/index.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/178243/p24-launchbury/",
  abstract = "Some algorithms make critical internal use of updatable state, even though their external specification is purely functional. Based on earlier work on monads, we present a way of securely encapsulating stateful computations that manipulate multiple, named, mutable objects, in the context of a non-strict, purely-functional language. The security of the encapsulation is assured by the type system, using parametricity.
Intriguingly, this parametricity requires the provision of a (single) constant with a rank-2 polymorphic type.",
  acknowledgement = ack-nhfb,
  annote = "Published as part of the Proceedings of PLDI'94.",
  classification = "C4240 (Programming and algorithm theory); C6110 (Systems analysis and programming); C6140D (High level languages)",
  conflocation = "Orlando, FL, USA; 20-24 June 1994",
  conftitle = "ACM SIGPLAN '94 Conference on Programming Language Design and Implementation (PLDI)",
  corpsource = "Glasgow Univ., UK",
  fjournal = "ACM SIGPLAN Notices",
  keywords = "algorithms; encapsulation; external specification; functional language; functional programming; high level languages; languages; lazy functional state threads; monads; mutable objects; nonstrict purely-functional language; parametricity; rank-2 polymorphic type; security; specification; stateful computations; type system; type theory; updatable state",
  sponsororg = "ACM",
  subject = "{\bf D.3.3} Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Procedures, functions, and subroutines. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Applicative (functional) languages. {\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Type structure. {\bf F.4.1} Theory of Computation, MATHEMATICAL LOGIC AND FORMAL LANGUAGES, Mathematical Logic, Lambda calculus and related systems.",
  treatment = "P Practical; T Theoretical or Mathematical",
}

@Article{Lee:1994:DAM,
  author = "Ben Lee and A. R. Hurson",
  title = "Dataflow Architectures and Multithreading",
  journal = j-COMPUTER,
  volume = "27",
  number = "8",
  pages = "27--39",
  month = aug,
  year = "1994",
  CODEN = "CPTRB4",
  ISSN = "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L = "0018-9162",
  bibdate = "Mon Feb 3 07:28:57 MST 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "Contrary to initial expectations, implementing dataflow computers has presented a monumental challenge. Now, however, multithreading offers a viable alternative for building hybrid architectures that exploit parallelism.",
  acknowledgement = ack-nhfb,
  affiliation = "Dept. of Electr. and Comput. Eng., Oregon State Univ., Corvallis, OR, USA",
  classification = "C5220P (Parallel architecture); C5440 (Multiprocessing systems)",
  fjournal = "Computer",
  keywords = "Compilers; Concurrency; Data dependencies; Dataflow architectures; Dataflow machines; Functional semantics; Hybrid architectures; Id; Imperative languages; Multithreading; Parallel functional languages; Parallel machines; Parallelism; Programmability; Semantics; Side effects; SISAL; Source code; Streams and Iterations in a Single Assignment Language; Syntax; Threaded Abstract Machine",
  thesaurus = "Parallel architectures; Parallel processing",
}

@Article{Liedtke:1994:SNIb,
  author = "Jochen Liedtke",
  title = "A short note on implementing thread exclusiveness and address space locking",
  journal = j-OPER-SYS-REV,
  volume = "28",
  number = "3",
  pages = "38--42",
  month = jul,
  year = "1994",
  CODEN = "OSRED8",
  ISSN = "0163-5980",
  ISSN-L = "0163-5980",
  bibdate = "Sat Aug 26 08:55:46 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "Operating Systems Review",
}

@MastersThesis{Lu:1994:MPM,
  author = "David Ta-Chang Lu",
  title = "A multithreaded processor for massively parallel architectures",
  type = "Thesis (M.S.)",
  school = "University of California, Riverside",
  address = "Riverside, CA, USA",
  pages = "vii + 42",
  year = "1994",
  LCCN = "QA76.58 .L88 1994",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "computer algorithms; Computer algorithms; computer architecture; Computer architecture; dissertations; dissertations, academic -- UCR -- computer science; parallel computers; Parallel computers; Parallel
processing (Electronic computers); parallel processing (electronic computers); Science -- Dissertations; University of California, Riverside. -- Dept. of Computer; University of California, Riverside. Dept. of Computer Science", } @Article{Marinescu:1994:HLC, author = "Dan C. Marinescu and John R. Rice", title = "On High Level Characterization of Parallelism", journal = j-J-PAR-DIST-COMP, volume = "20", number = "1", pages = "107--113", month = jan, year = "1994", CODEN = "JPDCER", DOI = "http://dx.doi.org/10.1006/jpdc.1994.1011", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:18:53 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1994.1011/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1994.1011/production/pdf", acknowledgement = ack-nhfb, classification = "C4240P (Parallel programming and algorithm theory); C5220P (Parallel architecture); C5470 (Performance evaluation and testing)", corpsource = "Dept. of Comput. 
Sci., Purdue Univ., West Lafayette, IN, USA", fjournal = "Journal of Parallel and Distributed Computing", keywords = "communication complexity; load balancing; massively parallel; parallel architectures; parallel execution; parallelism; performance analysis; performance evaluation; speedup; systems; threads of control", treatment = "T Theoretical or Mathematical", } @Book{MixSoftware:1994:UMC, author = "{Mix Software, Inc}", title = "Using {Multi-C}: a portable multithreaded {C} programming library", publisher = pub-PHPTR, address = pub-PHPTR:adr, pages = "vi + 257", year = "1994", ISBN = "0-13-606195-8", ISBN-13 = "978-0-13-606195-3", LCCN = "QA76.73.C15 U85 1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "System requirements for computer disk: IBM-compatible PC; DOS; Mix, Borland, or Microsoft-compatible C/C++ compilers.", acknowledgement = ack-nhfb, annote = "System requirements for computer disk: IBM-compatible PC; DOS; Mix, Borland, or Microsoft-compatible C/C++ compilers.", keywords = "C (computer program language); C (Computer program language); Microcomputers -- Programming languages", } @Article{Mukherjee:1994:MII, author = "Bodhisattwa Mukherjee and Greg Eisenhauer and Kaushik Ghosh", title = "A machine independent interface for lightweight threads", journal = j-OPER-SYS-REV, volume = "28", number = "1", pages = "33--47", month = jan, year = "1994", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:36 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Norwood:1994:SMP, author = "John Norwood and Shankar Vaidyanathan", title = "Symmetric Multiprocessing for {PCs}", journal = j-DDJ, volume = "19", number = "1", pages = "80, 82--85, 98--99", month = jan, year = "1994", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate 
= "Tue Sep 03 09:15:46 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Our authors focus on multithreaded application development for single-processor and symmetric-multiprocessor machines under Windows NT. In doing so, they present Fortran interface statements for the Win32 console API and a black-box solution for calling 32-bit DLLs from 16-bit applications under NT.", acknowledgement = ack-nhfb, classification = "C6150J (Operating systems); C6150N (Distributed systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "16-Bit applications; 32-Bit DLLs; Black-box solution; Fortran interface statements; Multithreaded application; Single processor machines; Symmetric-multiprocessor machines; Win32 console API; Windows NT", thesaurus = "C listings; Multiprocessing programs; Multiprogramming", } @InProceedings{Ramsey:1994:CTB, author = "Norman Ramsey", title = "Correctness of trap-based breakpoint implementations", crossref = "ACM:1994:CRP", pages = "15--24", year = "1994", bibdate = "Mon May 3 12:50:22 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/174675/p15-ramsey/", abstract = "It is common for debuggers to implement breakpoints by a combination of planting traps and single stepping. When the target program contains multiple threads of execution, a debugger that is not carefully implemented may miss breakpoints. This paper gives a formal model of a breakpoint in a two-threaded program. The model describes correct and incorrect breakpoint implementations. Automatic search of the model's state space shows that the correct implementation does not miss a breakpoint. 
The results apply even to debuggers like dbx and gdb, which are apparently for single-threaded programs; when the user evaluates an expression containing function calls, the debugger executes the call in the target address space, in effect creating a new thread.", acknowledgement = ack-nhfb, keywords = "languages; measurement; theory", subject = "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing and Debugging. {\bf F.3.1} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Specifying and Verifying and Reasoning about Programs.", } @Article{Rodley:1994:UIC, author = "John Rodley", title = "{OS/2} and {UnixWare} Interprocess Communication", journal = j-DDJ, volume = "19", number = "5", pages = "78--82, 84, 107--109", month = may, year = "1994", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:49 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Interprocess communication isn't portable between IBM's OS/2 2.1 and Novell's UnixWare 1.1. But even though the implementation details differ greatly, the two systems do share ways of thinking about IPC. John looks at IPC under OS/2 and UnixWare to see what common ground exists.", acknowledgement = ack-nhfb, classification = "C6150J (Operating systems); C6150N (Distributed systems)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "APIs; Applications programming; Functionality; IBM OS/2 2.1; Implementation details; Independent processes; Interprocess communication; IPC models; Multitasking operating systems; Novell UnixWare 1.1; Threads", thesaurus = "C listings; Multiprocessing systems; Operating systems [computers]; Unix", } @InProceedings{Shee:1994:DMA, author = "Jang Chung Shee and Chao Chin Wu and Lin Wen You and Cheng Chen", title = "Design of a multithread architecture and its parallel simulation and evaluation environment", crossref = "Anonymous:1994:ICS", pages = "69--76 (vol. 
1)", year = "1994", bibdate = "Sun Dec 22 10:19:23 MST 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, affiliation = "Inst. of Comput. Sci. and Inf. Eng., Nat. Chiao Tung Univ., Hsinchu, Taiwan", classification = "C5220P (Parallel architecture); C6115 (Programming support); C6185 (Simulation techniques)", keywords = "Context switch; Integrated multiprocessing simulation environment; Multithread architecture; Parallel simulation; Parallel simulation and evaluation environment; Parallel Virtual Machine; SUN SPARC workstations; Thread-related instructions", thesaurus = "Digital simulation; Parallel architectures; Programming environments", } @TechReport{Squillante:1994:AMP, author = "Mark S. Squillante", title = "Analytic modeling of processor utilization in multithreaded processor architectures", type = "Research report", number = "RC 19543 (84999)", institution = "IBM T. J. Watson Research Center", address = "Yorktown Heights, NY, USA", pages = "9", month = apr, year = "1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper, we develop an analytic model of processor utilization in multithreaded processor architectures that supports both serial and parallel processing of memory requests. The system is modeled as a finite, continuous-time Markov chain whose solution can be obtained efficiently. Although it applies more generally, our modeling approach supports an important class of probability distributions that can be used to approximate the distributions of interest with sufficient accuracy in most practical cases. This results in an efficient and accurate model across a wide variety of system environments.", acknowledgement = ack-nhfb, keywords = "Multiprocessors", } @Article{Tetewsky:1994:GDR, author = "Avram K. 
Tetewsky", title = "{GUI} Development for Real-Time Applications", journal = j-DDJ, volume = "19", number = "6", pages = "28, 30, 32, 36, 38, 40--41", month = jun, year = "1994", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:49 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Although they take radically different approaches, both ControlCalc and LabView are designed for building GUI-based, real-time control applications.", acknowledgement = ack-nhfb, affiliation = "Draper (C.S.) Lab., Cambridge, MA, USA", classification = "C6115 (Programming support); C6130B (Graphics techniques); C6180G (Graphical user interfaces); C7420 (Control engineering)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "386/OS-9000; 680X0/OS9; ControlCalc Version 1.78; G-Windows 2.3 windowing package; GUI development; LabView 3.0; Multipage-spreadsheet paradigm; Multithreaded program; National Instruments; OS-9000 1.3; PC-based tools; Rapid prototyping; Real-time control application; RTWare; Windows data-flow driven software", thesaurus = "Computerised control; Graphical user interfaces; Real-time systems; Software tools", } @Article{Thekkath:1994:EMH, author = "Radhika Thekkath and Susan J. Eggers", title = "The effectiveness of multiple hardware contexts", journal = j-SIGPLAN, volume = "29", number = "11", pages = "328--337", month = nov, year = "1994", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:57 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/195473/p328-thekkath/", abstract = "Multithreaded processors are used to tolerate long memory latencies. 
By executing threads loaded in multiple hardware contexts, an otherwise idle processor can keep busy, thus increasing its utilization. However, the larger size of a multi-thread working set can have a negative effect on cache conflict misses. In this paper we evaluate the two phenomena together, examining their combined effect on execution time. The usefulness of multiple hardware contexts depends on: program data locality, cache organization and degree of multiprocessing. Multiple hardware contexts are most effective on programs that have been optimized for data locality. For these programs, execution time dropped with increasing contexts, over widely varying architectures. With unoptimized applications, multiple contexts had limited value. The best performance was seen with only two contexts, and only on uniprocessors and small multiprocessors. The behavior of the unoptimized applications changed more noticeably with variations in cache associativity and cache hierarchy, unlike the optimized programs. As a mechanism for exploiting program parallelism, an additional processor is clearly better than another context. However, there were many configurations for which the addition of a few hardware contexts brought as much or greater performance than a larger multiprocessor with fewer than the optimal number of contexts.", acknowledgement = ack-nhfb, classification = "C5320G (Semiconductor storage); C5440 (Multiprocessing systems); C6110P (Parallel programming); C6120 (File organisation); C6150N (Distributed systems software)", conflocation = "San Jose, CA, USA; 4-7 Oct. 1994", conftitle = "Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI)", corpsource = "Dept. of Comput. Sci. 
and Eng., Washington Univ., Seattle, WA, USA", fjournal = "ACM SIGPLAN Notices", keywords = "cache associativity; cache conflict misses; cache hierarchy; cache organization; cache storage; data locality; design; long; long memory latencies; measurement; multi-thread working set; multiple hardware contexts; multiprocessing; multiprocessing systems; multithreaded processors; parallel programming; performance; program data locality; program parallelism; storage management; theory; unoptimized applications", sponsororg = "ACM; IEEE Comput. Soc", subject = "{\bf C.5.3} Computer Systems Organization, COMPUTER SYSTEM IMPLEMENTATION, Microcomputers. {\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS.", treatment = "P Practical", } @Article{Thekkath:1994:ISB, author = "R. Thekkath and S. J. Eggers", title = "Impact of sharing-based thread placement on multithreaded architectures", journal = j-COMP-ARCH-NEWS, volume = "22", number = "2", pages = "176--186", month = apr, year = "1994", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:40 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @MastersThesis{Wang:1994:MAD, author = "Xiaobao Wang", title = "Multithreaded architecture: design and performance analysis", volume = "3016", type = "Thesis (M. 
S.)", school = "Department of Electrical Engineering, University of Hawaii at Manoa", address = "Manoa, HI, USA", pages = "59", year = "1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Theses for the degree of Master of Science (University of Hawaii at Manoa)", acknowledgement = ack-nhfb, keywords = "Computer architecture; Multiprocessors", } @Article{Williams:1994:NST, author = "Al Williams", title = "{NT-Style} Threads For {MS-DOS}", journal = j-DDJ, volume = "19", number = "2", pages = "74, 76--77", month = feb, year = "1994", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:15:47 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Al uses Phar Lap's TNT 386/DOS-Extender to implement NT-style threads in a DOS program that removes a directory tree. Instead of recursing down the tree, the program (which works with NT and TNT) processes directories in parallel.", acknowledgement = ack-nhfb, classification = "C6110 (Systems analysis and programming); C6150C (Compilers, interpreters and other processors); C6150J (Operating systems)", fjournal = "Dr. 
Dobb's Journal of Software Tools", keywords = "BIOS interrupts; C library functions; Compiling; DOS; Memory allocation; MS-DOS; Multiple threads; Multithreading; Phar Lap; Specification; TNT 386/DOS-Extender; Win32 programming API; Win32-base API; Windows; Windows NT", thesaurus = "Interrupts; Multiprogramming; Operating systems [computers]; Program compilers", } @Article{Anonymous:1995:HUW, author = "Anonymous", title = "{{HP-UX} 10.0 will be unveiled this week, with newly tuned kernel and {I}\slash {O} paths, plus a multithreaded {NFS} implementation}", journal = j-OPEN-SYSTEMS-TODAY, volume = "168", pages = "34--??", month = feb, year = "1995", ISSN = "1061-0839", bibdate = "Fri Jan 26 17:24:01 MST 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Open Systems Today", } @Article{Anonymous:1995:HWB, author = "Anonymous", title = "{{HP-UX} 10.0 will be unveiled this week, with newly tuned kernel and {I}\slash {O} paths, plus a multithreaded {NFS} implementation}", journal = j-OPEN-SYSTEMS-TODAY, volume = "168", pages = "34--??", month = feb, year = "1995", ISSN = "1061-0839", bibdate = "Fri Jan 26 17:24:01 MST 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Open Systems Today", } @Article{Baker:1995:GTP, author = "Mary Baker", title = "Going threadbare (panel session): sense or sedition? a debate on the threads abstraction", journal = j-OPER-SYS-REV, volume = "29", number = "5", pages = "227--227", month = dec, year = "1995", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:55 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Baker:1995:UOV, author = "Henry G. 
Baker", title = "``Use-once'' variables and linear objects: storage management, reflection and multi-threading", journal = j-SIGPLAN, volume = "30", number = "1", pages = "45--52", month = jan, year = "1995", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:16:59 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Book{Bic:1995:ATD, author = "Lubomir Bic and Guang R. Gao and Jean-Luc Gaudiot", title = "Advanced topics in dataflow computing and multithreading", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "x + 450", year = "1995", ISBN = "0-8186-6541-6, 0-8186-6540-8 (paperback)", ISBN-13 = "978-0-8186-6541-7, 978-0-8186-6540-0 (paperback)", LCCN = "QA76.9.A73A356 1994", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Data structures (Computer science); Parallel processing (Electronic computers)", } @Article{Blumofe:1995:CEM, author = "Robert D. Blumofe and Christopher F. Joerg and Bradley C. Kuszmaul and Charles E. Leiserson and Keith H. Randall and Yuli Zhou", title = "{Cilk}: an efficient multithreaded runtime system", journal = j-SIGPLAN, volume = "30", number = "8", pages = "207--216", month = aug, year = "1995", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:08 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Cilk (pronounced `silk') is a C-based runtime system for multithreaded parallel programming. In this paper, we document the efficiency of the Cilk work-stealing scheduler, both empirically and analytically. 
We show that on real and synthetic applications, the `work' and `critical path' of a Cilk computation can be used to accurately model performance. Consequently, a Cilk programmer can focus on reducing the work and critical path of his computation, insulated from load balancing and other runtime scheduling issues. We also prove that for the class of `fully strict' (well-structured) programs, the Cilk scheduler achieves space, time, and communication bounds all within a constant factor of optimal. The Cilk runtime system currently runs on the Connection Machine CM5 massively parallel processor (MPP), the Intel Paragon MPP, the Silicon Graphics Power Challenge symmetric multiprocessor (SMP), and the MIT Phish network of workstations. Applications written in Cilk include protein folding, graphic rendering, backtrack searching, and the *Socrates chess program, which won third prize in the 1994 ACM International Computer Chess Championship.", acknowledgement = ack-nhfb, affiliation = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA", classification = "C6110P (Parallel programming); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems software)", fjournal = "ACM SIGPLAN Notices", keywords = "*Socrates chess program; Accurate performance modelling; Backtrack searching; C-based multithreaded runtime system; Cilk; Communication bounds; Connection Machine CM5; Critical path; Efficiency; Fully strict programs; Graphic rendering; Intel Paragon; Load balancing; MIT Phish workstation network; Parallel programming; Protein folding; Runtime scheduling issues; Silicon Graphics Power Challenge; Space bounds; Time bounds; Well-structured programs; Work-stealing scheduler", thesaurus = "Backtracking; Biology computing; Molecular configurations; Parallel programming; Processor scheduling; Program interpreters; Proteins; Rendering [computer graphics]", } @PhdThesis{Blumofe:1995:EMP, author = "Robert D. 
(Robert David) Blumofe", title = "Executing multithreaded programs efficiently", type = "Thesis (Ph.D.)", school = "Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "145", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Cejtin:1995:HOD, author = "Henry Cejtin and Suresh Jagannathan and Richard Kelsey", title = "Higher-Order Distributed Objects", journal = j-TOPLAS, volume = "17", number = "5", pages = "704--739", month = sep, year = "1995", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Jan 5 07:58:42 MST 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/213986.html", abstract = "We describe a distributed implementation of Scheme that permits efficient transmission of higher-order objects such as closures and continuations. The integration of distributed communication facilities within a higher-order programming language engenders a number of new abstractions and paradigms for distributed computing. Among these are user-specified load-balancing and migration policies for threads, incrementally linked distributed computations, and parameterized client-server applications. To our knowledge, this is the first distributed dialect of Scheme (or a related language) that addresses lightweight communication abstractions for higher-order objects.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", keywords = "experimentation; languages", subject = "{\bf D.1.3}: Software, PROGRAMMING TECHNIQUES, Concurrent Programming, Distributed programming. {\bf D.3.2}: Software, PROGRAMMING LANGUAGES, Language Classifications, Applicative languages. 
{\bf D.3.2}: Software, PROGRAMMING LANGUAGES, Language Classifications, Extensible languages. {\bf D.3.3}: Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Concurrent programming structures. {\bf D.3.2}: Software, PROGRAMMING LANGUAGES, Language Classifications, SCHEME.", } @Article{Chang:1995:CSM, author = "C.-Y. Chang and J.-P. Sheu", title = "Compile-time scheduling of multithread with data localities on multiple vector processors", journal = j-CPE, volume = "7", number = "5", pages = "349--369", month = aug, year = "1995", CODEN = "CPEXEI", ISSN = "1040-3108", ISSN-L = "1040-3108", bibdate = "Tue Sep 7 05:40:19 MDT 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/cpe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency, practice and experience", } @Article{Chang:1995:CTS, author = "C.-Y. Chang and J.-P. Sheu", title = "Compile-time scheduling of multithread with data localities on multiple vector processors", journal = j-CPE, volume = "7", number = "5", pages = "349--369", month = aug, year = "1995", CODEN = "CPEXEI", ISSN = "1040-3108", ISSN-L = "1040-3108", bibdate = "Tue Sep 7 05:40:19 MDT 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency, practice and experience", } @Article{Chong:1995:PAF, author = "Yong-Kim Chong and Kai Hwang", title = "Performance Analysis of Four Memory Consistency Models for Multithreaded Multiprocessors", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "6", number = "10", pages = "1085--1099", month = oct, year = "1995", CODEN = "ITDSEO", DOI = "http://dx.doi.org/10.1109/71.473517", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Nov 6 12:31:15 MST 1998", bibsource = "Compendex database; http://www.computer.org/tpds/td1995/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = 
"http://www.computer.org/tpds/td1995/l1085abs.htm", acknowledgement = ack-nhfb, affiliation = "Nanyang Technological Univ", affiliationaddress = "Singapore, Singapore", classification = "716.1; 722.1; 722.3; 722.4; 921.4; 922.1; C1160 (Combinatorial mathematics); C5440 (Multiprocessing systems); C5470 (Performance evaluation and testing)", corpsource = "Sch. of Electr. and Electron. Eng., Nanyang Technol. Univ., Singapore", fjournal = "IEEE Transactions on Parallel and Distributed Systems", journalabr = "IEEE Trans Parallel Distrib Syst", keywords = "attributes; Bandwidth; Buffer storage; cache interferences; Computer networks; Computer selection and evaluation; Computer simulation; Context switching; Data communication systems; Data storage equipment; Distributed shared memory; distributed shared memory models; embedded Markov chains; evaluation; Latency hiding techniques; Markov processes; memory consistency models; Memory consistency models; memory event reordering; multiprocessing systems; Multiprocessing systems; multithreaded multiprocessors; Multithreaded multiprocessors; performance; Performance; performance analysis; Performance evaluation; Petri net models; Petri nets; Processors; rate; scalable multiprocessors; Scalable multiprocessors; stochastic timed Petri nets; Stochastic timed Petri nets; synchronisation; synchronization; Synchronization; Telecommunication traffic; write buffers", treatment = "A Application; P Practical", } @TechReport{Chrisochoides:1995:MMDa, author = "Nikos Chrisochoides", title = "Multithreaded model for dynamic load balancing parallel adaptive {PDE} computations", type = "Technical report", number = "CTC95, TR221", institution = "Cornell Theory Center, Cornell University", address = "Ithaca, NY, USA", pages = "23", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, alttitle = "Multi-threaded model for dynamic load balancing 
parallel adaptive PDE computations", } @TechReport{Chrisochoides:1995:MMDb, author = "Nikos Chrisochoides", title = "Multithreaded model for dynamic load balancing parallel adaptive {PDE} computations", type = "NASA contractor report 198244; ICASE report 95-83.", institution = "Institute for Computer Applications in Science and Engineering NASA Langley Research Center", address = "Hampton, VA, USA", pages = "i + 23 + i", month = nov, year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "To appear in Applied Numerical Mathematics Journal.", abstract = "We present a multithreaded model for the dynamic load-balancing of numerical, adaptive computations required for the solution of Partial Differential Equations (PDEs) on multiprocessors. Multithreading is used as a means of exploring concurrency at the processor level in order to tolerate synchronization costs inherent to traditional (non-threaded) parallel adaptive PDE solvers. Our preliminary analysis for parallel, adaptive PDE solvers indicates that multithreading can be used as a mechanism to mask overheads required for the dynamic balancing of processor workloads with computations required for the actual numerical solution of the PDEs. Also, multithreading can simplify the implementation of dynamic load-balancing algorithms, a task that is very difficult for traditional data parallel adaptive PDE computations. Unfortunately, multithreading does not always simplify program complexity, often makes code re-usability difficult, and increases software complexity.", acknowledgement = ack-nhfb, annote = "Supported in part by an Alex Nason Prize Award Supported in part by the NSF, supplemented by ARPA. 
Supported in part by the National Aeronautics and Space Administration.", keywords = "Differential equations, Partial; Parallel programming (Computer science); Synchronization; Threads (Computer programs)", } @MastersThesis{Divekar:1995:IMP, author = "Ravindra Divekar", title = "The impact of multithreading on the performance of superscalar processors", type = "Thesis (M.A.)", number = "2117", school = "State University of New York at Binghamton, Thomas J. Watson School of Engineering and Applied Science", address = "Binghamton, NY, USA", pages = "vi + 73", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Master's theses / State University of New York at Binghamton", acknowledgement = ack-nhfb, keywords = "Operating systems (Computers)", } @Article{Drusinsky:1995:VDE, author = "Doron Drusinsky", title = "Visually Designing Embedded-Systems Applications", journal = j-DDJ, volume = "20", number = "6", pages = "62, 64, 66, 68, 104--106", month = jun, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Doron describes how design tools that incorporate object-oriented inheritance and extended state diagrams (the visual counterpart of finite state machines) can be used to build control systems.", acknowledgement = ack-nhfb, affiliation = "R-Active Concepts and Co-Active Concepts, Ltd", classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C5140 (Firmware); C6110J (Object-oriented programming); C6110P (Parallel programming); C6140D (High level languages)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "C; C (programming language); C++ listing; Codes (SYMBOLS); Computer aided software engineering; Computer software; Computer systems; Concurrency; Digital answering machine; Embedded systems; Embedded-systems application; ESD; Extended state diagram; Extended state diagrams; Finite automata; Finite state diagram; Firmware; Hierarchy; Inheritance; Interactive computer systems; Microcode; Multithreading; Object oriented programming; Operating-system-like routine; Reactive system; Real time system; State diagram; Synchronization; Systems analysis; Visual synchronization; Visually designing", pagecount = "4", thesaurus = "C language; C listings; Firmware; Object-oriented programming; Real-time systems", } @TechReport{Dubey:1995:SSM, author = "Pradeep Dubey", title = "Single-program speculative multithreading ({SPSM}) architecture: compiler-assisted fine-grained multithreading", type = "Research report", number = "RC 19928 (88233)", institution = "IBM T. J. Watson Research Center", address = "Yorktown Heights, NY, USA", pages = "25", day = "6", month = feb, year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Recent limit studies on instruction-level parallel processing, based on non-numeric applications, have reported significant performance gains from speculative execution of multiple control flows. This paper describes a new single-program speculative multithreading (SPSM) architecture, which can be viewed as an extension of any existing single-thread architecture. It enables speculative fetch, decode, and execution from multiple program locations simultaneously. Instruction threads are generated at compile-time using control dependence analysis. Inter-thread data dependences are also analyzed at compile-time. 
However, resource binding of instructions is performed only at run time, to offer binary compatibility across different implementations. New thread generation algorithms, being prototyped in a version of the TOBEY compiler, are also described. The SPSM architecture includes novel fork/suspend instructions which are used to identify independent instruction threads, and also to specify compile-time control flow speculations associated with inter-thread dependences.", acknowledgement = ack-nhfb, keywords = "Computer architecture", } @Article{Dugger:1995:MC, author = "Jim Dugger", title = "Multithreading in {C++}", journal = j-CCCUJ, volume = "13", number = "11", pages = "23--??", month = nov, year = "1995", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Fri Aug 30 16:52:23 MDT 1996", bibsource = "http://www.cuj.com/cbklist.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @InProceedings{Elmasri:1995:TCL, author = "N. Elmasri and H. H. J. Hum and G. R. 
Gao", title = "The Threaded Communication Library: Preliminary Experiences on a Multiprocessor with Dual-Processor Nodes", crossref = "ACM:1995:CPI", pages = "195--199", year = "1995", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{English:1995:MC, author = "John English", title = "Multithreading in {C++}", journal = j-SIGPLAN, volume = "30", number = "4", pages = "21--28", month = apr, year = "1995", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:03 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Manual{Fahringer:1995:UTDa, author = "Thomas Fahringer and Matthew Haines and Piyush Mehrotra", title = "On the utility of threads for data parallel programming", number = "198155", publisher = pub-NTIS, address = pub-NTIS:adr, pages = "??", year = "1995", LCCN = "NAS 1.26:198155 Govt Pubs", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Shipping list number 96-0037-M", series = "NASA contractor report", acknowledgement = ack-nhfb, keywords = "computation; interprocessor communication; parallel programming; particle in cell technique; relaxation method (mathematics)", } @InProceedings{Fahringer:1995:UTDb, author = "T. Fahringer and M. Haines and P. Mehrotra", title = "On the Utility of Threads for Data Parallel Programming", crossref = "ACM:1995:CPI", pages = "51--59", year = "1995", bibdate = "Mon Aug 26 10:38:41 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @InProceedings{Field:1995:PPS, author = "John Field and G. 
Ramalingam and Frank Tip", title = "Parametric program slicing", crossref = "ACM:1995:CRP", pages = "379--392", year = "1995", bibdate = "Mon May 3 12:52:30 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/199448/p379-field/", abstract = "Program slicing is a technique for isolating computational threads in programs. In this paper, we show how to mechanically extract a family of practical algorithms for computing slices directly from semantic specifications. These algorithms are based on combining the notion of {\em dynamic dependence tracking\/} in term rewriting systems with a program representation whose behavior is defined via an equational logic. Our approach is distinguished by the fact that changes to the behavior of the slicing algorithm can be accomplished through simple changes in rewriting rules that define the semantics of the program representation. Thus, e.g., different notions of dependence may be specified, properties of language-specific datatypes can be exploited, and various time, space, and precision tradeoffs may be made. This flexibility enables us to generalize the traditional notions of static and dynamic slices to that of a {\em constrained\/} slice, where any subset of the inputs of a program may be supplied.", acknowledgement = ack-nhfb, keywords = "algorithms; languages", subject = "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Program and recursion schemes. {\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Functional constructs. {\bf F.3.2} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Semantics of Programming Languages. {\bf F.3.1} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Specifying and Verifying and Reasoning about Programs, Specification techniques. 
{\bf F.4.2} Theory of Computation, MATHEMATICAL LOGIC AND FORMAL LANGUAGES, Grammars and Other Rewriting Systems. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, C.", } @Article{Finger:1995:LTC, author = "Jonathan Finger", title = "Lightweight Tasks in {C}", journal = j-DDJ, volume = "20", number = "5", pages = "48, 50, 102", month = may, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 03 09:16:50 1996", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "While most modern operating systems allow multiple threads within a process, earlier-generation systems do not. Jonathan presents a multithreading package that allows for cooperatively multitasked threads within a single process for operating systems that do not explicitly support threads.", acknowledgement = ack-nhfb, classification = "722.4; 723.1; 723.1.1; C6110B (Software engineering techniques); C6150J (Operating systems)", fjournal = "Dr. Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "C; C (programming language); Codes (SYMBOLS); Computer operating systems; Context switch; Cooperative task switching; Cooperatively multitasked threads; DOS; High level language; Lightweight tasker; Lightweight tasks; Microsoft compiler; Minicomputer platform; MIX Software; Modern operating systems; Multi-C package; Multiple processes; Multiprocessing systems; Multiprogramming; Multitasking system; Multithreading code; Multithreading package; Multiuser application; Multiuser mailing list management system; PC/DOS system; Preemptive task switching; Program compilers; Software engineering; Tenberry Software; Threads; Watcom compiler", pagecount = "2", thesaurus = "C listings; Multiprogramming; Software portability", } @Article{Fiske:1995:TPT, author = "Stuart Fiske and William J. 
Dally", title = "Thread prioritization: {A} thread scheduling mechanism for multiple-context parallel processors", journal = j-FUT-GEN-COMP-SYS, volume = "11", number = "6", pages = "503--518", month = oct, year = "1995", CODEN = "FGSEVI", ISSN = "0167-739X (print), 1872-7115 (electronic)", ISSN-L = "0167-739X", bibdate = "Sat Jan 10 12:00:22 MST 2004", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Future Generation Computer Systems", remark = "High-Performance Computer Architecture.", } @Article{Ford:1995:EDT, author = "Dan Ford", title = "Event-Driven Threads In {C++}", journal = j-DDJ, volume = "20", number = "6", pages = "48--50, 52, 54, 98, 100, 102", month = jun, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Dan presents a powerful, multithreaded architecture that can be used by almost any application. Implemented in C++, this class library lets you quickly create and control threads.", acknowledgement = ack-nhfb, affiliation = "Hewlett--Packard", classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C6110J (Object-oriented programming); C6110P (Parallel programming); C6140D (High level languages)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "C; C (programming language); C++; Computer aided software engineering; Computer architecture; Computer simulation; Data structures; Equivalence classes; Event driven threads; Hierarchical systems; Interthread communication; Message driven thread; Multithreaded; Multithreaded applications; Multithreading; Object oriented programming; Object oriented programming application; Object-oriented infrastructure; Parallel processing; Parallelism; Synchronization; Synchronization strategies", pagecount = "5", thesaurus = "C language; C listings; Object-oriented programming; Parallel programming", } @Article{Ford:1995:ETC, author = "Dan Ford", title = "Event-Driven Threads In {C++}", journal = j-DDJ, volume = "20", number = "6", pages = "48--50, 52, 54, 98, 100, 102", month = jun, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Dan presents a powerful, multithreaded architecture that can be used by almost any application. Implemented in C++, this class library lets you quickly create and control threads.", acknowledgement = ack-nhfb, affiliation = "Hewlett--Packard", classification = "721.1; 722.4; 723.1; 723.1.1; 723.2; 723.5; C6110J (Object-oriented programming); C6110P (Parallel programming); C6140D (High level languages)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "C; C (programming language); C++; Computer aided software engineering; Computer architecture; Computer simulation; Data structures; Equivalence classes; Event driven threads; Hierarchical systems; Interthread communication; Message driven thread; Multithreaded; Multithreaded applications; Multithreading; Object oriented programming; Object oriented programming application; Object-oriented infrastructure; Parallel processing; Parallelism; Synchronization; Synchronization strategies", pagecount = "5", thesaurus = "C language; C listings; Object-oriented programming; Parallel programming", remark = "This entry duplicates entry Ford:1995:EDT.", } @Book{Gao:1995:ATD, author = "Guang R. Gao and Lubomir Bic and Jean-Luc Gaudiot", title = "Advanced topics in dataflow computing and multithreading", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "x + 450", year = "1995", ISBN = "0-8186-6541-6 (hardcover), 0-8186-6540-8 (paperback), 0-8186-6542-4", ISBN-13 = "978-0-8186-6541-7 (hardcover), 978-0-8186-6540-0 (paperback), 978-0-8186-6542-4", LCCN = "QA76.9.A73 A356 1995", bibdate = "Sat Apr 20 11:22:41 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "computer architecture; data structures (computer science); parallel processing (electronic computers)", } @Article{Gerber:1995:IOX, author = "Bob Gerber", title = "{Informix} Online {XPS}", journal = j-SIGMOD, volume = "24", number = "2", pages = "463--463", month = may, year = "1995", CODEN = "SRECD8", ISSN = "0163-5808 (print), 1943-5835 (electronic)", ISSN-L = "0163-5808", bibdate = "Mon Jan 12 08:45:52 MST 2004", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110P (Parallel programming); C6150N (Distributed systems software); C6160B (Distributed databases)", fjournal = "ACM SIGMOD Record", keywords = "Informix
Dynamic Scalable Architecture; Informix Extended Parallel Server; Informix Online XPS; Large SMP systems; Light access methods; Linear performance speedups; Loosely coupled environments; Massively parallel clusters; Online database servers; Online/DSA servers; Open systems spectrum; Parallel database systems; Parallel resource management; Pipelined hash partitioned operators; SMP based high performance parallel data query; Table partitioning; Uniprocessor systems; XPS; XPS multithreaded process groups", thesaurus = "Distributed databases; File servers; Parallel programming; Query processing", xxcrossref = "Anonymous:1995:ASI", } @Article{Girkar:1995:ETL, author = "Milind Girkar and Constantine D. Polychronopoulos", title = "Extracting Task-Level Parallelism", journal = j-TOPLAS, volume = "17", number = "4", pages = "600--634", month = jul, year = "1995", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Jan 5 07:58:42 MST 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/210189.html", abstract = "Automatic detection of {\em task-level parallelism\/} (also referred to as functional, DAG, unstructured, or thread parallelism) at various levels of program granularity is becoming increasingly important for parallelizing and back-end compilers. Parallelizing compilers detect iteration-level or coarser granularity parallelism which is suitable for parallel computers; detection of parallelism at the statement- or operation-level is essential for most modern microprocessors, including superscalar and VLIW architectures. In this article we study the problem of detecting, expressing, and optimizing task-level parallelism, where ``task'' refers to a program statement of arbitrary granularity.
Optimizing the amount of functional parallelism (by allowing synchronization between arbitrary nodes) in sequential programs requires the notion of {\em precedence\/} in terms of paths in graphs which incorporate control and data dependences. Precedences have been defined before in a different context; however, the definition was dependent on the ideas of parallel execution and time. We show that the problem of determining precedences statically is NP-complete. Determining precedence relationships is useful in finding the essential data dependences. We show that there exists a unique minimum set of essential data dependences; finding this minimum set is NP-hard and NP-easy. We also propose a heuristic algorithm for finding the set of essential data dependences. Static analysis of a program in the Perfect Benchmarks was done, and we present some experimental results.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", keywords = "algorithms; experimentation; languages; theory", subject = "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Optimization. {\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf F.1.3}: Theory of Computation, COMPUTATION BY ABSTRACT DEVICES, Complexity Classes, Reducibility and completeness. 
{\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Code generation.", } @MastersThesis{Gulati:1995:MSM, author = "Manu Gulati", title = "Multithreading on a superscalar microprocessor", type = "Thesis (M.S., Engineering)", school = "University of California, Irvine", address = "Irvine, CA, USA", pages = "x + 102", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Manual{Haines:1995:RSC, author = "Matthew Haines and Piyush Mehrotra and David Cronk", title = "Ropes, support for collective operations among distributed threads", number = "198157", publisher = pub-NTIS, address = pub-NTIS:adr, pages = "??", year = "1995", LCCN = "NAS 1.26:198157 Govt Pubs", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Shipping list number 96-0037-M", series = "NASA contractor report", acknowledgement = ack-nhfb, keywords = "computer system design; distributed processing; interprocessor communication; memory (computers); numerical control; parallel programming; threads", } @Article{Jensen:1995:DRT, author = "E. Douglas Jensen", title = "Distributed real-time operating systems", journal = j-DDJ, volume = "20", number = "2", pages = "32--34, 36, 38", month = feb, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 08:45:36 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6150N (Distributed systems software)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "Distributed objects; Distributed operating systems; Operating systems; Real-time computing; Real-time operating systems; Real-time paradigm; Threads", thesaurus = "Network operating systems; Real-time systems", } @Article{Kavi:1995:DCM, author = "Krishna M. Kavi and A. R. 
Hurson and Phenil Patadia and Elizabeth Abraham and Ponnarasu Shanmugam", title = "Design of cache memories for multi-threaded dataflow architecture", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "253--264", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Kleiman:1995:IT, author = "Steve Kleiman and Joe Eykholt", title = "Interrupts as threads", journal = j-OPER-SYS-REV, volume = "29", number = "2", pages = "21--26", month = apr, year = "1995", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:41 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Book{Kleiman:1995:PT, author = "Steve Kleiman and Devang Shah and Bart Smaalders", title = "Programming With Threads", publisher = pub-SUNSOFT, address = pub-SUNSOFT:adr, pages = "xxviii + 534", year = "1995", ISBN = "0-13-172389-8", ISBN-13 = "978-0-13-172389-4", LCCN = "QA76.58.K59 1996", bibdate = "Wed Dec 09 12:51:22 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$48.00", URL = "http://www.amazon.com/exec/obidos/ISBN%3D0131723898/sunworldonlineA/002-4892305-5599452", acknowledgement = ack-nhfb, } @Article{Lam:1995:CPC, author = "Richard B.
Lam", title = "Cross-platform communication classes", journal = j-DDJ, volume = "20", number = "3", pages = "20, 22, 24, 26", month = mar, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Sep 10 08:45:36 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Richard summarizes common techniques for interprocess communication, presenting a library that implements semaphores in a platform-independent manner to allow signaling or controlling of shared resources between processes and threads.", acknowledgement = ack-nhfb, classification = "C5620L (Local area networks); C6110J (Object-oriented programming); C6140D (High level languages); C6150N (Distributed systems software)", fjournal = "Dr. Dobb's Journal of Software Tools", keywords = "AIX; C++ libraries; Client/server computing; Cross platform C++ libraries; Cross-platform communication classes; Example library; Graphical user interfaces; Interprocess communications; OS/2; Semaphores; Shared resources; Windows NT", thesaurus = "C language; Client-server systems; Object-oriented languages; Object-oriented programming; Resource allocation; Software libraries", } @Article{Larcheveque:1995:OIP, author = "J.-M. Larchev{\^{e}}que", title = "Optimal Incremental Parsing", journal = j-TOPLAS, volume = "17", number = "1", pages = "1--15", month = jan, year = "1995", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Jan 5 07:58:42 MST 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/200996.html", abstract = "This communication sets the problem of incremental parsing in the context of a complete incremental compiling system. 
It turns out that, according to the incrementality paradigm of the attribute evaluator and data-flow analyzer to be used, two definitions of optimal incrementality in a parser are possible. Algorithms for achieving both forms of optimality are given, both of them based on ordinary LALR(1) parse tables. Optimality and correctness proofs, which are merely outlined in this communication, are made intuitive thanks to the concept of a {\em well-formed list of threaded trees}, a natural extension of the concept of {\em threaded tree\/} found in earlier works on incremental parsing.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", keywords = "algorithms; performance; theory", subject = "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Parsing. {\bf D.2.6}: Software, SOFTWARE ENGINEERING, Programming Environments, Interactive. {\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf E.1}: Data, DATA STRUCTURES, Trees.", } @Article{Lenatti:1995:RPM, author = "C.
Lenatti", title = "Rethinking in Parallel: Multiprocessing is on the rise, despite a dearth of tools to help create multithreaded applications", journal = j-UNIXWORLD-OPEN-COMP, volume = "12", number = "8", pages = "57--??", year = "1995", CODEN = "OPCOEB", ISSN = "1072-4044", bibdate = "Fri Jan 26 17:24:01 MST 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "UnixWorld's Open Computing", } @Article{Leppanen:1995:PWO, author = "Ville Lepp{\"a}nen", title = "Performance of work-optimal {PRAM} simulation algorithms on coated meshes", journal = j-COMP-J, volume = "38", number = "10", pages = "801--810", month = "????", year = "1995", CODEN = "CMPJA6", ISSN = "0010-4620 (print), 1460-2067 (electronic)", ISSN-L = "0010-4620", bibdate = "Wed Jul 21 09:54:40 MDT 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.oup.co.uk/computer_journal/Volume_38/Issue_10/Vol38_10.index.html", URL = "http://www3.oup.co.uk/computer_journal/Volume_38/Issue_10/Vol38_10.body.html#AbstractLeppanen", acknowledgement = ack-nhfb, author-1-adr = "Department of Computer Science, University of Turku, Lemmink{\"a}isenkatu 14-18, Datacity, FIN-20520 Turku, Finland", classcodes = "C5220P (Parallel architecture); C7430 (Computer engineering); C5320G (Semiconductor storage); C6110P (Parallel programming); C4240C (Computational complexity)", corpsource = "Dept. of Comput.
Sci., Turku Univ., Finland", email-1 = "Ville.Leppanen@cs.utu.fi", fjournal = "The Computer Journal", keywords = "architectures; coated meshes; combining queues method; computational complexity; cost; greedy routing; mesh connected routing machinery; multithreading level; parallel; parallel algorithms; random-access storage; routing steps; simulated PRAM processors; simulation; sorting; synchronization wave; virtual leveled network technique; virtual machines; work optimal PRAM simulation algorithms", treatment = "P Practical", } @TechReport{Lim:1995:LPB, author = "Beng-Hong Lim and Ricardo Bianchini", title = "Limits on the performance benefits of multithreading and prefetching", type = "Research report", number = "RC 20238 (89547)", institution = "IBM T. J. Watson Research Center", address = "Yorktown Heights, NY, USA", pages = "23", day = "20", month = oct, year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "Supported in part by ARPA. Supported in part by NSF Experimental Systems. Supported in part by a NSF Presidential Young Investigator Award", keywords = "Cache memory; Fault-tolerant computing; Multiprocessors", } @MastersThesis{Loikkanen:1995:FMS, author = "Matias Loikkanen", title = "A fine-grain multithreading superscalar architecture", type = "Thesis (M.S., Engineering)", school = "University of California, Irvine", address = "Irvine, CA, USA", pages = "xi + 103", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @MastersThesis{Lu:1995:HMC, author = "Howard J. (Howard Jason) Lu", title = "Heterogeneous multithreaded computing", type = "Thesis (M. 
Eng.)", school = "Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "21", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @TechReport{Marsland:1995:SSM, author = "T. A. Marsland and Yaoqing Gao and Francis Chi-Moon Lau", title = "A study of software multithreading in distributed systems", type = "Technical report", number = "TR 95-23", institution = "Dept. of Computing Science, University of Alberta", address = "Edmonton, AB, Canada", pages = "25", year = "1995", ISSN = "0316-4683", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Mayes:1995:ULT, author = "K. R. Mayes and S. Quick and B. C. Warboys", title = "User-level threads on a general hardware interface", journal = j-OPER-SYS-REV, volume = "29", number = "4", pages = "57--62", month = oct, year = "1995", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:52 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @MastersThesis{Metz:1995:IDS, author = "David Metz", title = "Interface design and system impact analysis of a message-handling processor for fine-grain multithreading", type = "Thesis (M.S.)", school = "Oregon State University", address = "Corvallis, OR, USA", pages = "63", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Multiprocessors; Parallel processing (Electronic computers)", } @MastersThesis{Miller:1995:TPC, author = "Robert C. 
(Robert Chisolm) Miller", title = "A type-checking preprocessor for {Cilk 2}, a multithreaded {C} language", type = "Thesis (M. Eng.)", school = "Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science", address = "Cambridge, MA, USA", pages = "38", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @PhdThesis{Moore:1995:MPD, author = "Simon W. Moore", title = "Multithreaded processor design", type = "Thesis (Ph. D.)", school = "University of Cambridge, Computer Laboratory", address = "Cambridge, Cambridgeshire, UK", pages = "xvi + 125", month = feb, year = "1995", LCCN = "QA76.9.A73 M66 1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Available as Technical Report 358.", abstract = "Multithreaded processors aim to improve upon both control-flow and data-flow processor models by forming some amalgam of the two. They combine sequential behaviour from the control-flow model with concurrent aspects from data-flow design. Some multithreaded processor designs have added just a little concurrency to control-flow or limited sequential execution to data-flow. This thesis demonstrates that more significant benefits may be obtained by a more radical amalgamation of the two models. A data-driven microthread model is proposed, where a microthread is a short control-flow code sequence. To demonstrate the efficiency of this model, a suitable multithreaded processor, called Anaconda, is designed and evaluated. Anaconda incorporates a scalable temporally predictable memory tree structure with distributed virtual address translation and memory protection. A temporally predictable cached direct-mapped matching store is provided to synchronise data to microthreads. Code is prefetched into an instruction cache before execution commences. 
Earliest-deadline-first or fixed-priority scheduling is supported via a novel hardware priority queue. Control-flow execution is performed by a modified Alpha 21064 styled pipeline which assists comparison with commercial processors.", acknowledgement = ack-nhfb, annote = "Supported in part by a studentship from the UK Science and Engineering Research Council", keywords = "Computer architecture", } @Article{Oikawa:1995:RDU, author = "Shuichi Oikawa and Hideyuki Tokuda", title = "Reflection of developing user-level real-time thread packages", journal = j-OPER-SYS-REV, volume = "29", number = "4", pages = "63--76", month = oct, year = "1995", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:52 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Prabhakar:1995:IDO, author = "Ernest N. Prabhakar", title = "Implementing Distributed Objects", journal = j-DDJ, volume = "20", number = "8", pages = "80, 82, 84--85, 105--106", month = aug, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Ernest uses NeXT's PDO and Objective-C to implement a simple client-server application that packages a legacy application into an interoperable object and its client.", acknowledgement = ack-nhfb, affiliation = "NextStep\slash OpenStep User Groups Int", classification = "722.1; 722.2; 722.3; 722.4; 723.1; C5620L (Local area networks); C6110J (Object-oriented programming); C6110P (Parallel programming); C6140D (High level languages)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "Codes (symbols); Computer networks; Distributed applications; Distributed computer systems; Distributed objects; Interfaces (COMPUTER); Interoperable object; Interoperable objects; Legacy application; Multithreaded object; Network protocols; NeXT; Object oriented programming; Objective-C; PDO; Portable distributed objects; Program compilers; Simple client server application; Software prototyping; Storage allocation (computer); Table lookup", pagecount = "4", thesaurus = "C language; C listings; Client-server systems; Object-oriented programming; Parallel programming", } @Article{Prasad:1995:WNT, author = "Shashi Prasad", title = "{Windows NT} Threads --- {A} multithreaded application may actually run slower on an {SMP} machine than on its single-threaded equivalent. Here's how to avoid that", journal = j-BYTE, volume = "20", number = "11", pages = "253--??", month = nov, year = "1995", CODEN = "BYTEDJ", ISSN = "0360-5280", ISSN-L = "0360-5280", bibdate = "Mon Aug 19 08:30:25 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "BYTE Magazine", } @Article{Prasad:1995:WTS, author = "Shashi Prasad", title = "Weaving a Thread --- {Solaris} and {Windows NT} bring the power, speed, and efficiency of multithreading and symmetric multiprocessing to the desktop", journal = j-BYTE, volume = "20", number = "10", pages = "173--??", month = oct, year = "1995", CODEN = "BYTEDJ", ISSN = "0360-5280", ISSN-L = "0360-5280", bibdate = "Mon Aug 19 08:30:21 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "BYTE Magazine", } @Book{Reich:1995:DHP, author = "David E. 
Reich", title = "Designing high-powered {OS/2 Warp} applications: the anatomy of multithreaded programs", publisher = pub-WILEY, address = pub-WILEY:adr, pages = "xxxi + 336", year = "1995", ISBN = "0-471-11586-X (paperback)", ISBN-13 = "978-0-471-11586-1 (paperback)", LCCN = "QA76.76.O63R437 1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Application software; Microcomputers -- Operating systems; Operating systems (Computers); OS/2 Warp", } @Article{Rodens:1995:ESC, author = "Ira Rodens", title = "Examining {Symantec C++} 7.0", journal = j-DDJ, volume = "20", number = "8", pages = "86--89, 106--107", month = aug, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Among other features, this recent incarnation of Symantec C++ sports a visual programming environment, class and hierarchy editors, distributed build tools, and support for templates, exceptions, and run-time type identification. Compiler author Walter Bright adds tips and techniques for optimizing C++ code.", acknowledgement = ack-nhfb, affiliation = "CompuServe", classification = "722.2; 723.1; 723.1.1; 723.5; C6110J (Object-oriented programming); C6110V (Visual programming); C6115 (Programming support); C6130B (Graphics techniques); C6150G (Diagnostic, testing, debugging and evaluating systems); C6180G (Graphical user interfaces)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "32-Bit multithreaded linker; Benchmarking; Browsers; Build tasks; C (programming language); C++ language; Codes (SYMBOLS); Computer programming; Distributed build tools; DOS; Exceptions an; Express Agents; File editors; Graphical user interfaces; Hierarchy editors; LAN; Linker; Multiscope debugger; Program compilers; Program debugging; Run time type identification; Run time type identification programming environment; Software engineering; Symantec C++ 7; Templates; Upgraded Microsoft Foundation Classes; Visual programming; Visual programming environment; Visual tools; Windows 95 resources", thesaurus = "Graphical user interfaces; Object-oriented programming; Program debugging; Software reviews; Software tools; Visual programming", } @Article{Rodley:1995:TPU, author = "John Rodley", title = "Thread Programming In {UnixWare} 2.0", journal = j-DDJ, volume = "20", number = "6", pages = "56, 58--61, 102, 104", month = jun, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "With the advent of UnixWare 2.0, threads have made their way to the UNIX desktop. John describes how threads are implemented and how you can take advantage of them.", acknowledgement = ack-nhfb, classification = "722.2; 722.4; 723.1; 723.2; 723.5; C6110P (Parallel programming); C6150J (Operating systems); C6150N (Distributed systems software)", fjournal = "Dr. 
Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "Computer aided software engineering; Computer programming; Computer simulation; Concurrency programming; Fork; Lightweight processes; Multiprocessing; Multiprocessing systems; Multithreading; Object oriented programming; P1003.1c; Parallel programming; POSIX Portable Operating Systems Standard; Real time systems; Signal processing; Thread programming; Thread specification; UNIX; UnixWare 2.0; User interfaces", pagecount = "5", thesaurus = "Multiprocessing programs; Parallel programming; Unix", } @Article{Rogers:1995:SDD, author = "Anne Rogers and Martin C. Carlisle and John H. Reppy and L. J. Hendren", title = "Supporting Dynamic Data Structures on Distributed-Memory Machines", journal = j-TOPLAS, volume = "17", number = "2", pages = "233--263", month = mar, year = "1995", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri Jan 5 07:58:42 MST 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/toc/Abstracts/0164-0925/201065.html", abstract = "Compiling for distributed-memory machines has been a very active research area in recent years. Much of this work has concentrated on programs that use arrays as their primary data structures. To date, little work has been done to address the problem of supporting programs that use pointer-based dynamic data structures. The techniques developed for supporting SPMD execution of array-based programs rely on the fact that arrays are statically defined and directly addressable. Recursive data structures do not have these properties, so new techniques must be developed. In this article, we describe an execution model for supporting programs that use pointer-based dynamic data structures.
This model uses a simple mechanism for migrating a thread of control based on the layout of heap-allocated data and introduces parallelism using a technique based on futures and lazy task creation. We intend to exploit this execution model using compiler analyses and automatic parallelization techniques. We have implemented a prototype system, which we call {\em Olden}, that runs on the Intel iPSC/860 and the Thinking Machines CM-5. We discuss our implementation and report on experiments with five benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", keywords = "experimentation; languages; measurement; performance", subject = "{\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Run-time environments. {\bf D.1.3}: Software, PROGRAMMING TECHNIQUES, Concurrent Programming, Parallel programming. {\bf D.3.4}: Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf D.3.3}: Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Data types and structures. {\bf D.3.3}: Software, PROGRAMMING LANGUAGES, Language Constructs and Features, Dynamic storage management.", } @PhdThesis{Roh:1995:CGE, author = "Lucas J. Roh", title = "Code generations, evaluations, and optimizations in multithreaded executions", type = "Thesis (Ph.D.)", school = inst-CSU, address = inst-CSU:adr, pages = "ix + 154", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Code generators; Computer architecture; Parallel processing (Electronic computers)", } @InProceedings{Schauser:1995:SCP, author = "Klaus E. Schauser and David E. Culler and Seth C. 
Goldstein", title = "Separation constraint partitioning: a new algorithm for partitioning non-strict programs into sequential threads", crossref = "ACM:1995:CRP", pages = "259--271", year = "1995", bibdate = "Mon May 3 12:52:30 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/199448/p259-schauser/", abstract = "In this paper we present substantially improved thread partitioning algorithms for modern implicitly parallel languages. We present a new block partitioning algorithm, {\em separation constraint partitioning\/}, which is both more powerful and more flexible than previous algorithms. Our algorithm is guaranteed to derive maximal threads. We present a theoretical framework for proving the correctness of our partitioning approach, and we show how separation constraint partitioning makes interprocedural partitioning viable. We have implemented the partitioning algorithms in an Id90 compiler for workstations and parallel machines. Using this experimental platform, we quantify the effectiveness of different partitioning schemes on whole applications.", acknowledgement = ack-nhfb, keywords = "algorithms; experimentation; languages; theory; verification", subject = "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, Parallel C. {\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Compilers. {\bf F.2.2} Theory of Computation, ANALYSIS OF ALGORITHMS AND PROBLEM COMPLEXITY, Nonnumerical Algorithms and Problems, Computations on discrete structures. 
{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs.", } @MastersThesis{Shahnaz:1995:DMD, author = "Munira Shahnaz", title = "Design of a multithreaded data cache for a hyperscalar processor", type = "Thesis (M.S.)", school = "Department of Electrical Engineering, Texas A\&M University", address = "College Station, TX, USA", pages = "xi + 80", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Major electrical engineering", } @PhdThesis{Shankar:1995:STI, author = "Bhanu Shankar", title = "The spectrum of thread implementations on hybrid multithreaded architectures", type = "Thesis (Ph.D.)", school = inst-CSU, address = inst-CSU:adr, pages = "xi + 176", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Parallel processing (Electronic computers)", } @TechReport{Small:1995:SAB, author = "Christopher Small and Margo Seltzer", title = "Scheduler activations on {BSD}: sharing thread management between kernel and application", type = "Technical Report", number = "31-95", institution = "Center for Research in Computing Technology, Harvard University", address = "Cambridge, MA, USA", pages = "12", year = "1995", bibdate = "Tue Sep 17 07:11:15 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Spertus:1995:ELB, author = "Ellen Spertus and William J. 
Dally", title = "Evaluating the locality benefits of active messages", journal = j-SIGPLAN, volume = "30", number = "8", pages = "189--198", month = aug, year = "1995", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:08 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A major challenge in fine-grained computing is achieving locality without excessive scheduling overhead. We built two J-Machine implementations of a fine-grained programming model, the Berkeley Threaded Abstract Machine. One implementation takes an active messages approach, maintaining a scheduling hierarchy in software in order to improve data cache performance. Another approach relies on the J-Machine's message queues and fast task switch, lowering the control costs at the expense of data locality. Our analysis measures the costs and benefits of each approach, for a variety of programs and cache configurations. The active messages implementation is strongest when miss penalties are high and for the finest-grained programs. The hardware-buffered implementation is strongest in direct-mapped caches, where it achieves substantially better instruction cache performance.", acknowledgement = ack-nhfb, affiliation = "Lab. for Comput. 
Sci., MIT, Cambridge, MA, USA", classification = "C6110P (Parallel programming); C6120 (File organisation); C6150C (Compilers, interpreters and other processors); C6150N (Distributed systems software)", fjournal = "ACM SIGPLAN Notices", keywords = "Active messages; Benefits; Berkeley Threaded Abstract Machine; Cache configuration; Costs; Data cache performance; Data locality; Direct-mapped caches; Fast task switch; Fine-grained computing; Fine-grained programming model; Hardware-buffered; Instruction cache performance; J-Machine; Locality benefits; Message queues; Miss penalties; Scheduling hierarchy; Scheduling overhead", thesaurus = "Cache storage; Cost-benefit analysis; Parallel programming; Program compilers; Scheduling; Software performance evaluation", } @Article{Steensgaard:1995:ONC, author = "B. Steensgaard and E. Jul", title = "Object and native code thread mobility among heterogeneous computers (includes sources)", journal = j-OPER-SYS-REV, volume = "29", number = "5", pages = "68--77", month = dec, year = "1995", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:55 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Stuckey:1995:FCI, author = "Richard Stuckey", title = "A fully conformant implementation of {ECMA-162}", journal = j-ADA-USER, volume = "16", number = "2", pages = "83--94", month = jun, year = "1995", CODEN = "AUJOET", ISSN = "0268-652X", bibdate = "Mon Sep 8 18:43:50 MDT 1997", bibsource = "Compendex database; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "ICL has developed a portable implementation of the Ada interfaces to PCTE as specified by ECMA-162. The interfaces map the functionality required onto that provided by the C interfaces to PCTE as specified by ECMA-158. 
The process of implementing the interfaces revealed a number of errors in the ECMA PCTE standards, such as errors in ECMA-162 concerning the mapping of ECMA-149 onto Ada, errors in ECMA-158 such as missing operations or functions with incorrect parameter modes, discrepancies between the Ada and C bindings and errors in ECMA-149. The architecture of the interfaces and their test harness has been designed to allow easy porting from one PCTE implementation to another, and also from one Ada compilation system to another; some major constraints were imposed by the use of the C interfaces as the underlying platform, particularly regarding Ada's multi-threading abilities. The advantages of using the interfaces include the benefits of being able to implement tools in Ada instead of C; insulation from the underlying PCTE implementation; and the provision of facilities (e.g. call tracing) between tools and PCTE.", acknowledgement = ack-nhfb, affiliation = "ICL Enterprises", affiliationaddress = "Reading, Engl", classification = "722.2; 723.1; 723.1.1; 723.5; 902.2; C6115 (Programming support); C6140D (High level languages)", corpsource = "ICL Enterprises, Reading, UK", fjournal = "Ada User", journalabr = "Ada User J", keywords = "Ada; Ada (programming language); Ada compilation system; Ada interfaces; application program interfaces; bindings; C (programming language); C interfaces; call tracing; Codes (symbols); Computer aided software engineering; ECMA PCTE standards; ECMA-149; ECMA-158; ECMA-162; Errors; errors; fully conformant implementation; incorrect parameter modes; missing operations; multi-threading abilities; Portable Common Tools Environment; portable implementation; programming environments; software portability; software standards; software tools; Standards; test harness; User interfaces", pubcountry = "Netherlands", treatment = "P Practical", } @Book{SunSoft:1995:SMP, author = "{SunSoft}", title = "{Solaris} multithreaded programming guide", publisher = 
pub-SUNSOFT, address = pub-SUNSOFT:adr, pages = "xviii + 158", year = "1995", ISBN = "0-13-160896-7", ISBN-13 = "978-0-13-160896-2", LCCN = "QA76.76.O63 S635 1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Multiprocessors; Operating systems (Computers); Solaris (Computer file); UNIX (Computer file)", } @Article{Tamasanis:1995:MMW, author = "Doug Tamasanis", title = "{Mathematica} meets {Warp}", journal = j-BYTE, volume = "20", number = "5", month = may, year = "1995", CODEN = "BYTEDJ", ISSN = "0360-5280", ISSN-L = "0360-5280", bibdate = "Fri May 24 09:57:14 MDT 1996", bibsource = "Compendex database; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Wolfram Research has ported Mathematica, the software tool for quantitative analysis, from its Macintosh origins to a wide range of platforms, including PCs, Unix workstations, and several larger systems. The latest port of Mathematica 2.2 is to OS/2 Warp. Now OS/2 users do not have to rely on the Windows version of the Mathematica kernel, which only simulates multithreading. The new release takes full advantage of the OS/2 preemptive scheduler, threading, and 32-bit flat memory structure to both improve performance and to greatly increase the size of the problems Mathematica can handle. 
The OS/2 version is found faster and more stable than the Windows version.", acknowledgement = ack-nhfb, affiliation = "BYTE", classification = "722.2; 723.1; 723.1.1; 723.2; 723.5", fjournal = "BYTE Magazine", journalabr = "Byte", keywords = "C (programming language); Command line interface; Computer aided software engineering; Computer architecture; Computer operating systems; Computer simulation; Computer software; File editors; FORTRAN (programming language); Graphical user interfaces; Network protocols; Performance; Software Package Mathematica; Word processing", pagecount = "3", } @Article{Taylor:1995:CSA, author = "Richard N. Taylor and Kari A. Nies and Gregory Alan Bolcer and Craig A. MacFarlane and Kenneth M. Anderson and Gregory F. Johnson", title = "Chiron-1: a software architecture for user interface development, maintenance, and run-time support", journal = j-TOCHI, volume = "2", number = "2", pages = "105--144", month = jun, year = "1995", CODEN = "ATCIF4", ISSN = "1073-0516", ISSN-L = "1073-0516", bibdate = "Tue Jan 19 05:49:17 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tochi/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tochi/1995-2-2/p105-taylor/", abstract = "The Chiron-1 user interface system demonstrates key techniques that enable a strict separation of an application from its user interface. These techniques include separating the control-flow aspects of the application and user interface: they are concurrent and may contain many threads. Chiron also separates windowing and look-and-feel issues from dialogue and abstract presentation decisions via mechanisms employing a client-server architecture. To separate application code from user interface code, user interface agents called {\em artists\/} are attached to instances of application abstract data types (ADTs). 
Operations on ADTs within the application implicitly trigger user interface activities within the artists. Multiple artists can be attached to ADTs, providing multiple views and alternative forms of access and manipulation by either a single user or by multiple users. Each artist and the application run in separate threads of control. Artists maintain the user interface by making remote calls to an abstract depiction hierarchy in the Chiron server, insulating the user interface code from the specifics of particular windowing systems and toolkits. The Chiron server and clients execute in separate processes. The client-server architecture also supports multilingual systems: mechanisms are demonstrated that support clients written in programming languages other than that of the server while nevertheless supporting object-oriented server concepts. The system has been used in several universities and research and development projects. It is available by anonymous ftp.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer-Human Interaction", keywords = "design; languages", subject = "{\bf H.5.2} Information Systems, INFORMATION INTERFACES AND PRESENTATION, User Interfaces, User interface management systems (UIMS). {\bf D.2.2} Software, SOFTWARE ENGINEERING, Design Tools and Techniques, User interfaces.
{\bf D.2.m} Software, SOFTWARE ENGINEERING, Miscellaneous, Reusable software**.", } @PhdThesis{Thekkath:1995:DPM, author = "Radhika Thekkath", title = "Design and performance of multithreaded architectures", type = "Thesis (Ph.D.)", school = "University of Washington", address = "Seattle, WA, USA", pages = "x + 100", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Multiprocessors", } @MastersThesis{Todiwala:1995:DRT, author = "Khushroo Rustom Todiwala", title = "A distributed ray tracing implementation using multithreaded {RPC}", type = "Thesis (M.S.)", number = "4691", school = "University of Texas at El Paso", address = "El Paso, TX, USA", pages = "xi + 140", year = "1995", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Master's thesis / University of Texas at El Paso", acknowledgement = ack-nhfb, keywords = "Electronic data processing -- Distributed processing", } @TechReport{Toulouse:1995:CID, author = "Michel Toulouse and Teodor Gabriel Crainic and Michel Gendreau", title = "Communication issues in designing cooperative multi-thread parallel searches", type = "Report", number = "CRT-95-47", institution = "Centre de recherche sur les transports, Universit{\'e} de Montr{\'e}al", address = "Montr{\'e}al, Qu{\'e}bec, Canada", year = "1995", bibdate = "Sat Apr 20 11:20:32 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Tullsen:1995:SMM, author = "Dean M. Tullsen and Susan J. Eggers and Henry M. 
Levy", title = "Simultaneous multithreading: maximizing on-chip parallelism", journal = j-COMP-ARCH-NEWS, volume = "23", number = "2", pages = "392--403", month = may, year = "1995", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:47 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{vanHoff:1995:JIP, author = "Arthur {van Hoff}", title = "{Java} and {Internet} Programming", journal = j-DDJ, volume = "20", number = "8", pages = "56, 58, 60--61, 101--102", month = aug, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", URL = "http://www.ddj.com/ddj/issues/j508a.htm", abstract = "Java, a language designed for Internet development, is an object-oriented, multithreaded, portable, dynamic language that's similar to C, yet simpler than C++.", abstract2 = "In 1990, a new language called `Java' was developed which, it turns out, addresses many of the issues of software distribution on the Internet. Java is a simple, object-oriented, multi-threaded, garbage-collected, secure, robust, architecture-neutral, portable, high-performance, dynamic language. The language is similar to C and C++ but much simpler. Java programs are compiled into a binary format that can be executed on many platforms without recompilation. 
The language contains mechanisms to verify and execute binary Java programs in a controlled environment, protecting the computer from potential viruses and security violations.", acknowledgement = ack-nhfb, affiliation = "Sun Microsystems", classification = "721.1; 722.2; 722.3; 723.1; 723.1.1; C6110J (Object-oriented programming); C6140D (High level languages); C6150N (Distributed systems software)", fjournal = "Dr. Dobb's Journal of Software Tools", journalabr = "Dr Dobb's J Software Tools Prof Program", keywords = "Architecture-neutral language; Binary format; Browser; Bytecodes; Bytecodes, Java language; C (programming language); Codes (symbols); Compilation; Computational linguistics; Computer networks; Computer programming languages; Computer software portability; Garbage-collection; High-performance dynamic language; Interactive programs; Interfaces (computer); Internet; Internet programming; Java (programming language); Multithreaded language; Multithreading; Object oriented programming; Object-oriented language; Portable language; Program compilers; Program interpreters; Robust language; Secure language; Security of data; Semantics; Software distribution; Software engineering; Syntax; UNIX", pagecount = "4", thesaurus = "Complete computer programs; Internet; Object-oriented languages; Object-oriented programming; Security of data; Software portability", } @Article{Wallach:1995:OAM, author = "Deborah A. Wallach and Wilson C. Hsieh and Kirk L. Johnson and M. Frans Kaashoek and William E.
Weihl", title = "Optimistic active messages: a mechanism for scheduling communication with computation", journal = j-SIGPLAN, volume = "30", number = "8", pages = "217--226", month = aug, year = "1995", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:08 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Low-overhead message passing is critical to the performance of many applications. Active messages (AMs) reduce the software overhead for message handling: messages are run as handlers instead of as threads, which avoids the overhead of thread management and the unnecessary data copying of other communication models. Scheduling the execution of AMs is typically done by disabling and enabling interrupts or by polling the network. This primitive scheduling control puts severe restrictions on the code that can be run in a message handler. This paper describes a new software mechanism, optimistic active messages (OAM), that eliminates these restrictions; OAMs allow arbitrary user code to execute in handlers, and also allow handlers to block. Despite this gain in expressiveness, OAMs perform as well as AMs. We used OAM as the base for a remote procedure calling (RPC) system, Optimistic RPC (ORPC), for the CM-5 multiprocessor; it consists of an optimized thread package and a stub compiler that hides communication details from the programmer. ORPC is 1.5 to 5 times faster than traditional RPC (TRPC) for small messages and performs as well as AMs. Applications that primarily communicate using large data transfers or are fairly coarse-grained perform equally well. For applications that send many short messages, however, the ORPC and AM implementations are up to 3 times faster than the TRPC implementations. 
Using ORPC, programmers obtain the benefits of well-proven programming abstractions, do not have to be concerned with communication details, and yet obtain nearly the performance of hand-coded AM programs.", acknowledgement = ack-nhfb, affiliation = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA", classification = "C6150N (Distributed systems software)", fjournal = "ACM SIGPLAN Notices", keywords = "Application performance; Arbitrary user code; Blocking; CM-5 multiprocessor; Coarse-grained applications; Communication detail hiding; Communication scheduling; Computation scheduling; Expressiveness; Large data transfers; Low-overhead message passing; Message handlers; Optimistic active messages; Optimistic remote procedure calls; Optimized thread package; Programming abstractions; Software overhead; Stub compiler", thesaurus = "Message passing; Remote procedure calls; Scheduling", } @Article{Walter:1995:PMS, author = "Stephen Walter", title = "Put Multiprocessing Systems to Work. {II}", journal = j-UNIX-REVIEW, volume = "13", number = "1", pages = "39--??", month = jan, year = "1995", CODEN = "UNRED5", ISSN = "0742-3136", bibdate = "Sat May 25 07:59:58 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover library database", abstract = "Programming for multiprocessors requires use of unusual features such as spin locks, mutex locks, barrier synchronization, and the like. 
Using the POSIX threads API helps, but the rest you have to do yourself.", acknowledgement = ack-nhfb, fjournal = "UNIX review", } @Article{Wayner:1995:FAN, author = "Peter Wayner", title = "Free Agents: {A} new generation of light-weight, multithreaded operating environments provide security and interoperability for agent developers", journal = j-BYTE, volume = "20", number = "3", pages = "105--??", month = mar, year = "1995", CODEN = "BYTEDJ", ISSN = "0360-5280", ISSN-L = "0360-5280", bibdate = "Tue Jan 2 10:01:41 MST 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "BYTE Magazine", } @Article{Yam:1995:CFD, author = "Michael Yam", title = "A {C++} Framework for {DCE} Threads", journal = j-DDJ, volume = "20", type = "SB", number = "??", pages = "27--??", month = jul # "\slash " # aug, year = "1995", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Mon Sep 2 09:09:39 MDT 1996", bibsource = "http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @InProceedings{Yasrebi:1995:EDO, author = "M. Yasrebi", title = "Experience with Distributed Objects in a Portable and Multithreaded Library for a {LAN\slash WAN} Gateway Application", crossref = "IEEE:1995:PCL", volume = "20", pages = "164--173", year = "1995", bibdate = "Mon Sep 27 14:16:06 MDT 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "Also known as LCN'95. 
IEEE Cat no 95TB100005", keywords = "computer communications; IEEE; LCN; local computer networks", } @Article{Aitken:1996:MCJ, author = "Gary Aitken", title = "Moving from {C++} to {Java}", journal = j-DDJ, volume = "21", number = "3", pages = "52, 54--56", month = mar, year = "1996", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jan 9 09:35:43 MST 1997", bibsource = "Compendex database; http://www.ddj.com/index/author/index.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; UnCover database", abstract = "Java is claimed to be much easier to learn than C++, but the difficulties most people have in learning to program in both C++ and Java have little to do with the language itself. This paper explores some of the differences between Java and C++. The aim is to make the user aware of potential problems and opportunities when moving from C++ to Java. Brief explanations are provided for those concepts that have until now been unfamiliar to many users.", acknowledgement = ack-nhfb, affiliation = "Integrated Computer Solutions", classification = "721.1; 722.2; 723.1; 723.1.1; 723.2", fjournal = "Dr.
Dobb's Journal of Software Tools",
  journalabr = "Dr Dobb's J Software Tools Prof Program",
  keywords = "C (programming language); Character arrays; Character
    sets; Data structures; File organization; Garbage collected
    language; Header files; Interfaces (COMPUTER); Java; Machine code;
    Member function; Multithreading; Object oriented programming;
    Pointers; Program compilers; Program interpreters; Program
    processors; Program translators; Programming theory; Software
    engineering; Synchronization; Virtual machine",
  pagecount = "4",
}

@MastersThesis{Annavaram:1996:BVN,
  author = "Murali Annavaram",
  title = "Blocking versus non-blocking: issues and tradeoffs in
    multithreaded code execution",
  type = "Thesis (M.S.)",
  school = inst-CSU,
  address = inst-CSU:adr,
  pages = "viii + 57",
  year = "1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "Multiprocessors -- Design and construction; Parallel
    processing (Electronic computers)",
}

@Article{Arnold:1996:MPJ,
  author = "K. Arnold and J.
Gosling",
  title = "Multithreaded programming in {Java}",
  journal = j-WEB-TECHNIQUES,
  volume = "1",
  number = "7",
  pages = "34--40, 42--43",
  month = oct,
  year = "1996",
  CODEN = "WETEFA",
  ISSN = "1086-556X",
  bibdate = "Sat Mar 15 08:49:09 MST 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes = "C6150N (Distributed systems software); C6110J
    (Object-oriented programming); C6140D (High level languages);
    C6150J (Operating systems)",
  fjournal = "Web Techniques",
  keywords = "display; display code; dynamic behaviour; handshaking;
    interactive program; interrupts; Java; Java object oriented
    language; multiple; multiprogramming; multithreaded programming;
    multithreaded system; object-oriented languages; object-oriented
    programming; operations; parallel programming; polling; problems;
    real world software; synchronisation; threads; updates; user
    input",
  treatment = "P Practical",
}

@Article{Bellosa:1996:PIL,
  author = "Frank Bellosa and Martin Steckermeier",
  title = "The Performance Implications of Locality Information Usage
    in Shared-Memory Multiprocessors",
  journal = j-J-PAR-DIST-COMP,
  volume = "37",
  number = "1",
  pages = "113--121",
  day = "25",
  month = aug,
  year = "1996",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1996.0112",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:19:00 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0112/production;
    http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0112/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
    (Multiprocessing systems); C5470 (Performance evaluation and
    testing)",
  corpsource = "Dept. of Comput. Sci.
IV, Erlangen-Nurnberg Univ., Germany",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "cache miss counters; cache storage; evaluation; locality
    information; memory multiprocessors; parallel architectures;
    performance; scalable shared-; scheduling decisions; shared memory
    systems; shared-memory multiprocessors; thread scheduling
    algorithms",
  treatment = "P Practical",
}

%%% NOTE: Berg:1996:HDT and Berg:1996:JQH appear to describe the same
%%% article (same journal, volume, number, and first page); both keys
%%% are retained so that existing citations of either keep working.

@Article{Berg:1996:HDT,
  author = "C. Berg",
  title = "How do threads work and how can {I} create a
    general-purpose event?",
  journal = j-DDJ,
  volume = "21",
  number = "11",
  pages = "111--115, 126--127",
  month = nov,
  year = "1996",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Sat Mar 15 08:49:09 MST 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes = "C6110J (Object-oriented programming); C6140D (High
    level languages); C6150J (Operating systems); C6150N (Distributed
    systems software)",
  corpsource = "Digital Focus, USA",
  fjournal = "Dr. Dobb's Journal of Software Tools",
  keywords = "(computers); application; application program
    interfaces; applications; event; exception handling;
    general-purpose event; Internet; Java; Java thread mechanism;
    languages; lightweight processes; multiprocessor architecture;
    multithreading; object; object-oriented; object-oriented
    programming; operating systems; oriented language; programming
    interface; scheduling; synchronisation; synchronization; thread
    programming; threads; web",
  treatment = "P Practical",
}

@Article{Berg:1996:JQH,
  author = "Cliff Berg",
  title = "{Java Q and A}: How do Threads Work and How Can {I} Create
    a General-Purpose Event?",
  journal = j-DDJ,
  volume = "21",
  number = "11",
  pages = "111--??",
  day = "1",
  month = nov,
  year = "1996",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Tue Oct 15 08:20:29 1996",
  bibsource = "http://www.ddj.com/index/author/index.htm;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "Dr.
Dobb's Journal of Software Tools",
}

@InProceedings{Bhandarkar:1996:MPM,
  author = "M. A. Bhandarkar and L. V. Kale",
  title = "{MICE}: a prototype {MPI} implementation in {Converse}
    environment",
  crossref = "IEEE:1996:PSM",
  pages = "26--31",
  year = "1996",
  bibdate = "Sat Apr 19 16:34:54 MDT 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6115 (Programming
    support); C6150E (General utility programs); C6150N (Distributed
    systems software)",
  conftitle = "Proceedings. Second MPI Developer's Conference",
  corpsource = "Dept. of Comput. Sci., Illinois Univ., Urbana, IL,
    USA",
  keywords = "Abstract Device Interface; application program
    interfaces; communication; computations; Converse interoperable
    parallel programming environment; message managers; message
    passing; MICE; MPI modules; MPICH; multi-threaded MPI programs;
    open systems; parallel programming; programming environments;
    prototype MPI implementation; public-domain MPI implementation;
    PVM interoperation; thread objects; utility programs",
  sponsororg = "IEEE Comput. Soc. Tech.
Committee on Distributed Process",
  treatment = "P Practical",
}

@Article{Bianchini:1996:EPM,
  author = "Ricardo Bianchini and Beng-Hong Lim",
  title = "Evaluating the Performance of Multithreading and
    Prefetching in Multiprocessors",
  journal = j-J-PAR-DIST-COMP,
  volume = "37",
  number = "1",
  pages = "83--97",
  day = "25",
  month = aug,
  year = "1996",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1996.0109",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:19:00 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0109/production;
    http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0109/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C5440
    (Multiprocessing systems); C5470 (Performance evaluation and
    testing); C6110P (Parallel programming); C6150N (Distributed
    systems software)",
  corpsource = "COPPE Syst. Eng., Federal Univ. of Rio de Janeiro,
    Brazil",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "cache; memory latency; MIT Alewife multiprocessor;
    multiprocessing systems; multiprocessors; multithreading;
    parallel; parallel architectures; performance evaluation;
    programming; software prefetching; storage management",
  treatment = "P Practical",
}

@Article{Blumofe:1996:CEM,
  author = "Robert D. Blumofe and Christopher F. Joerg and Bradley C.
    Kuszmaul and Charles E. Leiserson and Keith H.
Randall and Yuli Zhou",
  title = "{Cilk}: An Efficient Multithreaded Runtime System",
  journal = j-J-PAR-DIST-COMP,
  volume = "37",
  number = "1",
  pages = "55--69",
  day = "25",
  month = aug,
  year = "1996",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1996.0107",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:19:00 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0107/production;
    http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0107/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm
    theory); C6110P (Parallel programming)",
  corpsource = "Lab. for Comput. Sci., MIT, Cambridge, MA, USA",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "Cilk; critical path analysis; critical-path length;
    directed acyclic graph; load balancing; multithreaded runtime
    system; parallel; parallel algorithms; parallel programming;
    processor scheduling; programming; runtime scheduling;
    synchronisation",
  treatment = "P Practical; T Theoretical or Mathematical",
}

@Article{Bundgen:1996:SCM,
  author = "Reinhard B{\"u}ndgen and Manfred G{\"o}bel and Wolfgang
    K{\"u}chlin",
  title = "Strategy Compliant Multi-Threaded Term Completion",
  journal = j-J-SYMBOLIC-COMP,
  volume = "21",
  number = "4/5/6",
  pages = "475--506 (or 475--505??)",
  month = apr # ", " # may # " \& " # jun,
  year = "1996",
  CODEN = "JSYCEH",
  ISSN = "0747-7171 (print), 1095-855X (electronic)",
  ISSN-L = "0747-7171",
  MRclass = "68Q42 (68Q22 68Q40)",
  MRnumber = "1 420 910",
  bibdate = "Sat May 10 15:54:09 MDT 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note = "Parallel symbolic computation.",
  acknowledgement = ack-nhfb,
  classcodes = "C7310 (Mathematics computing); C5440 (Multiprocessing
    systems);
C4210L (Formal languages and computational linguistics); C6130 (Data
    handling techniques)",
  corpsource = "Wilhelm-Schickard-Inst. fur Inf., Tubingen Univ.,
    Germany",
  fjournal = "Journal of Symbolic Computation",
  keywords = "completion module AC; Knuth--Bendix completion;
    parallel; parallel architectures; rewriting systems; shared
    memory; strategy compliant multi-threaded term completion; symbol
    manipulation; systems; term-rewriting system PaReDuX; unfailing
    completion",
  treatment = "A Application; P Practical",
}

@Article{Drake:1996:IJT,
  author = "Donald G. Drake",
  title = "Introduction to {Java} threads",
  journal = j-JAVAWORLD,
  volume = "1",
  number = "2",
  pages = "??--??",
  month = apr,
  year = "1996",
  CODEN = "????",
  ISSN = "1091-8906",
  bibdate = "Thu Aug 13 08:48:26 MDT 1998",
  bibsource = "http://www.javaworld.com/javaworld/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.javaworld.com/javaworld/jw-04-1996/jw-04-threads.htm",
  acknowledgement = ack-nhfb,
}

@Article{Eickemeyer:1996:EMU,
  author = "Richard J. Eickemeyer and Ross E. Johnson and Steven R.
    Kunkel and Mark S.
Squillante and Shiafun Liu",
  title = "Evaluation of multithreaded uniprocessors for commercial
    application environments",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "2",
  pages = "203--212",
  month = may,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:47 MDT 2006",
  bibsource = "http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

@Article{Engelhardt:1996:PIP,
  author = "Dean Engelhardt and Andrew Wendelborn",
  title = "A Partitioning-Independent Paradigm for Nested Data
    Parallelism",
  journal = j-INT-J-PARALLEL-PROG,
  volume = "24",
  number = "4",
  pages = "291--317",
  month = aug,
  year = "1996",
  CODEN = "IJPPE5",
  ISSN = "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L = "0885-7458",
  bibdate = "Sat Apr 26 11:36:49 MDT 1997",
  bibsource = "Compendex database;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation = "Univ of Adelaide",
  affiliationaddress = "Aust",
  classification = "721.1; 722.4; 723.1.1; 723.2; 723.5; C6110P
    (Parallel programming); C6120 (File organisation); C6150C
    (Compilers, interpreters and other processors); C6150N
    (Distributed systems software)",
  corpsource = "Dept. of Comput.
Sci., Adelaide Univ., SA, Australia",
  fjournal = "International Journal of Parallel Programming",
  journalabr = "Int J Parallel Program",
  keywords = "abstract machine; Computational methods; Computer
    simulation; costs; data parallel model; data partitioning; Data
    structures; data structures; High level languages; irregular data
    structures; Multi threading; multinode execution model;
    Multiprocessing systems; multiprocessing systems; multiprocessor
    machines; nested data parallelism; Nested data parallelism; nested
    data structures; nodal multi-threading; one-dimensional data
    parallel operator; parallel computation; Parallel execution
    models; Parallel processing systems; parallel programming;
    partitioning-independent paradigm; Performance; performance
    statistics; program compilers; software performance evaluation;
    Thinking machines; Thinking Machines CM-5",
  treatment = "P Practical",
}

@Article{Esposito:1996:MVB,
  author = "Dino Esposito",
  title = "Multithreading and {Visual Basic}",
  journal = j-DDJ,
  volume = "21",
  number = "12",
  pages = "46--??",
  month = dec,
  year = "1996",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Sat Mar 07 08:22:15 1998",
  bibsource = "http://www.ddj.com/index/author/index.htm;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "Although Visual Basic does not support native
    multithreading, it does support the Windows API. This means you
    can write VB applications composed of two or more threads. Dino
    shows you how to create multithreaded applications using both the
    SDK and Visual Basic",
  acknowledgement = ack-nhfb,
  fjournal = "Dr.
Dobb's Journal of Software Tools",
}

%%% Farber:1996:EAM is a doctoral dissertation (see its type field),
%%% so it is recorded as PhdThesis; the citation key is unchanged.

@PhdThesis{Farber:1996:EAM,
  author = "Philipp Farber",
  title = "Execution architecture of the multithreaded {ADAM}
    prototype",
  type = "Thesis (doctoral)",
  number = "13",
  school = "Swiss Federal Institute of Technology",
  address = "Zurich, Switzerland",
  pages = "iv + 127",
  year = "1996",
  ISBN = "3-7281-2384-6",
  ISBN-13 = "978-3-7281-2384-8",
  LCCN = "????",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series = "TIK-Schriftenreihe",
  acknowledgement = ack-nhfb,
  keywords = "Computer architecture; Parallel processing (Electronic
    computers); Parallel programming (Computer science)",
}

@InProceedings{Farcy:1996:ISP,
  author = "A. Farcy and O. Temam",
  title = "Improving Single-Process Performance with Multithreaded
    Processors",
  crossref = "ACM:1996:FCP",
  pages = "350--357",
  year = "1996",
  bibdate = "Wed Mar 18 12:33:18 MST 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote = "Also known as ICS'96. Held as part of the Federated
    computing research conference (FCRC'96)",
  keywords = "ACM; architecture; computer; FCRC; ICS; SIGARCH;
    supercomputing",
}

@InProceedings{Foster:1996:MIW,
  author = "I. Foster and J. Geisler and S. Tuecke",
  title = "{MPI} on the {I-WAY}: a wide-area, multimethod
    implementation of the {Message Passing Interface}",
  crossref = "IEEE:1996:PSM",
  pages = "10--17",
  year = "1996",
  bibdate = "Sat Apr 19 16:34:54 MDT 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C5620W (Other computer networks); C6110B (Software
    engineering techniques); C6115 (Programming support); C6130S (Data
    security); C6150E (General utility programs); C6150N (Distributed
    systems software)",
  conftitle = "Proceedings. Second MPI Developer's Conference",
  corpsource = "Argonne Nat.
Lab., IL, USA",
  keywords = "application program interfaces; authentication;
    automatic configuration mechanisms; communication mechanisms;
    geographically distributed computing resources; geographically
    distributed database resources; geographically distributed
    graphics resources; geographically distributed networking;
    heterogeneous systems; high-speed wide-area networks; I-WAY
    distributed-computing experiment; message authentication; message
    passing; Message Passing Interface; MPICH; Nexus multithreaded
    runtime system; parallel programming; portable high-performance
    programming model; process creation; programming environments;
    software environment; software libraries; utility programs; wide
    area networks",
  sponsororg = "IEEE Comput. Soc. Tech. Committee on Distributed
    Process",
  treatment = "P Practical",
}

@Article{Foster:1996:NAI,
  author = "Ian Foster and Carl Kesselman and Steven Tuecke",
  title = "The {Nexus} Approach to Integrating Multithreading and
    Communication",
  journal = j-J-PAR-DIST-COMP,
  volume = "37",
  number = "1",
  pages = "70--82",
  day = "25",
  month = aug,
  year = "1996",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1996.0108",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:19:00 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0108/production;
    http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0108/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C6110P (Parallel programming); C6150C (Compilers,
    interpreters and other processors); C6150N (Distributed systems
    software)",
  corpsource = "Div. of Math. and Comput. Sci., Argonne Nat.
Lab., IL, USA",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "asynchronous messaging; client-server systems; compiler
    target; data communication; distributed; distributed-memory
    systems; dynamic; dynamic communication; global memory model;
    global pointer; mechanism; memory systems; message passing;
    multithreading; Nexus runtime system; parallel languages; parallel
    programming; program compilers; remote service request;
    synchronisation; thread creation",
  treatment = "P Practical",
}

%%% Goldstein:1996:LTI: the URL field lists only the jpdc.1996.0104
%%% links, i.e., the ones that match the DOI of this article.

@Article{Goldstein:1996:LTI,
  author = "Seth Copen Goldstein and Klaus Erik Schauser and David E.
    Culler",
  title = "Lazy Threads: Implementing a Fast Parallel Call",
  journal = j-J-PAR-DIST-COMP,
  volume = "37",
  number = "1",
  pages = "5--20",
  day = "25",
  month = aug,
  year = "1996",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1996.0104",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:19:00 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0104/production;
    http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0104/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm
    theory); C6120 (File organisation)",
  corpsource = "Comput. Sci.
Div., California Univ., Berkeley, CA, USA",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "code generation strategy; lazy threads; multithreaded
    execution models; parallel call; parallel programming;
    parallel-ready sequential call; storage management",
  treatment = "T Theoretical or Mathematical",
}

@MastersThesis{Gollapudi:1996:MCA,
  author = "Sreenivas Gollapudi",
  title = "A multithreaded client-server architecture for distributed
    multimedia systems",
  type = "Thesis (M.S.)",
  school = "Dept. of Computer Science, State University of New York at
    Buffalo",
  address = "Buffalo, NY, USA",
  pages = "viii + 72",
  year = "1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note = "Also available as technical report 96-13.",
  acknowledgement = ack-nhfb,
  keywords = "Electronic data processing -- Distributed processing;
    Multimedia systems -- Design and construction; Multitasking
    (Computer science)",
}

@Article{Grunwald:1996:WPO,
  author = "Dirk Grunwald and Richard Neves",
  title = "Whole-Program Optimization for Time and Space Efficient
    Threads",
  journal = j-SIGPLAN,
  volume = "31",
  number = "9",
  pages = "50--59",
  month = sep,
  year = "1996",
  CODEN = "SINODQ",
  ISBN = "0-89791-767-7",
  ISBN-13 = "978-0-89791-767-4",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
    (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Sat May 1 15:50:57 MDT 1999",
  bibsource = "http://www.acm.org/pubs/toc/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note = "Co-published as SIGOPS Operating Systems Review {\bf 30}(5),
    December 1996, and as SIGARCH Computer Architecture News, {\bf
    24}(special issue), October 1996.",
  URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/237090/p50-grunwald/",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "algorithms; design; languages; performance",
  subject = "{\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors,
    Optimization.
{\bf C.1.2} Computer Systems Organization, PROCESSOR ARCHITECTURES,
    Multiple Data Stream Architectures (Multiprocessors), Parallel
    processors**. {\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
    Concurrent Programming, Parallel programming.",
}

@Article{Hamilton:1996:JSN,
  author = "Marc A. Hamilton",
  title = "{Java} and the Shift to Net-Centric Computing",
  journal = j-COMPUTER,
  volume = "29",
  number = "8",
  pages = "31--39",
  month = aug,
  year = "1996",
  CODEN = "CPTRB4",
  ISSN = "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L = "0018-9162",
  bibdate = "Sat Mar 15 08:49:09 MST 1997",
  bibsource = "Compendex database;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    UnCover library database",
  note = "Mentions Java's use of Unicode characters.",
  abstract = "Java, with its write once, run anywhere model, changes
    the basic techniques by which software is designed, developed, and
    deployed.",
  acknowledgement = ack-nhfb,
  affiliation = "Sun Microsystems",
  affiliationaddress = "El Segundo, CA, USA",
  classcodes = "C6140D (High level languages); C6110J (Object-oriented
    programming); C7210 (Information services and centres); C6120
    (File organisation)",
  classification = "722.1; 722.3; 723; 723.1; 723.1.1; 723.2; 723.3;
    723.5; C6110J (Object-oriented programming); C6120 (File
    organisation); C6140D (High level languages); C7210 (Information
    services and centres)",
  corpsource = "Sun Microsyst., El Segundo, CA, USA",
  fjournal = "Computer",
  journalabr = "Computer",
  keywords = "application program interfaces; application programming;
    C; C (programming language); C++; computer aided software;
    Computer architecture; Computer hardware; Computer networks;
    Computer operating systems; Computer programming languages;
    Computer simulation; Computer software; Computer software
    portability; Distributed database systems; Dynamic linking;
    engineering; environments; garbage collection; interfaces;
    Internet; Internet, Object oriented programming; interpreted
    language; Java; Java programming
language; language; management; Memory management; Middleware;
    Middleware, Computer programming languages; multithreading;
    Multithreading; Net centric computing; net-centric computing;
    Network centric computing; Numeric data types; Object oriented
    programming; object-; object-oriented languages; object-oriented
    programming; oriented programming; program compiler; Program
    compilers; program debugging; Program interpreters; program
    testing; programming environments; Security of data; software
    development; Software engineering; software-development life
    cycle; storage; Storage allocation (computer); Virtual machines;
    Web browser; Web browsers; World Wide Web",
  treatment = "P Practical",
}

@Article{Helmbold:1996:TRC,
  author = "D. P. Helmbold and C. E. McDowell",
  title = "A Taxonomy of Race Conditions",
  journal = j-J-PAR-DIST-COMP,
  volume = "33",
  number = "2",
  pages = "159--164",
  day = "15",
  month = mar,
  year = "1996",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1996.0034",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:18:59 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0034/production;
    http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0034/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C4230 (Switching theory); C4240P (Parallel
    programming and algorithm theory); C6110P (Parallel programming)",
  corpsource = "Dept. of Comput. and Inf.
Sci., California Univ., Santa Cruz, CA, USA",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "access; anomalies; hazards and race conditions; multiple
    threads; nondeterministic behavior; parallel programming; race
    conditions taxonomy; timing",
  treatment = "P Practical; T Theoretical or Mathematical",
}

@Article{Hertzum:1996:BQO,
  author = "Morten Hertzum and Erik Fr{\o}kj{\ae}r",
  title = "Browsing and querying in online documentation: a study of
    user interfaces and the interaction process",
  journal = j-TOCHI,
  volume = "3",
  number = "2",
  pages = "136--161",
  month = jun,
  year = "1996",
  CODEN = "ATCIF4",
  ISSN = "1073-0516",
  ISSN-L = "1073-0516",
  bibdate = "Tue Jan 19 05:49:17 MST 1999",
  bibsource = "http://www.acm.org/pubs/contents/journals/tochi/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org:80/pubs/citations/journals/tochi/1996-3-2/p136-hertzum/",
  abstract = "A user interface study concerning the usage
    effectiveness of selected retrieval modes was conducted using an
    experimental text retrieval system, TeSS, giving access to online
    documentation of certain programming tools. Four modes of TeSS
    were compared: (1) browsing, (2) conventional boolean retrieval,
    (3) boolean retrieval based on Venn diagrams, and (4) these three
    combined. Further, the modes of TeSS were compared to the use of
    printed manuals. The subjects observed were 87 computer science
    students who solved a number of information retrieval tasks in an
    area of computing new to them. In the experiment the use of
    printed manuals is faster and provides answers of higher quality
    than any of the electronic modes. Therefore, claims about the
    effectiveness of computer-based text retrieval have to be wary in
    situations where printed manuals are manageable to the user. Among
    the modes of TeSS, browsing is the fastest and the one causing the
    fewest operational errors. On the same two variables, time and
    operational errors, the Venn diagram mode performs better than
    conventional boolean retrieval. The combined mode scores worst on
    the objective performance measures; nonetheless nearly all
    subjects prefer this mode. Concerning the interaction process, the
    subjects tend to manage the complexities of the information
    retrieval tasks by issuing series of simple commands and
    exploiting the interactive capabilities of TeSS. To characterize
    the dynamics of the interaction process two concepts are
    introduced; threads and sequences of tactics. Threads in a query
    sequence describe the continuity during retrieval. Sequences of
    tactics concern the combined mode and describe how different
    retrieval modes succeed each other as the retrieval process
    evolves.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM Transactions on Computer-Human Interaction",
  keywords = "experimentation; human factors; performance",
  subject = "{\bf H.5.2} Information Systems, INFORMATION INTERFACES
    AND PRESENTATION, User Interfaces, Evaluation/methodology. {\bf
    H.3.3} Information Systems, INFORMATION STORAGE AND RETRIEVAL,
    Information Search and Retrieval, Query formulation. {\bf H.3.3}
    Information Systems, INFORMATION STORAGE AND RETRIEVAL,
    Information Search and Retrieval, Retrieval models. {\bf H.3.4}
    Information Systems, INFORMATION STORAGE AND RETRIEVAL, Systems
    and Software. {\bf H.5.2} Information Systems, INFORMATION
    INTERFACES AND PRESENTATION, User Interfaces, Training, help, and
    documentation.",
}

@MastersThesis{Hudson:1996:MDA,
  author = "Greg Hudson",
  title = "Multithreaded design in the {Athena} environment",
  type = "Thesis (M. Eng.)",
  school = "Massachusetts Institute of Technology, Department of
    Electrical Engineering and Computer Science",
  address = "Cambridge, MA, USA",
  pages = "240",
  year = "1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Hum:1996:SEM,
  author = "Herbert H. J. Hum and Olivier Maquelin and Kevin B.
    Theobald and Xinmin Tian and Guang R. Gao and Laurie J.
Hendren",
  title = "A Study of the {EARTH-MANNA} Multithreaded System",
  journal = j-INT-J-PARALLEL-PROG,
  volume = "24",
  number = "4",
  pages = "319--348",
  month = aug,
  year = "1996",
  CODEN = "IJPPE5",
  ISSN = "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L = "0885-7458",
  bibdate = "Sat Apr 26 11:36:49 MDT 1997",
  bibsource = "Compendex database;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation = "Intel Corp",
  affiliationaddress = "OR, USA",
  classification = "722.3; 722.4; 723.5; 731.1; C5220P (Parallel
    architecture); C5440 (Multiprocessing systems); C5470 (Performance
    evaluation and testing); C6150N (Distributed systems software)",
  corpsource = "Dept. of Meas., Archit. and Planning, Intel Corp.,
    Hillsboro, OR, USA",
  fjournal = "International Journal of Parallel Programming",
  journalabr = "Int J Parallel Program",
  keywords = "ASIC synchronization unit; benchmarks; Communication
    latency; communication latency; Computer architecture; Computer
    hardware; Computer simulation; Data communication systems; data
    flow computing; dataflow-like thread synchronizations; earth manna
    system; EARTH-MANNA multithreaded system; Execution unit;
    multiprocessing systems; Multiprocessing systems; multiprocessor
    systems; multithreaded architecture; Multithreaded system;
    off-the-shelf execution unit; parallel architectures; Parallel
    processing systems; performance; Performance; performance
    evaluation; processor scheduling; Program processors; remote
    requests; Scheduling; scheduling; sequentially-executed code;
    synchronisation; Synchronization; synchronization; Synchronization
    unit; uniprocessor performance",
  treatment = "P Practical",
}

@PhdThesis{Joerg:1996:CSP,
  author = "Christopher Frank Joerg",
  title = "The {Cilk} system for parallel multithreaded computing",
  type = "Thesis (Ph.D.)",
  school = "Massachusetts Institute of Technology, Department of
    Electrical Engineering and Computer Science",
  address = "Cambridge, MA, USA",
  pages = "199",
  year = "1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Karamcheti:1996:RME,
  author = "Vijay Karamcheti and John Plevyak and Andrew A. Chien",
  title = "Runtime Mechanisms for Efficient Dynamic Multithreading",
  journal = j-J-PAR-DIST-COMP,
  volume = "37",
  number = "1",
  pages = "21--40",
  day = "25",
  month = aug,
  year = "1996",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1996.0105",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:19:00 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0105/production;
    http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0105/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm
    theory); C5220P (Parallel architecture); C6150C (Compilers,
    interpreters and other processors)",
  corpsource = "Dept. of Comput.
Sci., Illinois Univ., Urbana, IL, USA",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "compiler; distributed memory machines; distributed
    memory systems; dynamic multithreading; hybrid; Illinois Concert
    runtime system; parallel; parallel architectures; program
    compilers; programming; pull messaging; stack-heap; threads",
  treatment = "P Practical",
}

@Book{Kleiman:1996:PT,
  author = "Steve Kleiman and Devang Shah and Bart Smaalders",
  title = "Programming with threads",
  publisher = pub-PH,
  address = pub-PH:adr,
  pages = "xxviii + 534",
  year = "1996",
  ISBN = "0-13-172389-8",
  ISBN-13 = "978-0-13-172389-4",
  LCCN = "QA76.58 .K53 1996",
  bibdate = "Fri May 10 12:18:17 MDT 2002",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "multitasking (computer science); parallel programming
    (computer science); synchronization",
}

@Article{Leary:1996:CEH,
  author = "S. Leary",
  title = "{C++} exception handling in multithreaded programs",
  journal = j-C-PLUS-PLUS-REPORT,
  volume = "8",
  number = "2",
  pages = "20--31",
  month = feb,
  year = "1996",
  CODEN = "CRPTE7",
  ISSN = "1040-6042",
  bibdate = "Tue Mar 25 13:34:48 MST 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes = "C6110J (Object-oriented programming); C6140D (High
    level languages); C6150J (Operating systems); C6130 (Data handling
    techniques)",
  corpsource = "Dresser-Wayne Ind., USA",
  fjournal = "C++ Report",
  keywords = "C language; C++; exception handling; exception-aware
    thread class; exception-safe programming; lightweight threads;
    multiprogramming; multitasking; multithreaded programs; object
    oriented programming; object-; object-oriented programming;
    operating; oriented languages; OS/2; reusable C++ classes;
    software reusability; Solaris; systems; systems (computers);
    thread manager class; thread-safe reference counting class;
    Windows 95; Windows NT",
  treatment = "P Practical",
}
@Book{Lewis:1996:TPG,
  author = "Bil Lewis and Daniel J. Berg",
  title = "Threads Primer: {A} Guide to Multithreaded Programming",
  publisher = pub-SUNSOFT,
  address = pub-SUNSOFT:adr,
  pages = "xxvi + 319",
  year = "1996",
  ISBN = "0-13-443698-9",
  ISBN-13 = "978-0-13-443698-2",
  LCCN = "QA76.642 .L478 1996",
  bibdate = "Fri Apr 11 17:06:46 2003",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series = "Sun BluePrints Program",
  acknowledgement = ack-nhfb,
  keywords = "POSIX (Computer software standard); Threads (Computer
    programs); UNIX (Computer file)",
}

@Article{Lim:1996:LPB,
  author = "Beng-Hong Lim and Ricardo Bianchini",
  title = "Limits on the performance benefits of multithreading and
    prefetching",
  journal = j-SIGMETRICS,
  volume = "24",
  number = "1",
  pages = "37--46",
  month = may,
  year = "1996",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/233008.233021",
  ISSN = "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L = "0163-5999",
  bibdate = "Thu Jun 26 11:21:30 MDT 2008",
  bibsource = "http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "This paper presents new analytical models of the performance
    benefits of multithreading and prefetching, and experimental
    measurements of parallel applications on the MIT Alewife
    multiprocessor. For the first time, both techniques are evaluated on a
    real machine as opposed to simulations. The models determine the
    region in the parameter space where the techniques are most effective,
    while the measurements determine the region where the applications
    lie. We find that these regions do not always overlap significantly.
    The multithreading model shows that only 2-4 contexts are necessary to
    maximize this technique's potential benefit in current
    multiprocessors. Multithreading improves execution time by less than
    10\% for most of the applications that we examined.
The model also shows that multithreading can significantly improve the
    performance of the same applications in multiprocessors with longer
    latencies. Reducing context-switch overhead is not crucial. The
    software prefetching model shows that allowing 4 outstanding
    prefetches is sufficient to achieve most of this technique's potential
    benefit on current multiprocessors. Prefetching improves performance
    over a wide range of parameters, and improves execution time by as
    much as 20-50\% even on current multiprocessors. The two models show
    that prefetching has a significant advantage over multithreading for
    machines with low memory latencies and/or applications with high cache
    miss rates because a prefetch instruction consumes less time than a
    context-switch.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGMETRICS Performance Evaluation Review",
}

@Article{Lowenthal:1996:UFG,
  author = "David K. Lowenthal and Vincent W. Freeh and Gregory R. Andrews",
  title = "Using Fine-Grain Threads and Run-Time Decision Making in
    Parallel Computing",
  journal = j-J-PAR-DIST-COMP,
  volume = "37",
  number = "1",
  pages = "41--54",
  day = "25",
  month = aug,
  year = "1996",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1996.0106",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:19:00 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0106/production;
    http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0106/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5220P (Parallel architecture); C6110P (Parallel
    programming); C4240P (Parallel programming and algorithm theory)",
  corpsource = "Dept. of Comput.
Sci., Arizona Univ., Tucson, AZ, USA",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "computing; distributed shared memory; distributed-memory
    multiprocessors; fine-grain; fine-grain threads; parallel; parallel
    architectures; parallel programming; parallelism; run-time decision
    making",
  treatment = "P Practical",
}

@Article{Mane:1996:SJP,
  author = "I. Mane",
  title = "Survey of the {Java} programming language",
  journal = j-ELECTRONIK,
  volume = "45",
  number = "17",
  pages = "84--87",
  day = "20",
  month = "????",
  year = "1996",
  CODEN = "EKRKAR",
  ISSN = "0013-5658",
  bibdate = "Sat Mar 15 08:49:09 MST 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes = "C6140D (High level languages); C6150C (Compilers,
    interpreters and other processors)",
  countrypub = "Germany",
  fjournal = "Elektronik",
  keywords = "fixed; high level languages; Java programming language;
    memory partitions; multi-threading; program compilers; source code
    compiler",
  language = "German",
  treatment = "G General Review",
}

@PhdThesis{Mao:1996:PMS,
  author = "Weihua Mao",
  title = "Performance modeling of data prefetching and multithreading in
    scalable multiprocessors",
  type = "Thesis (Ph.D.)",
  school = "University of Southern California",
  address = "Los Angeles, CA, USA",
  pages = "xi + 130",
  year = "1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  alttitle = "Performance modeling of data prefetching and multithreading
    in scalable multiprocessors",
}

@Article{McManis:1996:JDSa,
  author = "Chuck McManis",
  title = "{Java} In Depth: Synchronizing threads in {Java}",
  journal = j-JAVAWORLD,
  volume = "1",
  number = "2",
  pages = "??--??",
  month = apr,
  year = "1996",
  CODEN = "????",
  ISSN = "1091-8906",
  bibdate = "Thu Aug 13 08:48:26 MDT 1998",
  bibsource = "http://www.javaworld.com/javaworld/;
http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.javaworld.com/javaworld/jw-04-1996/jw-04-synch.htm",
  acknowledgement = ack-nhfb,
}

@Article{McManis:1996:JDSb,
  author = "Chuck McManis",
  title = "{Java} In Depth: Synchronizing threads in {Java}, {Part II}",
  journal = j-JAVAWORLD,
  volume = "1",
  number = "3",
  pages = "??--??",
  month = may,
  year = "1996",
  CODEN = "????",
  ISSN = "1091-8906",
  bibdate = "Thu Aug 13 08:48:26 MDT 1998",
  bibsource = "http://www.javaworld.com/javaworld/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.javaworld.com/javaworld/jw-05-1996/jw-05-mcmanis.htm",
  acknowledgement = ack-nhfb,
}

@Article{McManis:1996:JDT,
  author = "Chuck McManis",
  title = "{Java} In Depth: Threads and applets and visual controls",
  journal = j-JAVAWORLD,
  volume = "1",
  number = "5",
  pages = "??--??",
  month = jul,
  year = "1996",
  CODEN = "????",
  ISSN = "1091-8906",
  bibdate = "Thu Aug 13 08:48:26 MDT 1998",
  bibsource = "http://www.javaworld.com/javaworld/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.javaworld.com/javaworld/jw-07-1996/jw-07-mcmanis.htm",
  acknowledgement = ack-nhfb,
}

@MastersThesis{Mishra:1996:TIS,
  author = "Amitabh Mishra",
  title = "Task and instruction scheduling in parallel multithreaded
    processors",
  type = "Thesis (M.S.)",
  school = "Department of Computer Science, Texas A\&M University",
  address = "College Station, TX, USA",
  pages = "ix + 60",
  year = "1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "Major computer science",
}

@Article{Mitchell:1996:JTM,
  author = "John D.
Mitchell",
  title = "{Java} Tips: More about threads and the resize problem",
  journal = j-JAVAWORLD,
  volume = "1",
  number = "4",
  pages = "??--??",
  month = jun,
  year = "1996",
  CODEN = "????",
  ISSN = "1091-8906",
  bibdate = "Thu Aug 13 08:48:26 MDT 1998",
  bibsource = "http://www.javaworld.com/javaworld/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.javaworld.com/javaworld/javatips/jw-javatip9.htm",
  acknowledgement = ack-nhfb,
}

@Book{Moore:1996:MPD,
  author = "Simon W. (Simon William) Moore",
  title = "Multithreaded processor design",
  volume = "SECS 358",
  publisher = pub-KLUWER,
  address = pub-KLUWER:adr,
  pages = "xvi + 142",
  year = "1996",
  ISBN = "0-7923-9718-5",
  ISBN-13 = "978-0-7923-9718-2",
  LCCN = "QA76.5 .M574 1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series = "The Kluwer international series in engineering and computer
    science",
  acknowledgement = ack-nhfb,
  keywords = "Computer architecture; computer architecture; Computers --
    Design; multiprocessors -- design and construction; Multiprocessors --
    Design and construction; Parallel computers; parallel computers",
}

@Book{Nichols:1996:PP,
  author = "Bradford Nichols and Dick Buttlar and Jackie Proulx Farrell",
  title = "{Pthreads} Programming",
  publisher = pub-ORA,
  address = pub-ORA:adr,
  pages = "xvi + 267",
  year = "1996",
  ISBN = "1-56592-115-1",
  ISBN-13 = "978-1-56592-115-3",
  LCCN = "QA76.642.N53 1996",
  bibdate = "Mon May 11 11:04:53 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price = "US\$29.95",
  URL = "http://www.amazon.com/exec/obidos/ASIN/1565921151/ref=sim_books/002-4892305-5599452;
    http://www.oreilly.com/catalog/pthread",
  acknowledgement = ack-nhfb,
}

@Book{Northrup:1996:PUT,
  author = "Charles J.
Northrup",
  title = "Programming with {UNIX} Threads",
  publisher = pub-WILEY,
  address = pub-WILEY:adr,
  pages = "xv + 399",
  year = "1996",
  ISBN = "0-471-13751-0 (paperback)",
  ISBN-13 = "978-0-471-13751-1 (paperback)",
  LCCN = "QA76.76.O63 N674 1996",
  bibdate = "Tue May 25 07:14:38 MDT 1999",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "operating systems (computers); UNIX (computer file)",
}

@Book{Norton:1996:TTM,
  author = "Scott J. Norton and Mark D. DiPasquale",
  title = "Thread time: the multithreaded programming guide",
  publisher = pub-PH,
  address = pub-PH:adr,
  pages = "xx + 538",
  year = "1996",
  ISBN = "0-13-190067-6 (paperback)",
  ISBN-13 = "978-0-13-190067-7 (paperback)",
  LCCN = "QA76.642.N67 1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series = "Hewlett--Packard professional books",
  URL = "http://www.amazon.com/exec/obidos/ASIN/0131900676/ref=sim_books/002-4892305-5599452",
  acknowledgement = ack-nhfb,
  annote = "System requirements: IBM compatible PC; CD-ROM drive.",
  keywords = "Parallel programming (Computer science)",
}

@Book{Pham:1996:MPW,
  author = "Thuan Q. Pham and Pankaj K. Garg",
  title = "Multithreaded programming with {Windows NT}",
  publisher = pub-PHPTR,
  address = pub-PHPTR:adr,
  pages = "xviii + 227",
  year = "1996",
  ISBN = "0-13-120643-5",
  ISBN-13 = "978-0-13-120643-4",
  LCCN = "QA76.642 .P52 1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote = "One 3 1/2 in. diskette in pocket inside back cover.",
  keywords = "Microsoft Windows NT; multiprocessors; Multiprocessors;
    Parallel programming; parallel programming (computer science);
    Parallel programming (Computer science)",
}

@Article{Philbin:1996:TSC,
  author = "James Philbin and Jan Edler and Otto J. Anshus and Craig C.
Douglas and Kai Li",
  title = "Thread Scheduling for Cache Locality",
  journal = j-SIGPLAN,
  volume = "31",
  number = "9",
  pages = "60--71",
  month = sep,
  year = "1996",
  CODEN = "SINODQ",
  ISBN = "0-89791-767-7",
  ISBN-13 = "978-0-89791-767-4",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Sun Dec 14 09:17:23 MST 2003",
  bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note = "Co-published as SIGOPS Operating Systems Review {\bf 30}(5),
    December 1996, and as SIGARCH Computer Architecture News, {\bf
    24}(special issue), October 1996.",
  URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/237090/p60-philbin/",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "algorithms; experimentation; performance",
  subject = "{\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors,
    Optimization. {\bf I.1.2} Computing Methodologies, SYMBOLIC AND
    ALGEBRAIC MANIPULATION, Algorithms, Algebraic algorithms. {\bf F.2.2}
    Theory of Computation, ANALYSIS OF ALGORITHMS AND PROBLEM COMPLEXITY,
    Nonnumerical Algorithms and Problems, Sequencing and scheduling. {\bf
    F.2.1} Theory of Computation, ANALYSIS OF ALGORITHMS AND PROBLEM
    COMPLEXITY, Numerical Algorithms and Problems, Computations on
    matrices. {\bf D.2.2} Software, SOFTWARE ENGINEERING, Design Tools and
    Techniques, User interfaces.",
}

@Book{Robbins:1996:PUP,
  author = "Kay A.
Robbins and Steven Robbins",
  title = "Practical {UNIX} programming: a guide to concurrency,
    communication, and multithreading",
  publisher = pub-PHPTR,
  address = pub-PHPTR:adr,
  pages = "xiv + 658",
  year = "1996",
  ISBN = "0-13-443706-3",
  ISBN-13 = "978-0-13-443706-4",
  LCCN = "QA76.76.O63 R615 1996",
  bibdate = "Tue May 25 07:14:38 MDT 1999",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "Microcomputers -- Programming; Operating systems; UNIX
    (Computer file)",
}

@Article{Roh:1996:GOE,
  author = "Lucas Roh and Walid A. Najjar and Bhanu Shankar and A. P. Wim
    B{\"o}hm",
  title = "Generation, Optimization, and Evaluation of Multithreaded Code",
  journal = j-J-PAR-DIST-COMP,
  volume = "32",
  number = "2",
  pages = "188--204",
  day = "1",
  month = feb,
  year = "1996",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1996.0013",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:18:59 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0013/production;
    http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0013/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C1180 (Optimisation techniques); C4230M
    (Multiprocessor interconnection); C5220P (Parallel architecture);
    C6110P (Parallel programming); C6150C (Compilers, interpreters and
    other processors); C6150N (Distributed systems software)",
  corpsource = "Dept. of Comput.
Sci., Colorado State Univ., Fort Collins, CO, USA",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "architectures; code generation scheme; compiler
    intermediate; form; global bottom-up optimization technique; inputs;
    instruction level; intrathread locality; latency tolerance;
    multiprocessor interconnection networks; multithreaded; multithreaded
    code; multithreaded code evaluation; multithreaded code generation;
    multithreaded computation model; multithreaded synchronization;
    optimisation; optimising compilers; parallel; parallel architectures;
    parallelising compilers; parallelism; Pebbles; processor scheduling;
    processor utilization; program level; programming; reduced instruction
    set computing; scalability; synchronisation; synchronization costs;
    top-down code generation",
  treatment = "T Theoretical or Mathematical",
}

@InProceedings{Sah:1996:PIS,
  author = "A. Sah and K. Brown and E. Brewer",
  title = "Programming the {Internet} from the server-side with {Tcl} and
    {Audience1}",
  crossref = "USENIX:1996:ATT",
  pages = "235--??, 183--188",
  year = "1996",
  bibdate = "Sat Mar 15 08:49:09 MST 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes = "C6150N (Distributed systems software); C6115 (Programming
    support); C6110 (Systems analysis and programming); C6140D (High level
    languages); C7230 (Publishing and reproduction); C7250N (Front end
    systems for online searching)",
  conflocation = "Monterey, CA, USA; 10--13 July 1996",
  conftitle = "Proceedings of 4th Annual Tcl/Tk Workshop '96",
  corpsource = "Inktomi Corp., Berkeley, CA, USA",
  keywords = "applications; Audience1; authoring languages; client-server;
    client-server systems; client-side languages; electronic; end-;
    extension library; HotBot search engine; HotWired; Inktomi; Internet;
    mass customization features; MTtcl; multi-threaded Tcl; online
    front-ends; programming; publishing; server languages; server-side
    Internet programming;
software libraries; to-end publishing tool; World Wide Web",
  treatment = "P Practical",
}

@Article{Schmidt:1996:CAPa,
  author = "D. C. Schmidt and S. Vinoski",
  title = "Comparing alternative programming techniques for multithreaded
    servers",
  journal = j-C-PLUS-PLUS-REPORT,
  volume = "8",
  number = "2",
  pages = "50--59",
  month = feb,
  year = "1996",
  CODEN = "CRPTE7",
  ISSN = "1040-6042",
  bibdate = "Tue Mar 25 13:34:48 MST 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes = "C6150N (Distributed systems software); C6110J
    (Object-oriented programming); C6160 (Database management systems
    (DBMS)); C6140D (High level languages)",
  corpsource = "Washington Univ., St. Louis, MO, USA",
  fjournal = "C++ Report",
  keywords = "applications; C; C language; C++; client-server systems;
    CORBA; database management; desktop client; financial data processing;
    investment brokers; languages; multithreaded servers; multithreaded
    systems; object-oriented; object-oriented programming; programming;
    query processing; stock prices; stock quote database; synchronization;
    systems; wrappers",
  treatment = "P Practical",
}

@Article{Schmidt:1996:CAPb,
  author = "D. C. Schmidt and S. Vinoski",
  title = "Comparing alternative programming techniques for multithreaded
    {CORBA} servers",
  journal = j-C-PLUS-PLUS-REPORT,
  volume = "8",
  number = "4",
  pages = "56--66",
  month = apr,
  year = "1996",
  CODEN = "CRPTE7",
  ISSN = "1040-6042",
  bibdate = "Tue Mar 25 13:34:48 MST 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes = "C6110J (Object-oriented programming); C6110P (Parallel
    programming); C6140D (High level languages)",
  corpsource = "Washington Univ., St.
Louis, MO, USA",
  fjournal = "C++ Report",
  keywords = "C language; complexity; distributed multithreaded
    applications; multithreaded CORBA servers; object-oriented
    programming; parallel; programming; programming techniques",
  treatment = "P Practical",
}

@Article{Schmidt:1996:CAPc,
  author = "D. C. Schmidt and S. Vinoski",
  title = "Comparing alternative programming techniques for multithreaded
    {CORBA} servers",
  journal = j-C-PLUS-PLUS-REPORT,
  volume = "8",
  number = "7",
  pages = "47--56",
  month = jul,
  year = "1996",
  CODEN = "CRPTE7",
  ISSN = "1040-6042",
  bibdate = "Tue Mar 25 13:34:48 MST 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes = "C6110J (Object-oriented programming); C6150N (Distributed
    systems software); C5690 (Other data communication equipment and
    techniques); C6110P (Parallel programming)",
  corpsource = "Washington Univ., St. Louis, MO, USA",
  fjournal = "C++ Report",
  keywords = "alternative programming techniques; C; C++ wrappers;
    concurrency model; CORBA; multithreaded CORBA; multithreaded stock
    quote servers; network servers; object-oriented programming; parallel;
    programming; servers; thread per request; thread per session model;
    thread pool",
  treatment = "P Practical",
}

@Article{Severance:1996:MOB,
  author = "Charles Severance and Richard Enbody and Paul Petersen",
  title = "Managing the Overall Balance of Operating System Threads on a
    Multiprocessor Using Automatic Self-Allocating Threads ({ASAT})",
  journal = j-J-PAR-DIST-COMP,
  volume = "37",
  number = "1",
  pages = "106--112",
  day = "25",
  month = aug,
  year = "1996",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1996.0111",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:19:00 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0111/production;
    http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0111/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessing systems); C6110P (Parallel
    programming); C6150J (Operating systems); C6150N (Distributed systems
    software)",
  corpsource = "Dept. of Comput. Sci., Michigan State Univ., East Lansing,
    MI, USA",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "allocation; automatic self-allocating threads;
    multiprocessing system; multiprocessing systems; operating system;
    operating systems (computers); parallel programming; processor
    scheduling; run-time environment; self-scheduling; thread; thread
    scheduling",
  treatment = "P Practical; X Experimental",
}

@InProceedings{Skjellum:1996:TTM,
  author = "A. Skjellum and B. Protopopov and S. Hebert",
  title = "A thread taxonomy for {MPI}",
  crossref = "IEEE:1996:PSM",
  pages = "50--57",
  year = "1996",
  bibdate = "Sat Apr 19 16:34:54 MDT 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110F
    (Formal methods); C6150E (General utility programs); C6150J (Operating
    systems); C6150N (Distributed systems software)",
  conftitle = "Proceedings. Second MPI Developer's Conference",
  corpsource = "Dept. of Comput. Sci., Mississippi State Univ., MS, USA",
  keywords = "API extensions; application program interfaces; Channel
    Device; computational unit; fine-grain concurrency; formal
    specification; message passing; minimal portable thread management;
    MPI; MPICH; multi-threaded thread-safe ADI; non-thread-safe MPI call
    semantics; resource container; software portability; synchronisation;
    synchronization mechanisms; thread models; thread safety; thread
    taxonomy; user-level mechanism; utility programs; Windows NT version",
  sponsororg = "IEEE Comput. Soc. Tech.
Committee on Distributed Process",
  treatment = "P Practical",
}

@Article{Sundaresan:1996:COO,
  author = "Neelakantan Sundaresan and Dennis Gannon",
  title = "{Coir}: An Object-Oriented System for Control and Dynamic Data
    Parallelism",
  journal = j-J-PAR-DIST-COMP,
  volume = "37",
  number = "1",
  pages = "98--105",
  day = "25",
  month = aug,
  year = "1996",
  CODEN = "JPDCER",
  DOI = "http://dx.doi.org/10.1006/jpdc.1996.0110",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Thu Mar 9 09:19:00 MST 2000",
  bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0110/production;
    http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.0110/production/pdf",
  acknowledgement = ack-nhfb,
  classification = "C4240P (Parallel programming and algorithm theory);
    C5220P (Parallel architecture); C6110J (Object-oriented programming);
    C6110P (Parallel programming); C6150N (Distributed systems software)",
  corpsource = "Applic. Dev. Technol. Inst., IBM Corp., San Jose, CA, USA",
  fjournal = "Journal of Parallel and Distributed Computing",
  keywords = "C++ library; Coir; distributed memory machines; distributed
    memory systems; dynamic data parallelism; message passing;
    message-passing; multithreading; object-oriented; object-oriented
    system; operating system; parallel; parallel architectures; parallel
    programming; programming; shared memory systems; symmetric
    multiprocessors; synchronisation",
  treatment = "P Practical; T Theoretical or Mathematical",
}

@Article{Tullsen:1996:ECI,
  author = "Dean M. Tullsen and Susan J. Eggers and Joel S. Emer and Henry
    M. Levy and Jack L. Lo and Rebecca L.
Stamm",
  title = "Exploiting choice: instruction fetch and issue on an
    implementable simultaneous multithreading processor",
  journal = j-COMP-ARCH-NEWS,
  volume = "24",
  number = "2",
  pages = "191--202",
  month = may,
  year = "1996",
  CODEN = "CANED2",
  ISSN = "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:40:47 MDT 2006",
  bibsource = "http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

@PhdThesis{Tullsen:1996:SM,
  author = "Dean Michael Tullsen",
  title = "Simultaneous multithreading",
  type = "Thesis (Ph.D.)",
  school = "University of Washington",
  address = "Seattle, WA, USA",
  pages = "vi + 99",
  year = "1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "Computer architecture; Parallel processing (Electronic
    computers)",
}

@MastersThesis{Verriello:1996:MSM,
  author = "Anthony Verriello",
  title = "Memory sharing in multithreaded transaction environments",
  type = "Thesis (M.S.)",
  school = "Hofstra University",
  address = "Hempstead, NY, USA",
  pages = "180",
  year = "1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "Memory, Virtual (Computer science); Transaction systems
    (Computer systems)",
}

@Article{Vinoski:1996:DCD,
  author = "S. Vinoski and D. C.
Schmidt",
  title = "Distributed callbacks and decoupled communication in {CORBA}",
  journal = j-C-PLUS-PLUS-REPORT,
  volume = "8",
  number = "9",
  pages = "48--56, 77",
  month = oct,
  year = "1996",
  CODEN = "CRPTE7",
  ISSN = "1040-6042",
  bibdate = "Tue Mar 25 13:34:48 MST 1997",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classcodes = "C6150N (Distributed systems software); C6110J
    (Object-oriented programming)",
  corpsource = "Hewlett--Packard's Distributed Comput. Program,
    Chelmsford, MA, USA",
  fjournal = "C++ Report",
  keywords = "client-server systems; client/server; concurrency control;
    concurrency models; consumers; CORBA; decoupled communication;
    decoupled peer-to-peer; distributed callbacks; distributed object
    computing systems; distributed stock quoting; multithreaded;
    object-oriented; OMG Events object service; programming;
    relationships; request communication; response communication; server
    applications; suppliers; systems",
  treatment = "P Practical",
}

@Article{Wise:1996:SDP,
  author = "David S. Wise and Joshua Walgenbach",
  title = "Static and dynamic partitioning of pointers as links and
    threads",
  journal = j-SIGPLAN,
  volume = "31",
  number = "6",
  pages = "42--49",
  month = jun,
  year = "1996",
  CODEN = "SINODQ",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Sun Dec 14 09:17:20 MST 2003",
  bibsource = "http://portal.acm.org/;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  affiliation = "Dept. of Comput.
Sci., Indiana Univ., Bloomington, IN, USA",
  fjournal = "ACM SIGPLAN Notices",
}

@Article{Wismuller:1996:IDP,
  author = "Roland Wism{\"u}ller and Michael Oberhuber and Johann Krammer
    and Olav Hansen",
  title = "Interactive debugging and performance analysis of massively
    parallel applications",
  journal = j-PARALLEL-COMPUTING,
  volume = "22",
  number = "3",
  pages = "415--442",
  day = "29",
  month = apr,
  year = "1996",
  CODEN = "PACOEJ",
  ISSN = "0167-8191",
  ISSN-L = "0167-8191",
  bibdate = "Fri Aug 6 10:14:54 MDT 1999",
  bibsource = "Compendex database;
    http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_free/browse/browse.cgi?year=1996&volume=22&issue=3;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.elsevier.com/cgi-bin/cas/tree/store/parco/cas_sub/browse/browse.cgi?year=1996&volume=22&issue=3&aid=1049",
  acknowledgement = ack-nhfb,
  affiliation = "Inst f{\"u}r Informatik der Technischen Universit{\"a}t
    M{\"u}nchen",
  affiliationaddress = "M{\"u}nchen, Ger",
  classification = "722.2; 722.4; 723.1; 723.2; 723.5; C6110P (Parallel
    programming); C6115 (Programming support); C6150G (Diagnostic,
    testing, debugging and evaluating systems)",
  corpsource = "Inst. f{\"u}r Inf., Tech. Univ. M{\"u}nchen, Germany",
  fjournal = "Parallel Computing",
  journalabr = "Parallel Comput",
  keywords = "applications; attributed measurements; Codes (symbols);
    Computer debugging; Computer programming; Computer simulation;
    debugger; debugging; DETOP; Distributed computer systems; distributed
    evaluation; Distributed online monitoring system; environments;
    Interactive computer systems; Interactive debugging; intrusion;
    massively parallel; Massively parallel applications; minimal;
    monitoring system; multithreaded programming models; Online systems;
    parallel; Parallel debugger; Parallel processing systems; parallel
    programming; Parallelization; PATOP; Performance; performance
    analysis; Performance analysis; performance analyzer; performance
    bottlenecks; Personal computers; PowerPC; program debugging;
    programming; scalability; software; software performance evaluation;
    Supercomputers; tools; usability; User interfaces",
  treatment = "P Practical",
}

@Article{Yam:1996:DPV,
  author = "Michael Yam",
  title = "{DCE} Pthreads versus {NT} Threads. {Michael} ports {PTF}, a
    {C++} class library for {DCE} pthreads, from {HP-UX System 9} to
    {Windows NT}. In doing so, he examines the differences between
    pthreads and {NT} threads, and describes the porting experience",
  journal = j-DDJ,
  volume = "21",
  number = "12",
  pages = "16--??",
  month = dec,
  year = "1996",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Mon Dec 2 07:52:21 MST 1996",
  bibsource = "http://www.ddj.com/index/author/index.htm;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "Dr.
Dobb's Journal of Software Tools",
}

@PhdThesis{Yoo:1996:PCM,
  author = "Namhoon Yoo",
  title = "Parallelism control in multithreaded multiprocessors",
  type = "Thesis (Ph.D.)",
  school = "University of Southern California",
  address = "Los Angeles, CA, USA",
  pages = "x + 86",
  year = "1996",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "Computer architecture; Data flow computing; Multiprocessors;
    Parallel processing (Electronic computers)",
}

@Book{Zignin:1996:TDM,
  author = "Bernard Zignin",
  title = "Techniques du multithread: du parall{\'e}lisme dans les
    processus {(French) [Multithreading techniques: parallelism in
    processes]}",
  publisher = pub-HERMES,
  address = pub-HERMES:adr,
  pages = "72",
  year = "1996",
  ISBN = "2-86601-562-2",
  ISBN-13 = "978-2-86601-562-6",
  LCCN = "????",
  bibdate = "Wed Dec 09 23:36:26 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series = "CNAM. Synth{\`e}ses informatiques",
  acknowledgement = ack-nhfb,
  keywords = "Parall{\'e}lisme (informatique)",
}

@Article{Anonymous:1997:NPW,
  author = "Anonymous",
  title = "New Products: {WebThreads 1.0.1; QUERYFLEX Report Writer; Linux
    Pro Desktop 1.0; NDP Fortran for Linux; Numerics and Visualization for
    Java; Craftworks Linux/AXP 2.2; InfoDock Linux Software Development
    Toolset; Caldera Wabi 2.2 for Linux}",
  journal = j-LINUX-J,
  volume = "34",
  pages = "??--??",
  month = feb,
  year = "1997",
  CODEN = "LIJOFX",
  ISSN = "1075-3583 (print), 1938-3827 (electronic)",
  ISSN-L = "1075-3583",
  bibdate = "Fri Oct 9 08:35:26 MDT 1998",
  bibsource = "http://noframes.linuxjournal.com/lj-issues/issue34/index.html;
    http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "Linux journal",
}

@Article{Bednorz:1997:CDA,
  author = "M. Bednorz and A. Gwozdowski and K.
Zieli{\'n}ski",
  title = "Contextual debugging and analysis of multithreaded applications",
  journal = j-CPE,
  volume = "9",
  number = "2",
  pages = "123--139",
  month = feb,
  year = "1997",
  CODEN = "CPEXEI",
  ISSN = "1040-3108",
  ISSN-L = "1040-3108",
  bibdate = "Tue Sep 7 06:06:28 MDT 1999",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
    http://www3.interscience.wiley.com/journalfinder.html",
  URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=13852;
    http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=13852&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal = "Concurrency, practice and experience",
}

@Book{Beveridge:1997:MAW,
  author = "Jim Beveridge and Robert Wiener",
  title = "Multithreading applications in {Win32}: the complete guide to
    threads",
  publisher = pub-AWDP,
  address = pub-AWDP:adr,
  pages = "xviii + 368",
  year = "1997",
  ISBN = "0-201-44234-5 (pb), 0-201-18385-4 (CD-ROM)",
  ISBN-13 = "978-0-201-44234-2 (pb), 978-0-201-18385-6 (CD-ROM)",
  LCCN = "QA76.76.O63 B478 1997",
  bibdate = "Fri Aug 7 09:34:36 MDT 1998",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote = "System requirements: IBM compatible PC; Win32; Windows NT or
    Windows 95; CD-ROM drive.",
  keywords = "Microsoft Win32; Microsoft Windows (Computer file);
    Microsoft Windows NT; Operating systems (Computers)",
}

@Article{Bik:1997:JPJ,
  author = "Aart J. C. Bik and Juan E. Villacis and Dennis B.
Gannon", title = "javar: {A} prototype {Java} restructuring compiler", journal = j-CPE, volume = "9", number = "11", pages = "1181--1191", month = nov, year = "1997", CODEN = "CPEXEI", ISSN = "1040-3108", ISSN-L = "1040-3108", bibdate = "Tue Sep 7 06:06:35 MDT 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", note = "Special Issue: Java for computational science and engineering --- simulation and modeling II.", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=13819; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=13819&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, classification = "C6110J (Object-oriented programming); C6110P (Parallel programming); C6150C (Compilers, interpreters and other processors)", conflocation = "Las Vegas, NV, USA; 21 June 1997", conftitle = "Java for Computational Science and Engineering --- Simulation and Modeling II", corpsource = "Dept. of Comput. Sci., Indiana Univ., Bloomington, IN, USA", fjournal = "Concurrency, practice and experience", keywords = "annotations; explicit parallelism; functionality; implicit parallelism; Java program parallelization; Java restructuring compiler; javar; multi-threading; object-oriented languages; parallelising compilers; prototype; semantic analysis; software prototyping", pubcountry = "UK", sponsororg = "ACM", treatment = "P Practical", } @Article{Bordawekar:1997:EEH, author = "Rajesh Bordawekar and Steven Landherr and Don Capps and Mark Davis", title = "Experimental evaluation of the {Hewlett--Packard} {Exemplar} file system", journal = j-SIGMETRICS, volume = "25", number = "3", pages = "21--28", month = dec, year = "1997", CODEN = "????", DOI = "http://doi.acm.org/10.1145/270900.270904", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:24:50 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", 
abstract = "This article presents results from an experimental evaluation study of the HP Exemplar file system. Our experiments consist of simple micro-benchmarks that study the impact of various factors on the file system performance. These factors include I/O request/buffer sizes, vectored/non-vectored access patterns, read-ahead policies, multi-threaded (temporally irregular) requests, and architectural issues (cache parameters, NUMA behavior, etc.). Experimental results indicate that the Exemplar file system provides high I/O bandwidth, both for single- and multi-threaded applications. The buffer cache, with prioritized buffer management and large buffer sizes, is effective in exploiting temporal and spatial access localities. The performance of non-contiguous accesses can be improved by either using vectored I/O interfaces or tuning the read-ahead facilities. The file system performance depends on the relative locations of the computing threads and the file system, and also on various Exemplar design parameters such as the NUMA architecture, TLB/data cache management and paging policies.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", } @Article{Bramley:1997:TNRb, author = "Randall Bramley", title = "Technology News \& Reviews: {Chemkin} software; {OpenMP Fortran Standard}; {ODE} Toolbox for {Matlab}; {Java} products; {Scientific WorkPlace 3.0}", journal = j-IEEE-COMPUT-SCI-ENG, volume = "4", number = "4", pages = "75--78", month = oct # "\slash " # dec, year = "1997", CODEN = "ISCEE4", ISSN = "1070-9924", ISSN-L = "1070-9924", bibdate = "Sat Jan 9 08:57:23 MST 1999", bibsource = "http://www.computer.org/cse/cs1998; http://www.math.utah.edu/pub/tex/bib/ieeecomputscieng.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/cs/books/cs1997/pdf/c4075.pdf", acknowledgement = ack-nhfb, fjournal = "IEEE Computational Science \& Engineering", remark = "No DOI available: article missing 
from IEEE Xplore database.", } @Book{Butenhof:1997:PPT, author = "David R. Butenhof", title = "Programming with {POSIX} threads", publisher = pub-AW, address = pub-AW:adr, pages = "xviii + 381", year = "1997", ISBN = "0-201-63392-2", ISBN-13 = "978-0-201-63392-4", LCCN = "QA76.76.T55B88 1997", bibdate = "Mon Sep 01 08:53:12 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$31.95", URL = "http://www.amazon.com/exec/obidos/ASIN/0201633922/ref=sim_books/002-4892305-5599452", acknowledgement = ack-nhfb, } @Article{Calcote:1997:TPS, author = "John Calcote", title = "Thread Pools and Server Performance", journal = j-DDJ, volume = "22", number = "7", pages = "60--??", month = jul, year = "1997", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Sat Jun 28 10:43:47 MDT 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Cenciarelli:1997:SMJ, author = "P. Cenciarelli and A. Knapp and B. Reus and M. Wirsing", title = "From sequential to multi-threaded {Java}: An event-based operational semantics", journal = j-LECT-NOTES-COMP-SCI, volume = "1349", pages = "75--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 28 08:51:33 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/java.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Cenciarelli:1997:SMT, author = "P. Cenciarelli and A. Knapp and B. Reus and M. 
Wirsing", title = "From sequential to multi-threaded {Java}: An event-based operational semantics", journal = j-LECT-NOTES-COMP-SCI, volume = "1349", pages = "75--??", year = "1997", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Tue Apr 28 08:51:33 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @InProceedings{Dou:1997:ISV, author = "Yong Dou and Zhengbing Pang and Xingming Zhou", title = "Implementing a software virtual shared memory on {PVM}", crossref = "IEEE:1997:APD", pages = "??--??", year = "1997", bibdate = "Wed Apr 16 06:39:19 MDT 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "C6110P (Parallel programming); C6115 (Programming support); C6120 (File organisation); C6140D (High level languages); C7430 (Computer engineering)", corpsource = "Dept. of Comput. Sci., Changsha Inst. of Technol., Hunan, China", keywords = "distributed; FORTRAN; FORTRAN language; GKD-VSM; memory environments; multithread scheme; parallel programming; parallel programming model; Prefetch and Poststore; programming environments; PVM; shared memory; software overhead; software virtual shared memory; synchronisation; user-level; virtual machines; virtual storage", treatment = "P Practical", } @TechReport{Eickemeyer:1997:EMP, author = "Richard J. Eickemeyer", title = "Evaluation of multithreaded processors and thread-switch policies", type = "Research report", number = "RC 20956 (92759)", institution = "IBM T. J. 
Watson Research Center", address = "Yorktown Heights, NY, USA", pages = "16", day = "18", month = aug, year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper examines the use of coarse-grained multithreading to lessen the negative impact of memory access latencies on the performance of uniprocessor on-line transaction processing systems. It considers the effect of switching threads on cache misses in a two-level cache system. It also examines several different thread-switch policies. The results suggest that multithreading with a small number (3-5) of active threads can significantly improve the performance of such commercial environments.", acknowledgement = ack-nhfb, keywords = "Cache memory; Computer architecture; Threads (Computer programs)", } @Article{Emerson:1997:USW, author = "E. A. Emerson and A. P. Sistla", title = "Utilizing Symmetry when Model-Checking under Fairness Assumptions: An Automata-Theoretic Approach", journal = j-TOPLAS, volume = "19", number = "4", pages = "617--638", month = jul, year = "1997", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Wed Dec 3 16:28:05 MST 1997", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/toplas/1997-19-4/p617-emerson/", abstract = "One useful technique for combating the state explosion problem is to exploit symmetry when performing temporal logic model checking. In previous work it is shown how, using some basic notions of group theory, symmetry may be exploited for the full range of correctness properties expressible in the very expressive temporal logic CTL*. Surprisingly, while fairness properties are readily expressible in CTL*, these methods are not powerful enough to admit any amelioration of state explosion, when fairness assumptions are involved. 
We show that it is nonetheless possible to handle fairness efficiently by trading some group theory for automata theory. Our automata-theoretic approach depends on detecting fair paths subtly encoded in a quotient structure whose arcs are annotated with permutations, by using a threaded structure that reflects coordinate shifts caused by the permutations.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", keywords = "design; languages; theory; verification", subject = "{\bf F.3.1} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Specifying and Verifying and Reasoning about Programs. {\bf F.1.1} Theory of Computation, COMPUTATION BY ABSTRACT DEVICES, Models of Computation. {\bf D.2.4} Software, SOFTWARE ENGINEERING, Software/Program Verification.", } @MastersThesis{Fisher:1997:SPS, author = "Michael T. Fisher", title = "A study of the performance of simultaneous multithreading on a superscalar processor", type = "Thesis (M.S.E.E.)", number = "2363", school = "State University of New York at Binghamton, Watson School of Engineering and Applied Science", address = "Binghamton, NY, USA", pages = "vi + 98", year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Master's theses / State University of New York at Binghamton", acknowledgement = ack-nhfb, alttitle = "Simultaneous multithreading on a superscalar processor Multithreading on a superscalar processor Superscalar processor", keywords = "Microprocessors -- Testing", } @MastersThesis{Fong:1997:BPM, author = "Waipang Fong", title = "Building a preprocessor for a multithreading compiler", type = "Thesis (M.E.E.)", school = "Department of Electrical Engineering, University of Alabama", address = "Tuscaloosa, AL, USA", pages = "ix + 80", year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = 
ack-nhfb, keywords = "Multiprocessors; Parallel processing (Electronic computers)", } @Article{Foster:1997:MMC, author = "Ian Foster and Jonathan Geisler and Carl Kesselman and Steven Tuecke", title = "Managing Multiple Communication Methods in High-Performance Networked Computing Systems", journal = j-J-PAR-DIST-COMP, volume = "40", number = "1", pages = "35--48", day = "10", month = jan, year = "1997", CODEN = "JPDCER", DOI = "http://dx.doi.org/10.1006/jpdc.1996.1266", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:01 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1266/production/ref", acknowledgement = ack-nhfb, classification = "B6150M (Protocols); B6210L (Computer communications); C5440 (Multiprocessing systems); C5470 (Performance evaluation and testing); C5640 (Protocols); C5670 (Network performance)", corpsource = "Div. of Math. and Comput. Sci., Argonne Nat. 
Lab., IL, USA", fjournal = "Journal of Parallel and Distributed Computing", keywords = "Argonne MPICH library; computer networks; computing systems; criteria; heterogeneous networked environment; high-performance networked; message passing; message passing interface; multimethod communication; multiple communication methods; multithreaded runtime system; networked computing environments; Nexus; Nexus-based MPI implementation; performance characteristics; performance evaluation; protocols; remote service request mechanisms; transport mechanisms; user-specified selection", treatment = "P Practical", } @TechReport{Fujita:1997:MPA, author = "Tetsuya Theodore Fujita", title = "A multithreaded processor architecture for parallel symbolic computation", type = "Technical Report", number = "MIT/LCS/TM-338", institution = "Laboratory for Computer Science, Massachusetts Institute of Technology", address = "Cambridge, MA, USA", pages = "71", month = sep, year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Computer architecture; Multilisp (Computer program language); Parallel processing (Electronic computers)", } @PhdThesis{Goldstein:1997:LTC, author = "Seth Copen Goldstein", title = "Lazy threads: compiler and runtime structures for fine-grained parallel programming", type = "Thesis ({Ph.D.})", number = "UCB/CSD-97-975", school = "Computer Science Division, University of California, Berkeley", address = "Berkeley, CA, USA", pages = "xi + 174", year = "1997", LCCN = "TK7885.A1 R46 no.97:975", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Report", acknowledgement = ack-nhfb, } @Article{Gomez:1997:EMU, author = "Juan Carlos Gomez and Vernon Rego and V. S. 
Sunderam", title = "Efficient Multithreaded User-Space Transport for Network Computing: Design and Test of the {TRAP} Protocol", journal = j-J-PAR-DIST-COMP, volume = "40", number = "1", pages = "103--117", day = "10", month = jan, year = "1997", CODEN = "JPDCER", DOI = "http://dx.doi.org/10.1006/jpdc.1996.1269", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:01 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1269/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1269/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1269/production/ref", acknowledgement = ack-nhfb, classification = "B6150M (Protocols); B6210L (Computer communications); C5620 (Computer networks and techniques); C5640 (Protocols); C6150G (Diagnostic, testing, debugging and evaluating systems); C6150N (Distributed systems software)", corpsource = "Dept. of Comput. 
Sci., Purdue Univ., West Lafayette, IN, USA", fjournal = "Journal of Parallel and Distributed Computing", keywords = "communicating; communication; computer networks; computing; computing nodes; efficient multithreaded user-space transport; high-; low-latency; message passing; multithreaded message-passing libraries; network; nodes; performance distributed computing applications; processing; runtime performance; scalability characteristics; software libraries; software performance evaluation; testing; transaction; transaction-oriented protocol; transport protocols; TRAP protocol design; TRAP protocol testing; TRAP-based communication library; user-space protocol", treatment = "P Practical", } @Manual{Haines:1997:DLT, author = "Matthew Haines", title = "On designing lightweight threads for substrate software", number = "201645", publisher = pub-NTIS, address = pub-NTIS:adr, pages = "??", year = "1997", LCCN = "DOC NAS 1.26:201645 mf11", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Shipping list number 98-0847-M.", series = "NASA contractor report", acknowledgement = ack-nhfb, keywords = "operating systems (computers); parallel computers; parallel processing (computers); threads", } @Article{Haines:1997:DPP, author = "Matthew Haines and Piyush Mehrotra and David Cronk", title = "Data-parallel programming in a multithreaded environment", journal = j-SCI-PROG, volume = "6", number = "2", pages = "187--200", month = "Summer", year = "1997", CODEN = "SCIPEV", ISSN = "1058-9244 (print), 1875-919X (electronic)", ISSN-L = "1058-9244", bibdate = "Thu Mar 28 12:27:27 MST 2002", bibsource = "Compendex database; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Scientific Programming", } @Article{Haines:1997:OIA, author = "Matthew Haines", title = "An Open Implementation Analysis and Design for Lightweight Threads", journal = j-SIGPLAN, volume = "32", 
number = "10", pages = "229--242", month = oct, year = "1997", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:39 MST 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Book{Hanson:1997:CII, author = "David R. Hanson", title = "{C} Interfaces and Implementations: Techniques for Creating Reusable Software", publisher = pub-AW, address = pub-AW:adr, pages = "xvii + 519", year = "1997", ISBN = "0-201-49841-3", ISBN-13 = "978-0-201-49841-7", LCCN = "QA76.73.C15H37 1997", bibdate = "Fri Feb 27 16:08:11 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$37.95", series = "Addison-Wesley Professional Computing Series", URL = "http://www.cs.princeton.edu/software/cii/", acknowledgement = ack-nhfb, annote = "Multithreading is discussed in Chapter 20.", } @Article{Hightower:1997:PDD, author = "Lauren Hightower", title = "Publishing Dynamic Data on the {Internet} --- {Allaire's Cold Fusion} is a development tool that provides access (via the {Web}) to any database the {Web} server can access using {ODBC}. {Cold Fusion} runs as a multithreaded {Windows NT} system service and works with any {ODBC-compliant} database", journal = j-DDJ, volume = "22", number = "1", pages = "70--??", month = jan, year = "1997", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Fri Jan 3 06:17:24 MST 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Dr. 
Dobb's Journal of Software Tools", } @Book{Hughes:1997:OOM, author = "Cameron Hughes and Tracey Hughes", title = "Object-oriented multithreading using {C++}", publisher = pub-WILEY, address = pub-WILEY:adr, pages = "xvi + 495", year = "1997", ISBN = "0-471-18012-2 (paperback)", ISBN-13 = "978-0-471-18012-8 (paperback)", LCCN = "QA76.73.C153H84 1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, annote = "System requirements: Windows 95, or OS/2 2.0 and above, or UNIX, or system with POSIX pthreads; ANSI/ISO compliant C++ compiler.", keywords = "C++ (Computer program language); POSIX (Computer software standard); Threads (Computer programs)", } @Article{Kasperink:1997:CDC, author = "Harold R. Kasperink and John C. Dekker", title = "Concurrent Database Commands and {C++}", journal = j-DDJ, volume = "22", number = "8", pages = "84, 86, 88, 89, 98", month = aug, year = "1997", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Sat Aug 23 07:57:02 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Mapping design problems to programming problems leads to software solutions that are easy to extend and reuse. Our authors explain how they resolved multithreaded porting problems using design patterns. The database they use is Oracle and the database transactions are implemented using Oracle ProC as an embedded database command language.", acknowledgement = ack-nhfb, fjournal = "Dr. 
Dobb's Journal of Software Tools", } @MastersThesis{Khosla:1997:MAT, author = "Samir Khosla", title = "Multithreading the asynchronous trigger processor", type = "Thesis (M.S.)", school = "University of Florida", address = "Gainesville, FL, USA", pages = "ix + 57", year = "1997", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Kougiouris:1997:PMF, author = "Panos Kougiouris and Marco Framba", title = "A Portable Multithreading Framework", journal = j-CCCUJ, volume = "15", number = "8", pages = "??--??", month = aug, year = "1997", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Wed Aug 20 10:44:42 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Krieger:1997:HPO, author = "Orran Krieger and Michael Stumm", title = "{HFS}: {A} Performance-Oriented Flexible File System Based on Building-Block Compositions", journal = j-TOCS, volume = "15", number = "3", pages = "286--321", month = aug, year = "1997", CODEN = "ACSYEC", ISSN = "0734-2071", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p286-krieger/", abstract = "The Hurricane File System (HFS) is designed for (potentially large-scale) shared-memory multiprocessors. Its architecture is based on the principle that, in order to maximize performance for applications with diverse requirements, a file system must support a wide variety of file structures, file system policies, and I/O interfaces. Files in HFS are implemented using simple building blocks composed in potentially complex ways. 
This approach yields great flexibility, allowing an application to customize the structure and policies of a file to exactly meet its requirements. As an extreme example, HFS allows a file's structure to be optimized for concurrent random-access write-only operations by 10 threads, something no other file system can do. Similarly, the prefetching, locking, and file cache management policies can all be chosen to match an application's access pattern. In contrast, most parallel file systems support a single file structure and a small set of policies. We have implemented HFS as part of the Hurricane operating system running on the Hector shared-memory multiprocessor. We demonstrate that the flexibility of HFS comes with little processing or I/O overhead. We also show that for a number of file access patterns, HFS is able to deliver to the applications the full I/O bandwidth of the disks on our system.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", keywords = "design; performance", subject = "{\bf D.4.3} Software, OPERATING SYSTEMS, File Systems Management, File organization. {\bf D.4.3} Software, OPERATING SYSTEMS, File Systems Management, Access methods. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance, Measurements. {\bf E.5} Data, FILES, Optimization**. {\bf E.5} Data, FILES, Organization/structure.", } @PhdThesis{Lang:1997:MTE, author = "Duncan Walter Temple Lang", title = "A multi-threaded extension to a high level interactive statistical computing environment", type = "Thesis (Ph.D. in Statistics)", school = "University of California, Berkeley", address = "Berkeley, CA, USA", pages = "vii + 161", month = dec, year = "1997", LCCN = "308t 1997 951", bibdate = "Fri Aug 7 08:29:38 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Dissertations -- Academic -- UCB -- statistics -- 1991--2000; University of California, Berkeley. Dept. 
of Statistics -- Dissertations",
}

@Article{Larbi:1997:BRM,
  author =       "Michael Larbi",
  title =        "Book Review: {Multithreading Applications in Win32}",
  journal =      j-CCCUJ,
  volume =       "15",
  number =       "7",
  pages =        "65--??",
  month =        jul,
  year =         "1997",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Thu Jun 26 14:12:46 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "C/C++ Users Journal",
}

@Article{Leiserson:1997:AAM,
  author =       "C. E. Leiserson",
  title =        "Algorithmic analysis of multithreaded algorithms",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1350",
  pages =        "132--??",
  year =         "1997",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 28 08:51:33 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

%%% The physical-format remark "Printout." is carried in the note field
%%% so that the type field holds only the degree label.
@MastersThesis{Leven:1997:MIR,
  author =       "Peter J. Leven",
  title =        "A multithreaded implementation of a {Robot Control C
                 Library}",
  type =         "Thesis (M.S.)",
  school =       "University of Illinois at Urbana-Champaign",
  address =      "Urbana-Champaign, IL, USA",
  pages =        "x + 72",
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Printout.",
  acknowledgement = ack-nhfb,
}

%%% NOTE(review): Lo:1997:CTL (next entry) and Lo:1997:CTP (the entry
%%% after it) record the same article: same authors, title, journal,
%%% volume 15, number 3, pages 322--354.  Both citation keys are kept
%%% so that documents citing either one continue to resolve; keep the
%%% two records in step when editing.
@Article{Lo:1997:CTL,
  author =       "Jack L. Lo and Joel S. Emer and Henry M. Levy and
                 Rebecca L. Stamm and Dean M.
Tullsen", title = "Converting Thread-Level Parallelism to Instruction-Level Parallelism via Simultaneous Multithreading", journal = j-TOCS, volume = "15", number = "3", pages = "322--354", month = aug, year = "1997", CODEN = "ACSYEC", ISSN = "0734-2071", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p322-lo/", abstract = "To achieve high performance, contemporary computer systems rely on two forms of parallelism: instruction-level parallelism (ILP) and thread-level parallelism (TLP). Wide-issue super-scalar processors exploit ILP by executing multiple instructions from a single program in a single cycle. Multiprocessors (MP) exploit TLP by executing different threads in parallel on different processors. Unfortunately, both parallel processing styles statically partition processor resources, thus preventing them from adapting to dynamically changing levels of ILP and TLP in a program. With insufficient TLP, processors in an MP will be idle; with insufficient ILP, multiple-issue hardware on a superscalar is wasted. This article explores parallel processing on an alternative architecture, simultaneous multithreading (SMT), which allows multiple threads to compete for and share all of the processor's resources every cycle. The most compelling reason for running parallel applications on an SMT processor is its ability to use thread-level parallelism and instruction-level parallelism interchangeably. By permitting multiple threads to share the processor's functional units simultaneously, the processor can use both ILP and TLP to accommodate variations in parallelism. When a program has only a single thread, all of the SMT processor's resources can be dedicated to that thread; when more TLP exists, this parallelism can compensate for a lack of per-thread ILP.
We examine two alternative on-chip parallel architectures for the next generation of processors. We compare SMT and small-scale, on-chip multiprocessors in their ability to exploit both ILP and TLP. First, we identify the hardware bottlenecks that prevent multiprocessors from effectively exploiting ILP. Then, we show that because of its dynamic resource sharing, SMT avoids these inefficiencies and benefits from being able to run more threads on a single processor. The use of TLP is especially advantageous when per-thread ILP is limited. The ease of adding additional thread contexts on an SMT (relative to adding additional processors on an MP) allows simultaneous multithreading to expose more parallelism, further increasing functional unit utilization and attaining a 52\% average speedup (versus a four-processor, single-chip multiprocessor with comparable execution resources). This study also addresses an often-cited concern regarding the use of thread-level parallelism or multithreading: interference in the memory system and branch prediction hardware. We find the multiple threads cause interthread interference in the caches and place greater demands on the memory system, thus increasing average memory latencies. By exploiting threading-level parallelism, however, SMT hides these additional latencies, so that they only have a small impact on total program performance. We also find that for parallel applications, the additional threads have minimal effects on branch prediction.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", keywords = "measurement; performance", subject = "{\bf C.1.2} Computer Systems Organization, PROCESSOR ARCHITECTURES, Multiple Data Stream Architectures (Multiprocessors), Parallel processors**. {\bf C.0} Computer Systems Organization, GENERAL, Instruction set design. {\bf D.4.1} Software, OPERATING SYSTEMS, Process Management.", } @Article{Lo:1997:CTP, author = "Jack L. Lo and Joel S. Emer and Henry M. 
Levy and Rebecca L. Stamm and Dean M. Tullsen", title = "Converting Thread-Level Parallelism to Instruction-Level Parallelism via Simultaneous Multithreading", journal = j-TOCS, volume = "15", number = "3", pages = "322--354", month = aug, year = "1997", CODEN = "ACSYEC", ISSN = "0734-2071", ISSN-L = "0734-2071", bibdate = "Wed Jan 13 18:36:53 MST 1999", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/tocs.bib", URL = "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-3/p322-lo/", abstract = "To achieve high performance, contemporary computer systems rely on two forms of parallelism: instruction-level parallelism (ILP) and thread-level parallelism (TLP). Wide-issue super-scalar processors exploit ILP by executing multiple instructions from a single program in a single cycle. Multiprocessors (MP) exploit TLP by executing different threads in parallel on different processors. Unfortunately, both parallel processing styles statically partition processor resources, thus preventing them from adapting to dynamically changing levels of ILP and TLP in a program. With insufficient TLP, processors in an MP will be idle; with insufficient ILP, multiple-issue hardware on a superscalar is wasted. This article explores parallel processing on an alternative architecture, simultaneous multithreading (SMT), which allows multiple threads to compete for and share all of the processor's resources every cycle. The most compelling reason for running parallel applications on an SMT processor is its ability to use thread-level parallelism and instruction-level parallelism interchangeably. By permitting multiple threads to share the processor's functional units simultaneously, the processor can use both ILP and TLP to accommodate variations in parallelism.
When a program has only a single thread, all of the SMT processor's resources can be dedicated to that thread; when more TLP exists, this parallelism can compensate for a lack of per-thread ILP. We examine two alternative on-chip parallel architectures for the next generation of processors. We compare SMT and small-scale, on-chip multiprocessors in their ability to exploit both ILP and TLP. First, we identify the hardware bottlenecks that prevent multiprocessors from effectively exploiting ILP. Then, we show that because of its dynamic resource sharing, SMT avoids these inefficiencies and benefits from being able to run more threads on a single processor. The use of TLP is especially advantageous when per-thread ILP is limited. The ease of adding additional thread contexts on an SMT (relative to adding additional processors on an MP) allows simultaneous multithreading to expose more parallelism, further increasing functional unit utilization and attaining a 52\% average speedup (versus a four-processor, single-chip multiprocessor with comparable execution resources). This study also addresses an often-cited concern regarding the use of thread-level parallelism or multithreading: interference in the memory system and branch prediction hardware. We find the multiple threads cause interthread interference in the caches and place greater demands on the memory system, thus increasing average memory latencies. By exploiting threading-level parallelism, however, SMT hides these additional latencies, so that they only have a small impact on total program performance. We also find that for parallel applications, the additional threads have minimal effects on branch prediction.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", keywords = "measurement; performance", subject = "{\bf C.1.2} Computer Systems Organization, PROCESSOR ARCHITECTURES, Multiple Data Stream Architectures (Multiprocessors), Parallel processors**. 
{\bf C.0} Computer Systems Organization, GENERAL,
                 Instruction set design. {\bf D.4.1} Software, OPERATING
                 SYSTEMS, Process Management.",
}

@TechReport{LoCocero:1997:MML,
  author =       "Joseph LoCocero and Donald E. Thomas",
  title =        "A multithreaded, multiple language hardware\slash
                 software cosimulator",
  type =         "Research report",
  number =       "CMUCAD-97-13",
  institution =  "Center for Electronic Design Automation, Carnegie
                 Mellon University",
  address =      "Pittsburgh, PA, USA",
  pages =        "7",
  month =        apr,
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Functional verification of mixed hardware/software
                 systems is vital to guaranteeing a correct, operational
                 system. This paper discusses a new multithreaded,
                 multiple-language cosimulator that directly combines
                 Verilog and C/C++, the native languages most often used
                 by hardware and software designers. The interface
                 between the two languages is specified in detail, as
                 are some illustrative examples. The performance is
                 shown to be clearly better than UNIX socket-based
                 cosimulation approaches. Further, it naturally fits a
                 cosimulation environment where arbitrary C++ programs
                 and Verilog descriptions are developed concurrently.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by Semiconductor Research
                 Corporation.",
  keywords =     "C (Computer program language); Embedded computer
                 systems -- Simulation methods; Verilog (Computer
                 hardware description language)",
}

@Article{Mateosian:1997:MNT,
  author =       "R. M.
Mateosian",
  title =        "Micro News: {DARPA} aids {Tera MTA}",
  journal =      j-IEEE-MICRO,
  volume =       "17",
  number =       "5",
  pages =        "5--6",
  month =        sep # "\slash " # oct,
  year =         "1997",
  CODEN =        "IEMIDZ",
  DOI =          "http://dx.doi.org/10.1109/MM.1997.621216",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Thu Dec 14 06:08:58 MST 2000",
  bibsource =    "http://www.computer.org/micro/mi1997/;
                 http://www.math.utah.edu/pub/tex/bib/ieeemicro.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Science Citation Index database (1980--2000)",
  URL =          "http://dlib.computer.org/mi/books/mi1997/pdf/m5005.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
}

@Article{McCarthy:1997:WMT,
  author =       "Martin McCarthy",
  title =        "What is Multi-Threading?",
  journal =      j-LINUX-J,
  volume =       "34",
  pages =        "??--??",
  month =        feb,
  year =         "1997",
  CODEN =        "LIJOFX",
  ISSN =         "1075-3583 (print), 1938-3827 (electronic)",
  ISSN-L =       "1075-3583",
  bibdate =      "Fri Oct 9 08:35:26 MDT 1998",
  bibsource =    "http://noframes.linuxjournal.com/lj-issues/issue34/index.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "A primer on multi-threading: the process whereby Linux
                 manages several tasks simultaneously.",
  acknowledgement = ack-nhfb,
  fjournal =     "Linux journal",
}

@Article{McMillan:1997:NSB,
  author =       "Robert McMillan",
  title =        "News: {Sun} boosts {Java} performance, adding {JIT}
                 compiler and {JVM} with multithreading to {Solaris
                 2.6}",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "7",
  pages =        "??--??",
  month =        jul,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:27 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-speedway.htm",
  acknowledgement = ack-nhfb,
}

@Article{Neves:1997:TRS,
  author =       "Richard Neves and Robert B.
Schnabel",
  title =        "Threaded Runtime Support for Execution of Fine Grain
                 Parallel Code on Coarse Grain Multiprocessors",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "42",
  number =       "2",
  pages =        "128--142",
  day =          "1",
  month =        may,
  year =         "1997",
  CODEN =        "JPDCER",
  DOI =          "http://dx.doi.org/10.1006/jpdc.1997.1322",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:02 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1322/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1322/production/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1322/production/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
}

@Book{Oaks:1997:JT,
  author =       "Scott Oaks and Henry Wong",
  title =        "{Java} threads",
  publisher =    pub-ORA,
  address =      pub-ORA:adr,
  pages =        "xiii + 252",
  year =         "1997",
  ISBN =         "1-56592-216-6",
  ISBN-13 =      "978-1-56592-216-7",
  LCCN =         "QA76.73.J38 O25 1997",
  bibdate =      "Fri May 10 12:18:17 MDT 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Java series",
  acknowledgement = ack-nhfb,
  keywords =     "java (computer program language); threads (computer
                 programs)",
}

@MastersThesis{Ongwattanakul:1997:RDM,
  author =       "Songpol Ongwattanakul",
  title =        "A runtime distributed multithreading library for the
                 {PARC} language",
  type =         "Thesis (M.E.E.)",
  school =       "Department of Electrical Engineering, University of
                 Alabama",
  address =      "Tuscaloosa, AL, USA",
  pages =        "viii + 71",
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Multiprocessors; Parallel processing (Electronic
                 computers)",
}

@Article{Onion:1997:MM,
  author =       "F.
Onion",
  title =        "Multithreading in {MFC}",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "9",
  number =       "3",
  pages =        "50--53, 56",
  month =        mar,
  year =         "1997",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Thu Apr 24 09:46:14 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6115
                 (Programming support); C6150J (Operating systems)",
  fjournal =     "C++ Report",
  keywords =     "API calls; application program interfaces; Internet
                 queries; MFC; multiprogramming; multithreaded
                 programming; object oriented programming;
                 object-oriented programming; remote database hits;
                 software libraries; software tools; threads; user
                 interface; user interfaces; Windows",
  treatment =    "P Practical",
}

@Article{Park:1997:HPM,
  author =       "Sung-Yong Park and Salim Hariri",
  title =        "A High Performance Message Passing System for
                 {Network of Workstations}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "11",
  number =       "2",
  pages =        "159--180",
  month =        oct,
  year =         "1997",
  CODEN =        "JOSUED",
  DOI =          "http://www.springerlink.com/openurl.asp?genre=article&id=doi:10.1023/A:1007912007767",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Jul 6 12:13:07 MDT 2005",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=11&issue=2;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.wkap.nl/issuetoc.htm/0920-8542+11+2+1997",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=11&issue=2&spage=159;
                 http://www.wkap.nl/oasis.htm/149826",
  acknowledgement = ack-nhfb,
  classification = "C5620W (Other computer networks); C6150N
                 (Distributed systems software)",
  corpsource =   "Dept. of Electr. and Comput.
Eng., Syracuse Univ., NY, USA",
  fjournal =     "The Journal of Supercomputing",
  keywords =     "application programming interface; asynchronous
                 transfer mode; ATM; ATM network; device driver;
                 distributed computing; high performance; message
                 passing; message-passing system; multithreaded
                 message-passing system; NCS; network of workstations;
                 NOW environment; NYNET; wide area network; wide area
                 networks",
  pubcountry =   "Netherlands",
  treatment =    "P Practical",
}

@Book{Prasad:1997:MPT,
  author =       "Shashi Prasad",
  title =        "Multithreading programming techniques",
  publisher =    pub-MCGRAW-HILL,
  address =      pub-MCGRAW-HILL:adr,
  pages =        "xix + 410",
  year =         "1997",
  ISBN =         "0-07-912250-7, 0-07-050710-4 (Computer disk)",
  ISBN-13 =      "978-0-07-912250-6, 978-0-07-050710-4 (Computer
                 disk)",
  LCCN =         "QA76.76.D47 P72 1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "The J. Ranade workstation series",
  acknowledgement = ack-nhfb,
  annote =       "System requirements: C programming language.",
  keywords =     "Application software -- Development; C (Computer
                 program language); Cross-platform software
                 development",
}

@Article{Ravoor:1997:MTP,
  author =       "Suresh B. Ravoor and Johnny S. K.
Wong",
  title =        "Multithreaded Transaction Processing in Distributed
                 Systems",
  journal =      j-J-SYST-SOFTW,
  volume =       "38",
  number =       "2",
  pages =        "107--117",
  month =        aug,
  year =         "1997",
  CODEN =        "JSSODM",
  ISSN =         "0164-1212",
  ISSN-L =       "0164-1212",
  bibdate =      "Wed Dec 16 08:24:49 MST 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of systems and software",
}

@Article{Savage:1997:EDD,
  author =       "Stefan Savage and Michael Burrows and Greg Nelson and
                 Patrick Sobalvarro and Thomas Anderson",
  title =        "{Eraser}: {A} Dynamic Data Race Detector for
                 Multithreaded Programs",
  journal =      j-TOCS,
  volume =       "15",
  number =       "4",
  pages =        "391--411",
  month =        nov,
  year =         "1997",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071",
  ISSN-L =       "0734-2071",
  bibdate =      "Wed Jan 13 18:36:53 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Co-published in {\em Operating Systems Review}, {\bf
                 31}(5).",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tocs/1997-15-4/p391-savage/",
  abstract =     "Multithreaded programming is difficult and error
                 prone. It is easy to make a mistake in synchronization
                 that produces a data race, yet it can be extremely hard
                 to locate this mistake during debugging. This article
                 describes a new tool, called Eraser, for dynamically
                 detecting data races in lock-based multithreaded
                 programs. Eraser uses binary rewriting techniques to
                 monitor every shared-memory reference and verify that
                 consistent locking behavior is observed. We present
                 several case studies, including undergraduate
                 coursework and a multithreaded Web search engine, that
                 demonstrate the effectiveness of this approach.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
  keywords =     "algorithms; experimentation; reliability",
  subject =      "{\bf D.2.5} Software, SOFTWARE ENGINEERING, Testing
                 and Debugging, Monitors.
{\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent
                 Programming, Parallel programming. {\bf D.2.5}
                 Software, SOFTWARE ENGINEERING, Testing and Debugging,
                 Debugging aids. {\bf D.2.5} Software, SOFTWARE
                 ENGINEERING, Testing and Debugging, Tracing. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Concurrency. {\bf D.4.1} Software, OPERATING SYSTEMS,
                 Process Management, Deadlocks. {\bf D.4.1} Software,
                 OPERATING SYSTEMS, Process Management,
                 Multiprocessing/multiprogramming/multitasking. {\bf
                 D.4.1} Software, OPERATING SYSTEMS, Process Management,
                 Mutual exclusion.",
}

@Article{Shepherd:1997:UCA,
  author =       "George Shepherd and Scot Wingo",
  title =        "Undocumented Corner: {ATL} and the {IUnknown}
                 Interface",
  journal =      j-DDJ,
  volume =       "22",
  number =       "8",
  pages =        "119--123",
  month =        aug,
  year =         "1997",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Mon Aug 11 11:38:10 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "George and Scot continue their examination of
                 Microsoft's Active Template Library, this month looking
                 at the heart of ATL, including its support for
                 multithreading and its various implementations of
                 IUnknown.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Shepherd:1997:UCAc,
  author =       "George Shepherd and Scot Wingo",
  title =        "Undocumented Corner: {ATL} and the {IUnknown}
                 Interface",
  journal =      j-DDJ,
  volume =       "22",
  number =       "8",
  pages =        "119--123",
  month =        aug,
  year =         "1997",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Mon Aug 11 11:38:10 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "George and Scot continue their examination of
                 Microsoft's Active Template Library, this month looking
                 at the heart of ATL, including its support for
                 multithreading and its various implementations of
                 IUnknown.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr.
Dobb's Journal of Software Tools",
}

@Article{Shoffner:1997:JSSa,
  author =       "Michael Shoffner",
  title =        "{Java} Step by Step: Write your own threaded
                 discussion forum",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "2",
  pages =        "??--??",
  month =        feb,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:24 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-02-1997/jw-02-step.htm",
  acknowledgement = ack-nhfb,
}

@Article{Shoffner:1997:JSSb,
  author =       "Michael Shoffner",
  title =        "{Java} Step By Step: Write your own threaded
                 discussion forum: The communications and server
                 components",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "3",
  pages =        "??--??",
  month =        mar,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:25 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-03-1997/jw-03-step.htm",
  acknowledgement = ack-nhfb,
}

@Article{Sime:1997:GPM,
  author =       "J.
Sime",
  title =        "Guarded pointers: moving smart pointers into
                 multithreaded systems",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "9",
  number =       "4",
  pages =        "32--41",
  month =        apr,
  year =         "1997",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Thu Apr 24 09:46:14 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110J (Object-oriented programming); C6120 (File
                 organisation); C6130 (Data handling techniques); C6150N
                 (Distributed systems software)",
  fjournal =     "C++ Report",
  keywords =     "abstract data types; C listings; concurrency control;
                 concurrency control pattern; data integrity; exception
                 handling; guarded pointers; multiprogramming;
                 multithreaded systems; object-oriented programming;
                 protected data resource; protection proxy pattern;
                 reference count lock; safety; smart pointers; thread
                 safety mechanisms",
  treatment =    "P Practical",
}

@Article{Sodan:1997:ENN,
  author =       "Angela Sodan and Guang R. Gao and Olivier Maquelin and
                 Jens-Uwe Schultz and Xin-Min Tian",
  title =        "Experiences with non-numeric applications on
                 multithreaded architectures",
  journal =      j-SIGPLAN,
  volume =       "32",
  number =       "7",
  pages =        "124--135",
  month =        jul,
  year =         "1997",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:35 MST 2003",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Sohn:1997:DWD,
  author =       "Andrew Sohn and Mitsuhisa Sato and Namhoon Yoo and
                 Jean-Luc Gaudiot",
  title =        "Data and Workload Distribution in a Multithreaded
                 Architecture",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "40",
  number =       "2",
  pages =        "256--264",
  day =          "1",
  month =        feb,
  year =         "1997",
  CODEN =        "JPDCER",
  DOI =          "http://dx.doi.org/10.1006/jpdc.1996.1262",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar
9 09:19:02 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1262/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1262/production/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1996.1262/production/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
}

@Article{Stewart:1997:MDH,
  author =       "David B. Stewart and Pradeep K. Khosla",
  title =        "Mechanisms for Detecting and Handling Timing Errors",
  journal =      j-CACM,
  volume =       "40",
  number =       "1",
  pages =        "87--93",
  month =        jan,
  year =         "1997",
  CODEN =        "CACMA2",
  ISSN =         "0001-0782 (print), 1557-7317 (electronic)",
  ISSN-L =       "0001-0782",
  bibdate =      "Fri Oct 10 18:17:54 MDT 1997",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/citations/journals/cacm/1997-40-1/p87-stewart/",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6130
                 (Data handling techniques); C6150J (Operating systems)",
  corpsource =   "Inst. for Adv. Comput. Studies, Maryland Univ.,
                 College Park, MD, USA",
  fjournal =     "Communications of the ACM",
  keywords =     "adaptive real-time scheduling; aperiodic servers;
                 Chimera; design; error handling; imprecise computation;
                 low-overhead policy-independent system; management;
                 operating systems (computers); performance; periodic
                 threads; real-time operating system; real-time systems;
                 real-time systems analysis; real-time threads;
                 reliability; scheduling; scheduling policies; software
                 fault tolerance; specifications; system failure;
                 theory; timing; timing error detection; worst-case
                 execution times",
  subject =      "{\bf K.6.3} Computing Milieux, MANAGEMENT OF COMPUTING
                 AND INFORMATION SYSTEMS, Software Management, Software
                 development.
{\bf C.3} Computer Systems Organization, SPECIAL-PURPOSE
                 AND APPLICATION-BASED SYSTEMS, Real-time systems. {\bf
                 C.4} Computer Systems Organization, PERFORMANCE OF
                 SYSTEMS.",
  treatment =    "P Practical",
}

@Article{Taura:1997:FGM,
  author =       "Kenjiro Taura and Akinori Yonezawa",
  title =        "Fine-grain Multithreading with Minimal Compiler
                 Support --- {A} Cost Effective Approach to Implementing
                 Efficient Multithreading Languages",
  journal =      j-SIGPLAN,
  volume =       "32",
  number =       "5",
  pages =        "320--333",
  month =        may,
  year =         "1997",
  CODEN =        "SINODQ",
  ISBN =         "0-89791-907-6",
  ISBN-13 =      "978-0-89791-907-4",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu May 13 12:37:28 MDT 1999",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/pldi/258915/index.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/pldi/258915/p320-taura/",
  acknowledgement = ack-nhfb,
  annote =       "Published as part of the Proceedings of PLDI'97.",
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "design; languages; measurement; performance;
                 standardization; theory",
  subject =      "{\bf D.3.4} Software, PROGRAMMING LANGUAGES,
                 Processors, Compilers. {\bf D.3.3} Software,
                 PROGRAMMING LANGUAGES, Language Constructs and
                 Features, Data types and structures. {\bf D.3.2}
                 Software, PROGRAMMING LANGUAGES, Language
                 Classifications. {\bf D.3.4} Software, PROGRAMMING
                 LANGUAGES, Processors, Code generation. {\bf C.2.2}
                 Computer Systems Organization, COMPUTER-COMMUNICATION
                 NETWORKS, Network Protocols.",
}

@PhdThesis{TempleLang:1997:MTE,
  author =       "Duncan Walter {Temple Lang}",
  title =        "A multi-threaded extension to a high level interactive
                 statistical computing environment",
  type =         "Thesis ({Ph.D. in Statistics})",
  school =       "Dept.
of Statistics, University of California, Berkeley",
  address =      "Berkeley, CA, USA",
  pages =        "vii + 161",
  month =        dec,
  year =         "1997",
  bibdate =      "Sat Apr 20 11:15:46 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Thompson:1997:THP,
  author =       "P. Thompson and G. Bumgardner",
  title =        "{Threads.h++}: a portable {C++} library for
                 multithreaded programming",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "9",
  number =       "3",
  pages =        "24--37",
  month =        mar,
  year =         "1997",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Thu Apr 24 09:46:14 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110J
                 (Object-oriented programming); C6115 (Programming
                 support); C6150J (Operating systems)",
  fjournal =     "C++ Report",
  keywords =     "application development; application program
                 interfaces; C language; low-level procedural API;
                 multiprocessor machines; multiprogramming;
                 multithreaded programming; object-oriented
                 abstractions; object-oriented languages;
                 object-oriented programming; operating systems;
                 portable C++ library; responsive performance; software
                 libraries; software portability; synchronisation;
                 synchronization; thread control; thread creation;
                 Threads.h++; Web browsers",
  treatment =    "P Practical",
}

@Article{Thompson:1997:TPC,
  author =       "P. Thompson and G.
Bumgardner",
  title =        "{Threads.h++}: a portable {C++} library for
                 multithreaded programming",
  journal =      j-C-PLUS-PLUS-REPORT,
  volume =       "9",
  number =       "3",
  pages =        "24--37",
  month =        mar,
  year =         "1997",
  CODEN =        "CRPTE7",
  ISSN =         "1040-6042",
  bibdate =      "Thu Apr 24 09:46:14 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  classification = "C6110B (Software engineering techniques); C6110J
                 (Object-oriented programming); C6115 (Programming
                 support); C6150J (Operating systems)",
  fjournal =     "C++ Report",
  keywords =     "application development; application program
                 interfaces; C language; low-level procedural API;
                 multiprocessor machines; multiprogramming;
                 multithreaded programming; object-oriented
                 abstractions; object-oriented languages;
                 object-oriented programming; operating systems;
                 portable C++ library; responsive performance; software
                 libraries; software portability; synchronisation;
                 synchronization; thread control; thread creation;
                 Threads.h++; Web browsers",
  treatment =    "P Practical",
}

@TechReport{Tsai:1997:PSC,
  author =       "Jenn-Yuan Tsai",
  title =        "Performance study of a concurrent multithreaded
                 processor",
  type =         "Technical report",
  number =       "TR 97-034",
  institution =  "University of Minnesota, Dept. of Computer Science and
                 Engineering",
  address =      "Minneapolis, MN, USA",
  pages =        "24",
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The performance of a concurrent multithreaded
                 architectural model, called superthreading [15], is
                 studied in this paper. It tries to integrate optimizing
                 compilation techniques and run-time hardware support to
                 exploit both thread-level and instruction-level
                 parallelism, as opposed to exploit only
                 instruction-level parallelism in existing
                 superscalars.
The superthreaded architecture uses a thread pipelining
                 execution model to enhance the overlapping between
                 threads, and to facilitate data dependence enforcement
                 between threads through compiler-directed,
                 hardware-supported, thread-level control speculation
                 and run-time data dependence checking. We also evaluate
                 the performance of the superthreaded processor through
                 a detailed trace-driven simulator. Our results show
                 that the superthreaded execution model can obtain good
                 performance by exploiting both thread-level and
                 instruction-level parallelism in programs. We also
                 study the design parameters of its main system
                 components, such as the size of the memory buffer, the
                 bandwidth requirement of the communication links
                 between thread processing units, and the bandwidth
                 requirement of the shared data cache.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by the National Science Foundation.
                 Supported in part by the U.S. Army Intelligence Center
                 and Fort Huachuca. Supported in part by a gift from
                 Intel Corporation",
  keywords =     "Compilers (Computer programs); Computer architecture;
                 Parallel processing (Electronic computers); Threads
                 (Computer programs)",
}

@TechReport{Tsai:1997:SIC,
  author =       "Jenn-Yuan Tsai",
  title =        "Superthreading: integrating compilation technology and
                 processor architecture for cost-effective concurrent
                 multithreading",
  type =         "Technical report",
  number =       "TR 97-033",
  institution =  "University of Minnesota, Dept. of Computer Science and
                 Engineering",
  address =      "Minneapolis, MN, USA",
  pages =        "16",
  day =          "29",
  month =        jan,
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "As the number of transistors that can be integrated on
                 a single chip continues to grow, it is important for
                 computer architects to think beyond the traditional
                 approaches of deeper pipelines and wider instruction
                 issue units for improving performance.
This single-threaded execution model limits these
                 approaches to exploiting only the relatively small
                 amount of instruction-level parallelism available in
                 application programs. While integrating an entire
                 multiprocessor onto a single chip is feasible, this
                 architecture is limited to exploiting only relatively
                 coarse-grained heavy-weight parallelism. We propose the
                 superthreaded architecture as an excellent alternative
                 for utilizing the large number of transistors that will
                 become available on a single high-density chip. As a
                 hybrid of a wide-issue superscalar processor and a
                 multiprocessor-on-a-chip, this new concurrent
                 multithreading architecture can leverage the best of
                 existing and future parallel hardware and software
                 technologies. By incorporating speculation for control
                 dependences and run-time checking of data dependences,
                 the superthreaded architecture can exploit the multiple
                 granularities of parallelism available in
                 general-purpose application programs to reduce the
                 execution time of a single program.",
  acknowledgement = ack-nhfb,
  annote =       "Supported in part by the U.S. Army Intelligence Center
                 and Fort Huachuca. Supported in part by the National
                 Science Foundation.
Supported in part by a gift from the Intel Corporation",
  keywords =     "Compilers (Computer programs); Computer architecture;
                 Parallel processing (Electronic computers); Threads
                 (Computer programs)",
}

@Article{Vanhelsuwe:1997:BRJ,
  author =       "Laurence Vanhelsuw{\'e}",
  title =        "Book Review: The {Java} {Threads} {API} makes it to
                 print media",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "7",
  pages =        "??--??",
  month =        jul,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:27 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-threads.htm",
  acknowledgement = ack-nhfb,
}

@Article{Vanhelsuwe:1997:JPE,
  author =       "Laurence Vanhelsuw{\'e}",
  title =        "{JavaBeans}: properties, events, and thread safety",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "9",
  pages =        "??--??",
  month =        sep,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:28 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-09-1997/jw-09-raceconditions.htm",
  acknowledgement = ack-nhfb,
}

@Article{Venners:1997:UHH,
  author =       "Bill Venners",
  title =        "Under the Hood: How the {Java} virtual machine
                 performs thread synchronization",
  journal =      j-JAVAWORLD,
  volume =       "2",
  number =       "7",
  pages =        "??--??",
  month =        jul,
  year =         "1997",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Thu Aug 13 14:52:27 1998",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-07-1997/jw-07-hood.htm",
  acknowledgement = ack-nhfb,
}

@Article{Vermeulen:1997:JDW,
  author =       "Alain Vermeulen",
  title =        "{Java} Deadlock: The woes of multithreaded design",
  journal =      j-DDJ,
  volume =       "22",
  number =       "9",
  pages =        "52, 54--56, 88, 89",
  month =        sep,
  year =         "1997",
  CODEN =        "DDJOEB",
ISSN =         "1044-789X",
  bibdate =      "Mon Aug 11 12:53:44 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Whittaker:1997:TML,
  author =       "Steve Whittaker and Jerry Swanson and Jakov Kucan and
                 Candy Sidner",
  title =        "{TeleNotes}: managing lightweight interactions in the
                 desktop",
  journal =      j-TOCHI,
  volume =       "4",
  number =       "2",
  pages =        "137--168",
  month =        jun,
  year =         "1997",
  CODEN =        "ATCIF4",
  ISSN =         "1073-0516",
  ISSN-L =       "1073-0516",
  bibdate =      "Tue Jan 19 05:49:17 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tochi/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/tochi/1997-4-2/p137-whittaker/",
  abstract =     "Communication theories and technology have tended to
                 focus on extended, formal meetings and have neglected a
                 prevalent and vital form of workplace communication ---
                 namely, lightweight communication. Unlike formal,
                 extended meetings, lightweight interaction is brief,
                 informal, unplanned, and intermittent. We analyze
                 naturalistic data from a study of work-place
                 communication and derive five design criteria for
                 lightweight interaction systems. These criteria require
                 that systems for lightweight interaction support {\em
                 conversational tracking, rapid connection}, the ability
                 to {\em leave a message, context management,} and {\em
                 shared real-time objects}. Using these criteria, we
                 evaluate existing interpersonal communications
                 technologies. We then describe an implementation of a
                 system (TeleNotes) that is designed to support
                 lightweight interaction by meeting these criteria. The
                 interface metaphor allows communications to be based
                 around desktop objects, resembling ``sticky notes.''
                 These objects are also organized into ``desktop piles''
                 to support conversational threads and provide
                 mechanisms for initiating real-time audio, video, and
                 application sharing.
We conducted informal user testing of several system
                 prototypes. Based on our findings, outstanding issues
                 concerning theory and systems design for communication
                 systems are outlined --- in particular, with regard to
                 the issue of managing conversations over time.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer-Human Interaction",
  keywords =     "human factors",
  subject =      "{\bf H.5.3} Information Systems, INFORMATION
                 INTERFACES AND PRESENTATION, Group and Organization
                 Interfaces, Evaluation/methodology. {\bf H.1.2}
                 Information Systems, MODELS AND PRINCIPLES,
                 User/Machine Systems, Human factors. {\bf H.5.3}
                 Information Systems, INFORMATION INTERFACES AND
                 PRESENTATION, Group and Organization Interfaces,
                 Asynchronous interaction. {\bf I.3.6} Computing
                 Methodologies, COMPUTER GRAPHICS, Methodology and
                 Techniques, Interaction techniques. {\bf H.5.3}
                 Information Systems, INFORMATION INTERFACES AND
                 PRESENTATION, Group and Organization Interfaces,
                 Synchronous interaction. {\bf H.5.1} Information
                 Systems, INFORMATION INTERFACES AND PRESENTATION,
                 Multimedia Information Systems,
                 Evaluation/methodology.",
}

@MastersThesis{Yang:1997:MUA,
  author =       "Chia Wei Yang",
  title =        "A multi-context uniprocessor: another multithreaded
                 architecture",
  type =         "Thesis (M.S.)",
  school =       "California Polytechnic State University",
  address =      "San Luis Obispo, CA, USA",
  pages =        "viii + 129",
  year =         "1997",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "Proposes a computer architecture model that adapts all
                 advantages from multithreaded models to a uniprocessor
                 environment.",
  keywords =     "Computer architecture; Multiprocessors; Parallel
                 processing (Electronic computers)",
}

@Book{Adamo:1998:MTO,
  author =       "Jean-Marc Adamo",
  title =        "Multi-threaded object-oriented {MPI}-based message
                 passing interface: the {ARCH} library",
  volume =       "SECS 446",
  publisher =    pub-KLUWER,
  address =
pub-KLUWER:adr,
  pages =        "xiv + 185",
  year =         "1998",
  ISBN =         "0-7923-8165-3",
  ISBN-13 =      "978-0-7923-8165-5",
  LCCN =         "TK5102.5.A293 1998",
  bibdate =      "Fri Aug 7 08:29:38 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price =        "US\$120.00",
  series =       "The Kluwer international series in engineering and
                 computer science",
  acknowledgement = ack-nhfb,
  keywords =     "data transmission systems; object-oriented programming
                 (computer science); threads (computer programs)",
  libnote =      "Not yet in my library.",
}

@Article{Anonymous:1998:NTS,
  author =       "Anonymous",
  title =        "New Tools: Software Development: {Uniscape}'s
                 Internationalization Library; {Global Technologies}'
                 {Unix-to-NT} Solution; {KAI}'s Multithreaded {Java}
                 Debugging Tool; {Price Systems}' Parametric Forecasting
                 Tool",
  journal =      j-COMPUTER,
  volume =       "31",
  number =       "6",
  pages =        "98, 102",
  month =        jun,
  year =         "1998",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Thu Jun 4 08:22:02 MDT 1998",
  bibsource =    "http://computer.org/computer/co1998/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/co/books/co1998/pdf/r6098.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
}

@Article{Bangs:1998:BOS,
  author =       "Gaurav Banga and Peter Druschel and Jeffrey C. Mogul",
  title =        "Better operating system features for faster network
                 servers",
  journal =      j-SIGMETRICS,
  volume =       "26",
  number =       "3",
  pages =        "23--30",
  month =        dec,
  year =         "1998",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/306225.306234",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 11:27:29 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Widely-used operating systems provide inadequate
                 support for large-scale Internet server applications.
Their algorithms and interfaces fail to efficiently support either event-driven or multi-threaded servers. They provide poor control over the scheduling and management of machine resources, making it difficult to provide robust and controlled service. We propose new UNIX interfaces to improve scalability, and to provide fine-grained scheduling and resource management.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", } @Article{Biagioni:1998:SST, author = "Edoardo Biagioni and Ken Cline and Peter Lee and Chris Okasaki and Chris Stone", title = "Safe-for-Space Threads in {Standard ML}", journal = j-HIGHER-ORDER-SYMB-COMPUT, volume = "11", number = "2", pages = "209--225", month = dec, year = "1998", CODEN = "LSCOEX", DOI = "http://www.springerlink.com/openurl.asp?genre=article&id=doi:10.1023/A:1010016600604", ISSN = "1388-3690 (print), 2212-0793 (electronic)", ISSN-L = "1388-3690", bibdate = "Wed Jul 6 15:50:28 MDT 2005", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1388-3690&volume=11&issue=2; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.wkap.nl/issuetoc.htm/1388-3690+11+2+1998; OCLC Contents1st database", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=1388-3690&volume=11&issue=2&spage=209; http://www.wkap.nl/oasis.htm/187569", acknowledgement = ack-nhfb, fjournal = "Higher-Order and Symbolic Computation", } @TechReport{Bic:1998:MAD, author = "Lubomir Bic and Michael B. 
Dillencourt and Munehiro Fukuda", title = "Mobile agents, {DSM}, coordination, and self-migrating threads: a common framework", type = "UCI-ICS technical report", number = "98-33", institution = "Information and Computer Science, University of California, Irvine", address = "Irvine, CA", pages = "11", day = "8", month = oct, year = "1998", LCCN = "Z699 .C3 no.98-33", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "distributed shared memory; intelligent agents (computer software)", } @Article{Blumofe:1998:SES, author = "Robert D. Blumofe and Charles E. Leiserson", title = "Space-Efficient Scheduling of Multithreaded Computations", journal = j-SIAM-J-COMPUT, volume = "27", number = "1", pages = "202--229", month = feb, year = "1998", CODEN = "SMJCAT", ISSN = "0097-5397 (print), 1095-7111 (electronic)", ISSN-L = "0097-5397", bibdate = "Sat Dec 5 17:26:53 MST 1998", bibsource = "http://epubs.siam.org/sam-bin/dbq/toclist/SICOMP/27/1; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://epubs.siam.org/sam-bin/dbq/article/25947", acknowledgement = ack-nhfb, fjournal = "SIAM Journal on Computing", } @InProceedings{Caromel:1998:JFS, author = "Denis Caromel and Julien Vayssiere", title = "A {Java} Framework for Seamless Sequential, Multi-threaded, and Distributed Programming", crossref = "ACM:1998:AWJ", pages = "??--??", year = "1998", bibdate = "Thu Apr 27 10:43:08 2000", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.cs.ucsb.edu/conferences/java98/papers/javapp.pdf; http://www.cs.ucsb.edu/conferences/java98/papers/javapp.ps", acknowledgement = ack-nhfb, } @Article{Chapman:1998:OHI, author = "B. Chapman and P. 
Mehrotra", title = "{OpenMP} and {HPF}: Integrating Two Paradigms", journal = j-LECT-NOTES-COMP-SCI, volume = "1470", pages = "650--??", year = "1998", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Oct 10 14:40:24 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Book{Cohen:1998:WMP, author = "Aaron Cohen and Mike Woodring", title = "{Win32} Multithreaded Programming", publisher = pub-ORA, address = pub-ORA:adr, pages = "xv + 705", year = "1998", ISBN = "1-56592-296-4", ISBN-13 = "978-1-56592-296-9", LCCN = "QA76.76.O63 C633 1998", bibdate = "Fri Aug 7 08:29:38 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$39.95", URL = "http://www.ora.com/catalog/multithread/; http://www.oreilly.com/catalog/multithread", acknowledgement = ack-nhfb, keywords = "Microsoft Win32; Microsoft Windows (Computer file); Operating systems (Computers)", } @Article{Criscolo:1998:JQ, author = "Mike Criscolo", title = "{Java Q\&A}: How Do {I} Queue {Java} Threads?", journal = j-DDJ, volume = "23", number = "10", pages = "127--129", month = oct, year = "1998", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Fri Sep 11 09:12:05 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/1998/1998_10/jqa108.txt; http://www.ddj.com/ftp/1998/1998_10/jqa108.zip", abstract = "In examining queuing techniques in Java, Mike presents one approach to multithreading he has implemented, and examines the differences between centralized- and distributed-queuing models. Additional resources include jqa108.txt (listings) and jqa108.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. 
Dobb's Journal of Software Tools",
}

%%% NOTE: Criscolo:1998:JQH records the same article as Criscolo:1998:JQ;
%%% the two entries differ only in how the title spells the column name.
%%% Both tags are kept so that existing citations of either one resolve.
@Article{Criscolo:1998:JQH,
  author =       "Mike Criscolo",
  title =        "{Java Q and A}: How Do {I} Queue {Java} Threads?",
  journal =      j-DDJ,
  volume =       "23",
  number =       "10",
  pages =        "127--129",
  month =        oct,
  year =         "1998",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Fri Sep 11 09:12:05 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/1998/1998_10/jqa108.txt;
                 http://www.ddj.com/ftp/1998/1998_10/jqa108.zip",
  abstract =     "In examining queuing techniques in Java, Mike presents
                 one approach to multithreading he has implemented, and
                 examines the differences between centralized- and
                 distributed-queuing models. Additional resources
                 include jqa108.txt (listings) and jqa108.zip (source
                 code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Cromwell:1998:PBD,
  author =       "Jeff Cromwell",
  title =        "Programmer's Bookshelf: The Dawning of the Age of
                 Multithreading",
  journal =      j-DDJ,
  volume =       "23",
  number =       "9",
  pages =        "127, 129",
  month =        sep,
  year =         "1998",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Wed Aug 05 10:12:23 1998",
  bibsource =    "http://www.ddj.com/ddj/1998/1998_09/index.htm;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Jeff's focus this month is multithreading, as he
                 examines {\em Multithreading Programming Techniques in
                 Win32}, by Jim Beveridge and R. Wiener, {\em
                 Object-Oriented Multithreading Using C++}, by Cameron
                 and Tracy Hughes, and {\em Multithreading Programming
                 Techniques}, by Shashi Prasad.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr.
Dobb's Journal of Software Tools", } @Article{Dagum:1998:OIS, author = "Leonardo Dagum and Ramesh Menon", title = "{OpenMP}: An Industry-Standard {API} for Shared-Memory Programming", journal = j-IEEE-COMPUT-SCI-ENG, volume = "5", number = "1", pages = "46--55", month = jan # "\slash " # mar, year = "1998", CODEN = "ISCEE4", DOI = "http://dx.doi.org/10.1109/99.660313", ISSN = "1070-9924", ISSN-L = "1070-9924", bibdate = "Sat Jan 9 08:57:23 MST 1999", bibsource = "http://www.computer.org/cse/cs1998; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/cs/books/cs1998/pdf/c1046.pdf; http://www.computer.org/cse/cs1998/c1046abs.htm", acknowledgement = ack-nhfb, fjournal = "IEEE Computational Science \& Engineering", } @Article{DeRusso:1998:MEH, author = "Joe {DeRusso, III} and Peter Haggar", title = "Multithreaded Exception Handling in {Java}", journal = j-JAVA-REPORT, volume = "3", number = "??", pages = "??--??", month = aug, year = "1998", CODEN = "JREPFI", ISSN = "1086-4660", bibdate = "Sat Dec 26 13:52:53 1998", bibsource = "http://archive.javareport.com/9808/html/from_pages/index.shtml; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://archive.javareport.com/9808/html/from_pages/ftp_feature.shtml", abstract = "Introducing new classes and interfaces to be used when writing multithreaded Java programs. 
These classes are small, easy to use, and effectively enable you to
                 handle exceptions occurring on secondary threads.",
  acknowledgement = ack-nhfb,
}

%%% NOTE: The month in Dyer:1998:CAS is taken from the issue number and
%%% the jw-10-1998 component of the entry's URL.
@Article{Dyer:1998:CAS,
  author =       "Dave Dyer",
  title =        "Can {Assure} save {Java} from the perils of
                 multithreading?",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "10",
  pages =        "??--??",
  month =        oct,
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Mon Jan 4 06:11:43 MST 1999",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.javaworld.com/javaworld/jw-10-1998/jw-10-assure.htm",
  acknowledgement = ack-nhfb,
}

@Article{Frigo:1998:ICM,
  author =       "Matteo Frigo and Charles E. Leiserson and Keith H.
                 Randall",
  title =        "The Implementation of the {Cilk-5} Multithreaded
                 Language",
  journal =      j-SIGPLAN,
  volume =       "33",
  number =       "5",
  pages =        "212--223",
  month =        may,
  year =         "1998",
  CODEN =        "SINODQ",
  ISBN =         "0-89791-987-4",
  ISBN-13 =      "978-0-89791-987-6",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sun Dec 14 09:17:47 MST 2003",
  bibsource =    "http://portal.acm.org/;
                 http://www.acm.org/pubs/contents/proceedings/pldi/277650/index.html;
                 http://www.cs.virginia.edu/pldi98/program.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/proceedings/pldi/277650/p212-frigo/",
  acknowledgement = ack-nhfb,
  annote =       "Published as part of the Proceedings of PLDI'98.",
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "algorithms; languages; performance",
  subject =      "{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language
                 Classifications, Concurrent, distributed, and parallel
                 languages. {\bf D.1.3} Software, PROGRAMMING
                 TECHNIQUES, Concurrent Programming, Parallel
                 programming. {\bf D.3.3} Software, PROGRAMMING
                 LANGUAGES, Language Constructs and Features, Control
                 structures.
{\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications, C.", } @Article{Geary:1998:SM, author = "David Geary", title = "{Swing} and multithreading", journal = j-JAVA-REPORT, volume = "3", number = "??", pages = "??--??", month = nov, year = "1998", CODEN = "JREPFI", ISSN = "1086-4660", bibdate = "Sat Dec 26 13:52:53 1998", bibsource = "http://archive.javareport.com/9811/html/from_pages/index.shtml; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://archive.javareport.com/9811/html/from_pages/ftp_col1.shtml", abstract = "Read about why Swing is not thread-safe and the ramifications of a single-threaded design for developers using Swing.", acknowledgement = ack-nhfb, } @Article{Golla:1998:CEB, author = "Prasad N. Golla and Eric C. Lin", title = "A comparison of the effect of branch prediction on multithreaded and scalar architectures", journal = j-COMP-ARCH-NEWS, volume = "26", number = "4", pages = "3--11", month = sep, year = "1998", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1216475.1216476", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 12:06:40 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Speculative instructions execution requires dynamic branch predictors to increase the performance of a processor by executing from predicted branch target routines. Conventional Scalar architectures such as the Superscalar or Multiscalar architecture executes from a single stream, while a Multithreaded architecture executes from multiple streams at a time. Several aggressive branch predictors have been proposed with high prediction accuracies. Unfortunately, none of the branch predictors can provide 100\% accuracy. Therefore, there is an inherent limitation on speculative execution in real implementation. 
In this paper, we show that Multithreaded architecture is a better candidate for utilizing speculative execution than Scalar architectures. Generally the branch prediction performance degradation is compounded for larger window sizes on Scalar architectures, while for a Multithreaded architecture, by increasing the number of executing threads, we could sustain a higher performance for a large aggregated speculative window size. Hence, heavier workloads may increase performance and utilization for Multithreaded architectures. We present analytical and simulation results to support our argument.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @TechReport{Golla:1998:CMR, author = "Prasad N. Golla and Eric C. Lin", title = "Cache memory requirements for multithreaded uniprocessor architecture", type = "Technical paper", number = "98-CSE-03", institution = "Dept. of Computer Science and Engineering, Southern Methodist University", address = "Dallas, TX, USA", pages = "32", year = "1998", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Gomez:1998:CAM, author = "J. C. Gomez and E. Mascarenhas and V. 
Rego", title = "The {CLAM} Approach to Multithreaded Communication on Shared Memory Multiprocessors: Design and Experiments", journal = j-IEEE-TRANS-PAR-DIST-SYS, volume = "9", number = "1", pages = "36--49", month = jan, year = "1998", CODEN = "ITDSEO", DOI = "http://dx.doi.org/10.1109/71.655241", ISSN = "1045-9219 (print), 1558-2183 (electronic)", ISSN-L = "1045-9219", bibdate = "Fri Nov 6 12:31:15 MST 1998", bibsource = "http://www.computer.org/tpds/td1998/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/td/books/td1998/pdf/l0036.pdf; http://www.computer.org/tpds/td1998/l0036abs.htm", acknowledgement = ack-nhfb, classification = "B6150M (Protocols); B6210L (Computer communications); C5440 (Multiprocessing systems); C5640 (Protocols); C5670 (Network performance)", corpsource = "Dept. of Comput. Sci., Purdue Univ., West Lafayette, IN, USA", fjournal = "IEEE Transactions on Parallel and Distributed Systems", keywords = "CLAM approach; communications environment; message passing; multithreaded communication; OS-level process; performance evaluation; protocols; scalable multiprotocol support; scheduling algorithms; shared memory systems; shared-memory multiprocessors; user-space protocols", treatment = "A Application; P Practical", } @Article{Holub:1998:PJTa, author = "Allen Holub", title = "Programming {Java} threads in the real world: Threading Architectures", journal = j-JAVAWORLD, volume = "3", number = "9", pages = "??--??", month = sep, year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Sep 10 14:37:36 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.holub.com/goodies/javaworld/jw_index.html; http://www.javaworld.com/javaworld/jw-09-1998/jw-09-threads.htm", acknowledgement = ack-nhfb, } @Article{Holub:1998:PJTb, author = "Allen Holub", title = "Programming {Java} threads in the real world, {Part} 2: Common 
multithreading Pitfalls (Deadlock, etc.)",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "10",
  pages =        "??--??",
  month =        oct,
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Mon Jan 4 06:11:43 MST 1999",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.holub.com/goodies/javaworld/jw_index.html;
                 http://www.javaworld.com/javaworld/jw-10-1998/jw-10-toolbox.htm",
  acknowledgement = ack-nhfb,
}

%%% NOTE: The month values in Holub:1998:PJTb, Holub:1998:PJTc, and
%%% Holub:1998:PJTd are taken from the issue number and the jw-MM-1998
%%% component of each entry's URL.
@Article{Holub:1998:PJTc,
  author =       "Allen Holub",
  title =        "Programming {Java} threads in the real world, {Part}
                 3: Semaphore, Lock\_manager, and Mutex",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "11",
  pages =        "??--??",
  month =        nov,
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Mon Jan 4 06:11:43 MST 1999",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.holub.com/goodies/javaworld/jw_index.html;
                 http://www.javaworld.com/javaworld/jw-11-1998/jw-11-toolbox.htm",
  acknowledgement = ack-nhfb,
}

@Article{Holub:1998:PJTd,
  author =       "Allen Holub",
  title =        "Programming {Java} threads in the real world, {Part}
                 4: Condition Variables and Counting Semaphores",
  journal =      j-JAVAWORLD,
  volume =       "3",
  number =       "12",
  pages =        "??--??",
  month =        dec,
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1091-8906",
  bibdate =      "Mon Jan 4 06:22:03 MST 1999",
  bibsource =    "http://www.javaworld.com/javaworld/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.holub.com/goodies/javaworld/jw_index.html;
                 http://www.javaworld.com/javaworld/jw-12-1998/jw-12-toolbox.htm",
  acknowledgement = ack-nhfb,
}

@PhdThesis{Hopper:1998:CFM,
  author =       "Michael A.
Hopper",
  title =        "A compiler framework for multithreaded parallel
                 systems",
  type =         "Thesis (Ph.D.)",
  school =       "School of Electrical and Computer Engineering, Georgia
                 Institute of Technology",
  address =      "Atlanta, GA, USA",
  pages =        "xii + 110",
  year =         "1998",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "Directed by William Appelbe.",
  keywords =     "Compilers (Computer programs); Parallel processing
                 (Electronic computers)",
}

@Article{Howes:1998:TPC,
  author =       "Brad Howes",
  title =        "Template processing classes for {Python}",
  journal =      j-DDJ,
  volume =       "23",
  number =       "2",
  pages =        "38, 40, 42, 44--46, 48, 100",
  month =        feb,
  year =         "1998",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu May 21 19:02:04 MDT 1998",
  bibsource =    "ftp://ftp.math.utah.edu/pub/tex/bib/dr-dobbs.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Brad shows how you can embed Python objects in HTML
                 pages using boilerplate template processing classes.
                 Then Python creator Guido van Rossum adds a note on
                 what's new in the just-released Python 1.5.",
  acknowledgement = ack-nhfb,
  classification = "C6130D (Document processing techniques); C6130M
                 (Multimedia); C6160J (Object-oriented databases)",
  fjournal =     "Dr. Dobb's Journal of Software Tools",
  keywords =     "application program interfaces; BoilerPlate; CGI
                 infrastructure; conditional control; Emacs; embedded
                 HTML text; errors; HTML document template; HTML
                 editing; hypermedia; iterative control; multithreaded
                 CGI service; object database; object paradigm;
                 object-oriented databases; page description languages;
                 persistent objects; placeholders; print statements;
                 Python; run-time values; run-time HTML generation;
                 syntax coloring; tagged locations; template HTML
                 constructs; template processing classes; text regions",
  treatment =    "P Practical",
}

@Article{Itzkovitz:1998:TMA,
  author =       "Ayal Itzkovitz and Assaf Schuster and Lea Shalev",
  title =        "Thread migration and its applications in distributed
                 shared memory systems",
  journal =      j-J-SYST-SOFTW,
  volume =       "42",
  number =       "1",
  pages =        "71--87",
  month =        jul,
  year =         "1998",
  CODEN =        "JSSODM",
  ISSN =         "0164-1212 (print), 1873-1228 (electronic)",
  ISSN-L =       "0164-1212",
  bibdate =      "Thu Dec 17 14:07:21 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of systems and software",
}

@Article{Ji:1998:PMM,
  author =       "Minwen Ji and Edward W. Felten and Kai Li",
  title =        "Performance measurements for multithreaded programs",
  journal =      j-SIGMETRICS,
  volume =       "26",
  number =       "1",
  pages =        "161--170",
  month =        jun,
  year =         "1998",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/277858.277900",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 11:25:18 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multithreaded programming is an effective way to
                 exploit concurrency, but it is difficult to debug and
                 tune a highly threaded program. This paper describes a
                 performance tool called Tmon for monitoring, analyzing
                 and tuning the performance of multithreaded programs.
The performance tool has two novel features: it uses `thread waiting time' as a measure and constructs thread waiting graphs to show thread dependencies and thus performance bottlenecks, and it identifies `semi-busy-waiting' points where CPU cycles are wasted in condition checking and context switching. We have implemented the Tmon tool and, as a case study, we have used it to measure and tune a heavily threaded file system. We used four workloads to tune different aspects of the file system. We were able to improve the file system bandwidth and throughput significantly. In one case, we were able to improve the bandwidth by two orders of magnitude.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", } @Article{Keckler:1998:EFG, author = "Stephen W. Keckler and William J. Dally and Daniel Maskit and Nicholas P. Carter and Andrew Chang and Whay S. Lee", title = "Exploiting fine-grain thread level parallelism on the {MIT} multi-{ALU} processor", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "306--317", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Krinke:1998:SST, author = "Jens Krinke", title = "Static Slicing of Threaded Programs", journal = j-SIGPLAN, volume = "33", number = "7", pages = "35--42", month = jul, year = "1998", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:49 MST 2003", bibsource = "Compendex database; http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Static program slicing is an established method for analyzing sequential programs, especially for program 
understanding, debugging and testing. Until now, there was no slicing method for threaded programs which handles interference correctly. We present such a method which also calculates more precise static slices. This paper extends the well known structures of the control flow graph and the program dependence graph for threaded programs with interference. This new technique does not require serialization of threaded programs.", acknowledgement = ack-nhfb, affiliation = "Technische Universitaet Braunschweig", affiliationaddress = "Braunschweig, Ger", classification = "723; 723.1; 723.2; 723.5", conference = "Proceedings of the 1998 ACM SIGPLAN\slash SIGSOFT Workshop on Program Analysis for Software Tools and Engineering", fjournal = "ACM SIGPLAN Notices", journalabr = "ACM SIGPLAN SIGSOFT Workshop Program Anal Software Tools Eng", keywords = "Computer aided software engineering; Computer software selection and evaluation; Control flow graphs; Data flow analysis; Data structures; Program debugging; Static program slicing; Threaded programs", meetingaddress = "Montreal, Can", meetingdate = "Jun 16 1998", meetingdate2 = "06/16/98", sponsor = "ACM", } @Book{Lewis:1998:MPP, author = "Bil Lewis and Daniel J. Berg", title = "Multithreaded programming with pthreads", publisher = pub-SUN, address = pub-SUN:adr, pages = "xxx + 382", year = "1998", ISBN = "0-13-680729-1 (paperback)", ISBN-13 = "978-0-13-680729-2 (paperback)", LCCN = "QA76.76.T55 L49 1998", bibdate = "Fri Aug 7 09:34:36 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.amazon.com/exec/obidos/ASIN/0136807291/ref=sim_books/002-4892305-5599452; http://www.sun.com/books/catalog/lewis2/index.html", acknowledgement = ack-nhfb, alttitle = "Pthreads", keywords = "POSIX (Computer software standard); Threads (Computer programs); UNIX (Computer file)", } @Article{Lo:1998:ADW, author = "Jack L. Lo and Luiz Andr{\'e} Barroso and Susan J. 
Eggers and Kourosh Gharachorloo and Henry M. Levy and Sujay S. Parekh", title = "An analysis of database workload performance on simultaneous multithreaded processors", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "39--50", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @InProceedings{Lu:1998:ONW, author = "Honghui Lu", title = "{OpenMP} on Networks of Workstations", crossref = "ACM:1998:SHP", pages = "??--??", year = "1998", bibdate = "Wed Oct 07 08:50:26 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.supercomp.org/sc98/papers/", acknowledgement = ack-nhfb, } @Article{Manley:1998:GPT, author = "Kevin T. Manley", title = "General-Purpose Threads with {I/O} Completion Ports", journal = j-CCCUJ, volume = "16", number = "4", pages = "??--??", month = apr, year = "1998", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:15 MDT 2002", bibsource = "http://www.cuj.com/articles/1998/9804/9804toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Divide and conquer is a good strategy for partitioning a large job, provided you don't divide too much. 
Windows NT helps you guess right.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Mascarenhas:1998:MTP, author = "Edward Mascarenhas and Vernon Rego", title = "Migrant threads on process farms: parallel programming with {Ariadne}", journal = j-CPE, volume = "10", number = "9", pages = "673--698", day = "10", month = aug, year = "1998", CODEN = "CPEXEI", ISSN = "1040-3108", ISSN-L = "1040-3108", bibdate = "Tue Sep 7 06:06:42 MDT 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=10008703; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=10008703&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, fjournal = "Concurrency, practice and experience", } @Article{McManis:1998:DUT, author = "Chuck McManis", title = "In Depth: Using threads with collections, {Part 1}", journal = j-JAVAWORLD, volume = "3", number = "3", pages = "??--??", month = mar, year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 08:48:26 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-03-1998/jw-03-indepth.html", acknowledgement = ack-nhfb, } @Article{McManis:1998:JDU, author = "Chuck McManis", title = "{Java} In Depth: Using threads with collections, part 2", journal = j-JAVAWORLD, volume = "3", number = "6", pages = "??--??", month = jun, year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Aug 13 08:48:26 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-06-1998/jw-06-indepth.html", acknowledgement = ack-nhfb, } @Book{Nichols:1998:PP, author = "Bradford Nichols and Dick Buttlar and Jacqueline Proulx Farrell", title = "Pthreads programming", publisher = pub-ORA, address = 
pub-ORA:adr, pages = "xvi + 267", year = "1998", ISBN = "1-56592-115-1", ISBN-13 = "978-1-56592-115-3", LCCN = "QA76.642 .N53 1998", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Nutshell handbook", acknowledgement = ack-nhfb, annote = "A POSIX standard for better multiprocessing.", keywords = "compilers (computer programs); parallel programming (computer science)", } @Article{Piumarta:1998:ODT, author = "Ian Piumarta and Fabio Riccardi", title = "Optimizing Direct-threaded Code by Selective Inlining", journal = j-SIGPLAN, volume = "33", number = "5", pages = "291--300", month = may, year = "1998", CODEN = "SINODQ", ISBN = "0-89791-987-4", ISBN-13 = "978-0-89791-987-6", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:47 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/contents/proceedings/pldi/277650/index.html; http://www.cs.virginia.edu/pldi98/program.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/277650/p291-piumarta/", acknowledgement = ack-nhfb, annote = "Published as part of the Proceedings of PLDI'98.", fjournal = "ACM SIGPLAN Notices", keywords = "algorithms; experimentation; languages; performance", subject = "{\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Optimization. {\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Interpreters. {\bf D.3.4} Software, PROGRAMMING LANGUAGES, Processors, Translator writing systems and compiler generators.", } @Article{Plauger:1998:SCCl, author = "P. J. 
Plauger", title = "{Standard C/C++}: Thread Safety", journal = j-CCCUJ, volume = "16", number = "12", pages = "??--??", month = dec, year = "1998", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:18 MDT 2002", bibsource = "http://www.cuj.com/articles/1998/9812/9812toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The C++ Standard doesn't talk about thread safety, but everyone else does.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Pomerantz:1998:CNS, author = "Dave Pomerantz", title = "{C++} Notifiers: Simplifying system development", journal = j-DDJ, volume = "23", number = "8", pages = "26, 28, 30--31, 89--90", month = aug, year = "1998", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jul 16 13:01:59 MDT 1998", bibsource = "http://www.ddj.com/ddj/1998/1998_08/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/1998/1998_08/notifier.txt; http://www.ddj.com/ftp/1998/1998_08/notifier.zip", abstract = "Notifiers, also called ``events'' or ``messages,'' are used to pass information anonymously between objects. Dave shows how notifiers can work in C++, using a multithreaded application as an example.", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Reck:1998:TSR, author = "Bill Reck", title = "Thread Synchronization with Reference-Counting Handles", journal = j-CCCUJ, volume = "16", number = "2", pages = "??--??", month = feb, year = "1998", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:14 MDT 2002", bibsource = "http://www.cuj.com/articles/1998/9802/9802toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Often, the best time to protect access to a shared object is right when you reach for it.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Schmidt:1998:EAM, author = "Douglas C. 
Schmidt",
  title =        "Evaluating architectures for multithreaded object
                 request brokers",
  journal =      j-CACM,
  volume =       "41",
  number =       "10",
  pages =        "54--60",
  month =        oct,
  year =         "1998",
  CODEN =        "CACMA2",
  ISSN =         "0001-0782 (print), 1557-7317 (electronic)",
  ISSN-L =       "0001-0782",
  bibdate =      "Tue Oct 6 21:15:42 MDT 1998",
  bibsource =    "http://www.acm.org/pubs/toc/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/cacm/1998-41-10/p54-schmidt/",
  acknowledgement = ack-nhfb,
  fjournal =     "Communications of the ACM",
}

@Article{Shaw:1998:CIP,
  author =       "Andrew Shaw and Arvind and Kyoo-Chan Cho and
                 Christopher Hill and R. Paul Johnson and John
                 Marshall",
  title =        "A Comparison of Implicitly Parallel Multithreaded and
                 Data-Parallel Implementations of an Ocean Model",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "48",
  number =       "1",
  pages =        "1--51",
  day =          "10",
  month =        jan,
  year =         "1998",
  CODEN =        "JPDCER",
  DOI =          "http://dx.doi.org/10.1006/jpdc.1997.1390",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Thu Mar 9 09:19:04 MST 2000",
  bibsource =    "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1390/production;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1390/production/pdf;
                 http://www.idealibrary.com/links/doi/10.1006/jpdc.1997.1390/production/ref",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
}

@PhdThesis{Shaw:1998:CPM,
  author =       "Andrew Shaw",
  title =        "Compiling for parallel multithreaded computation on
                 symmetric multiprocessors",
  type =         "Thesis (Ph.D.)",
  school =       "Massachusetts Institute of Technology, Department of
                 Electrical Engineering and Computer Science",
  address =      "Cambridge, MA, USA",
  pages =        "149",
  year =         "1998",
  bibdate =      "Fri Aug 7 09:34:36 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Article{Silc:1998:APC,
  author =       "J. Silc and B. Robic and T. Ungerer",
  title =        "Asynchrony in Parallel Computing: From Dataflow to
                 Multithreading",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "1",
  number =       "1",
  pages =        "??--??",
  month =        "????",
  year =         "1998",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Fri Dec 19 08:14:11 MST 2003",
  bibsource =    "http://www.cs.okstate.edu/~pdcp/vols/vol01/vol01no1.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.cs.okstate.edu/~pdcp/vols/vol01/vol01no1abs.html#silc",
  acknowledgement = ack-nhfb,
  fjournal =     "PDCP: Parallel and Distributed Computing Practices",
}

@Article{Skillicorn:1998:MLP,
  author =       "David B. Skillicorn and Domenico Talia",
  title =        "Models and languages for parallel computation",
  journal =      j-COMP-SURV,
  volume =       "30",
  number =       "2",
  pages =        "123--169",
  month =        jun,
  year =         "1998",
  CODEN =        "CMSVAN",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Fri Sep 11 08:35:51 MDT 1998",
  bibsource =    "http://www.acm.org/pubs/contents/journals/surveys/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org:80/pubs/citations/journals/surveys/1998-30-2/p123-skillicorn/",
  abstract =     "We survey parallel programming models and languages
                 using six criteria to assess their suitability for
                 realistic portable parallel programming. We argue that
                 an ideal model should be easy to program, should have
                 a software development methodology, should be
                 architecture-independent, should be easy to
                 understand, should guarantee performance, and should
                 provide accurate information about the cost of
                 programs. These criteria reflect our belief that
                 developments in parallelism must be driven by a
                 parallel software industry based on portability and
                 efficiency. We consider programming models in six
                 categories, depending on the level of abstraction they
                 provide.
Those that are very abstract conceal even the presence of parallelism at the software level. Such models make software easy to build and port, but efficient and predictable performance is usually hard to achieve. At the other end of the spectrum, low-level models make all of the messy issues of parallel programming explicit (how many threads, how to place them, how to express communication, and how to schedule communication), so that software is hard to build and not very portable, but is usually efficient. Most recent models are near the center of this spectrum, exploring the best tradeoffs between expressiveness and performance. A few models have achieved both abstractness and efficiency. Both kinds of models raise the possibility of parallelism as part of the mainstream of computing.", acknowledgement = ack-nhfb, fjournal = "ACM Computing Surveys", keywords = "languages; performance; theory", subject = "{\bf C.4} Computer Systems Organization, PERFORMANCE OF SYSTEMS. {\bf D.1} Software, PROGRAMMING TECHNIQUES. {\bf D.3.2} Software, PROGRAMMING LANGUAGES, Language Classifications.", } @InProceedings{Smith:1998:SIF, author = "Geoffrey Smith and Dennis Volpano", title = "Secure information flow in a multi-threaded imperative language", crossref = "ACM:1998:CRP", pages = "355--364", year = "1998", bibdate = "Mon May 3 12:57:52 MDT 1999", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org:80/pubs/citations/proceedings/plan/268946/p355-smith/", acknowledgement = ack-nhfb, keywords = "algorithms; languages; security; theory", subject = "{\bf F.3.3} Theory of Computation, LOGICS AND MEANINGS OF PROGRAMS, Studies of Program Constructs, Type structure. {\bf D.3.0} Software, PROGRAMMING LANGUAGES, General. {\bf D.2.0} Software, SOFTWARE ENGINEERING, General, Protection mechanisms. 
{\bf D.1.3} Software, PROGRAMMING TECHNIQUES, Concurrent Programming.", } @Article{Tennberg:1998:CAD, author = "Patrick Tennberg", title = "Creating Active Data Types via Multithreading", journal = j-CCCUJ, volume = "16", number = "1", pages = "??--??", month = jan, year = "1998", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:13 MDT 2002", bibsource = "http://www.cuj.com/articles/1998/9801/9801toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "If you need multiple active agents in a program, you need multiple threads to synchronize them.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Thitikamol:1998:PNM, author = "K. Thitikamol and P. Keleher", title = "Per-node multithreading and remote latency", journal = j-IEEE-TRANS-COMPUT, volume = "47", number = "4", pages = "414--426", month = apr, year = "1998", CODEN = "ITCOB4", DOI = "http://dx.doi.org/10.1109/12.675711", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Wed Jul 6 09:35:54 MDT 2011", bibsource = "http://www.computer.org/tc/; http://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=675711", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", } @InProceedings{Tullsen:1998:RSM, author = "Dean M. Tullsen and Susan J. Eggers and Henry M. Levy", title = "Retrospective: {Simultaneous} multithreading: maximizing on-chip parallelism", crossref = "ACM:1998:PAI", pages = "115--116", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @InProceedings{Tullsen:1998:SMM, author = "Dean M. Tullsen and Susan J. Eggers and Henry M. 
Levy", title = "Simultaneous multithreading: maximizing on-chip parallelism", crossref = "ACM:1998:PAI", pages = "533--544", year = "1998", bibdate = "Fri May 12 17:56:30 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, remark = "25 years of the International Symposia on Computer Architecture (selected papers).", } @Article{Venners:1998:DTS, author = "Bill Venners", title = "Design for thread safety", journal = j-JAVAWORLD, volume = "3", number = "8", pages = "??--??", month = aug, year = "1998", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Sep 10 14:37:30 MDT 1998", bibsource = "http://www.javaworld.com/javaworld/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.javaworld.com/javaworld/jw-08-1998/jw-08-techniques.htm", acknowledgement = ack-nhfb, } @InProceedings{Vishkin:1998:EMT, author = "Uzi Vishkin and Shlomit Dascal and Efraim Berkovich and Joseph Nuzman", booktitle = "SPAA '98: 10th Annual ACM Symposium on Parallel Algorithms and Architectures, June 28--July 2, 1998, Puerto Vallarta, Mexico", title = "Explicit multi-threading ({XMT}) bridging models for instruction parallelism (extended abstract)", publisher = pub-ACM, address = pub-ACM:adr, pages = "140--151", year = "1998", DOI = "http://dx.doi.org/10.1145/277651.277680", ISBN = "0-89791-989-0", ISBN-13 = "978-0-89791-989-0", LCCN = "QA76.58 .A26 1998", bibdate = "Fri Jul 27 05:37:45 2001", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "ACM order number 417980.", URL = "http://delivery.acm.org/10.1145/280000/277680/p140-vishkin.pdf", acknowledgement = ack-nhfb, bookpages = "viii + 310", keywords = "IA-64", } @Article{Wallace:1998:TMP, author = "Steven Wallace and Brad Calder and Dean M. 
Tullsen", title = "Threaded multiple path execution", journal = j-COMP-ARCH-NEWS, volume = "26", number = "3", pages = "238--249", month = jun, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:58 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @PhdThesis{Weissman:1998:ATT, author = "Boris Weissman", title = "Active threads: towards efficient fine-grained parallelism in object-oriented systems", type = "Thesis ({Ph.D.} in {Computer Science})", school = "Department of Computer Science, University of California, Berkeley", address = "Berkeley, CA, USA", year = "1998", LCCN = "T7.6.1998 W457", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "dissertations, academic -- UCB -- Computer Science -- 1991--2000; University of California, Berkeley, Dept. 
Of Computer Science -- dissertations", } @Article{Weissman:1998:PCS, author = "Boris Weissman", title = "Performance Counters and State Sharing Annotations: a Unified Approach to Thread Locality", journal = j-SIGPLAN, volume = "33", number = "11", pages = "127--138", month = nov, year = "1998", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:17:54 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Co-published in {\em Operating Systems Review}, {\bf 32}(5).", URL = "http://www.acm.org:80/pubs/citations/proceedings/asplos/291069/p127-weissman/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "design; experimentation; measurement; performance; theory", subject = "{\bf D.4.1} Software, OPERATING SYSTEMS, Process Management, Scheduling. {\bf F.1.2} Theory of Computation, COMPUTATION BY ABSTRACT DEVICES, Modes of Computation, Parallelism and concurrency. {\bf D.4.8} Software, OPERATING SYSTEMS, Performance, Simulation. 
{\bf G.3} Mathematics of Computing, PROBABILITY AND STATISTICS, Markov processes.", } @Article{Wilde:1998:RES, author = "Norman Wilde and Christopher Casey and Joe Vandeville and Gary Trio and Dick Hotz", title = "Reverse engineering of software threads: {A} design recovery technique for large multi-process systems", journal = j-J-SYST-SOFTW, volume = "43", number = "1", pages = "11--17", month = oct, year = "1998", CODEN = "JSSODM", ISSN = "0164-1212", ISSN-L = "0164-1212", bibdate = "Wed Dec 16 08:24:49 MST 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "The Journal of systems and software", } @Article{Wilmot:1998:DTM, author = "Dick Wilmot", title = "Data threaded microarchitecture", journal = j-COMP-ARCH-NEWS, volume = "26", number = "5", pages = "22--32", month = dec, year = "1998", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:21 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Zhou:1998:LST, author = "Honbo Zhou and Al Geist", title = "{LPVM}: a step towards multithread {PVM}", journal = j-CPE, volume = "10", number = "5", pages = "407--416", day = "25", month = apr, year = "1998", CODEN = "CPEXEI", ISSN = "1040-3108", ISSN-L = "1040-3108", bibdate = "Tue Sep 7 06:06:40 MDT 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=5385; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=5385&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, fjournal = "Concurrency, practice and experience", } @Article{Antoniu:1999:ETT, author = "G. Antoniu and L. Bouge and R. 
Namyst", title = "An Efficient and Transparent Thread Migration Scheme in the {PM2} Runtime System", journal = j-LECT-NOTES-COMP-SCI, volume = "1586", pages = "496--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "ftp://ftp.math.utah.edu/pub/tex/bib/lncs1999a.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Azagury:1999:NIR, author = "Alain Azagury and Elliot K. Kolodner and Erez Petrank", title = "A Note on the Implementation of Replication-Based Garbage Collection for Multithreaded Applications and Multiprocessor Environments", journal = j-PARALLEL-PROCESS-LETT, volume = "9", number = "3", pages = "391--??", month = sep, year = "1999", CODEN = "PPLTEE", ISSN = "0129-6264", bibdate = "Thu Jan 6 12:02:35 MST 2005", bibsource = "http://ejournals.wspc.com.sg/ppl/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", } @Article{Blumofe:1999:SMC, author = "Robert D. Blumofe and Charles E. Leiserson", title = "Scheduling multithreaded computations by work stealing", journal = j-J-ACM, volume = "46", number = "5", pages = "720--748", month = sep, year = "1999", CODEN = "JACOAH", ISSN = "0004-5411", ISSN-L = "0004-5411", bibdate = "Sun Jan 23 12:19:49 MST 2000", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/jacm/1999-46-5/p720-blumofe/", acknowledgement = ack-nhfb, fjournal = "Journal of the ACM", } @Article{Bouge:1999:ECM, author = "L. Bouge and J.-F. Mehaut and R. 
Namyst", title = "Efficient Communications in Multithreaded Runtime Systems", journal = j-LECT-NOTES-COMP-SCI, volume = "1586", pages = "468--482", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Fri Mar 16 07:33:54 2001", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Broadman:1999:ECM, author = "Allen Broadman and Eric Shaw", title = "Executing a Class Member in Its Own Thread", journal = j-CCCUJ, volume = "17", number = "12", pages = "??--??", month = dec, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:24 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9912/9912toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Creating a separate thread to execute a member function call is a messy business that's often necessary. It's a task well worth encapsulating.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Cappello:1999:PNB, author = "F. Cappello and O. Richard and D. Etiemble", title = "Performance of the {NAS} Benchmarks on a Cluster of {SMP PCs} Using a Parallelization of the {MPI} Programs with {OpenMP}", journal = j-LECT-NOTES-COMP-SCI, volume = "1662", pages = "339--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "ftp://ftp.math.utah.edu/pub/tex/bib/lncs1999b.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Cenciarelli:1999:EBS, author = "P. Cenciarelli and A. Knapp and B. Reus and M. 
Wirsing", title = "An Event-Based Structural Operational Semantics of Multi-Threaded {Java}", journal = j-LECT-NOTES-COMP-SCI, volume = "1523", pages = "157--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "ftp://ftp.math.utah.edu/pub/tex/bib/lncs1999a.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Chappell:1999:SSM, author = "Robert S. Chappell and Jared Stark and Sangwook P. Kim and Steven K. Reinhardt and Yale N. Patt", title = "Simultaneous subordinate microthreading {(SSMT)}", journal = j-COMP-ARCH-NEWS, volume = "27", number = "2", pages = "186--195", month = may, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{delaPuente:1999:RTP, author = "Juan A. de la Puente and Jos{\'e} F. Ruiz and Jes{\'u}s M. 
Gonz{\'a}lez-Barahona", title = "Real-Time Programming with {GNAT}: Specialized Kernels versus {POSIX} Threads", journal = j-SIGADA-LETTERS, volume = "19", number = "2", pages = "73--77", month = jun, year = "1999", CODEN = "AALEE5", ISSN = "0736-721X", ISSN-L = "0736-721X", bibdate = "Tue Aug 31 07:04:20 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGAda Ada Letters", } @Article{DeWitt:1999:PTL, author = "Anthony DeWitt and Thomas Gross", title = "The potential of thread-level speculation based on value profiling", journal = j-COMP-ARCH-NEWS, volume = "27", number = "1", pages = "22--22", month = mar, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:35 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Duda:1999:BVT, author = "Kenneth J. Duda and David R. Cheriton", title = "Borrowed-virtual-time {(BVT)} scheduling: supporting latency-sensitive threads in a general-purpose scheduler", journal = j-OPER-SYS-REV, volume = "33", number = "5", pages = "261--276", month = dec, year = "1999", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:55 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @InProceedings{Garcia:1999:MMI, author = "F. Garcia and A. Calderon and J. 
Carretero", title = "{MiMPI}: {A} multithread-safe implementation of {MPI}", crossref = "Dongarra:1999:RAP", number = "1697", pages = "207--214", year = "1999", bibdate = "Thu Dec 9 06:08:35 MST 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Greiner:1999:PTE, author = "John Greiner and Guy E. Blelloch", title = "A provably time-efficient parallel implementation of full speculation", journal = j-TOPLAS, volume = "21", number = "2", pages = "240--285", month = mar, year = "1999", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Sep 26 10:12:58 MDT 2000", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/toplas/1999-21-2/p240-greiner/", abstract = "Speculative evaluation, including leniency and futures, is often used to produce high degrees of parallelism. Understanding the performance characteristics of such evaluation, however, requires having a detailed understanding of the implementation. For example, the particular implementation technique used to suspend and reactivate threads can have an asymptotic effect on performance. With the goal of giving the users some understanding of performance without requiring them to understand the implementation, we present a provable implementation bound for a language based on speculative evaluation. The idea is (1) to supply the users with a semantics for a language that defines abstract costs for measuring or analyzing the performance of computations, (2) to supply the users with a mapping of these costs onto runtimes on various machine models, and (3) to describe an implementation strategy of the language and prove that it meets these mappings. For this purpose we consider a simple language based on speculative evaluation. 
For every computation, the semantics of the language returns a directed acyclic graph (DAG) in which each node represents a unit of computation, and each edge represents a dependence. We then describe an implementation strategy of the language and show that any computation with $w$ work (the number of nodes in the DAG) and $d$ depth (the length of the longest path in the DAG) will run on a $p$-processor PRAM in $O(w/p + d \log p)$ time. The bounds are work efficient (within a constant factor of linear speedup) when there is sufficient parallelism, $w/d \geq p\log p$. These are the first time bounds we know of for languages with speculative evaluation. The main challenge is in parallelizing the necessary queuing operations on suspended threads.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", generalterms = "Languages; Performance; Theory", keywords = "abstract machines; parallel languages; profiling semantics; speculation; threads", subject = "Software --- Software Engineering --- Metrics (D.2.8); Software --- Programming Languages --- Language Classifications (D.3.2): {\bf Data-flow languages}; Software --- Programming Languages --- Language Classifications (D.3.2); Theory of Computation --- Computation by Abstract Devices --- Modes of Computation (F.1.2): {\bf Parallelism and concurrency}; Theory of Computation --- Computation by Abstract Devices --- Modes of Computation (F.1.2); Theory of Computation --- Logics and Meanings of Programs --- Specifying and Verifying and Reasoning about Programs (F.3.1)", } @Article{Gu:1999:EJT, author = "Yan Gu and B. S. 
Lee and Wentong Cai", title = "Evaluation of {Java} thread performance on two different multithreaded kernels", journal = j-OPER-SYS-REV, volume = "33", number = "1", pages = "34--46", month = jan, year = "1999", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:37 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Harrington:1999:WMM, author = "John Harrington", title = "{Win32} Multithreading Made Easy", journal = j-CCCUJ, volume = "17", number = "8", pages = "48, 50--52, 54--56", month = aug, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:22 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9908/9908toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreading logic is hard to write and hard to maintain. So keep it simple and separate.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Holub:1999:PJTa, author = "Allen Holub", title = "Programming {Java} threads in the real world, {Part} 5: Timers", journal = j-JAVAWORLD, volume = "4", number = "2", pages = "??--??", month = feb, year = "1999", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Mar 04 12:56:16 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.holub.com/goodies/javaworld/jw_index.html", acknowledgement = ack-nhfb, } @Article{Holub:1999:PJTb, author = "Allen Holub", title = "Programming {Java} threads in the real world, {Part} 6: {Mach '99}: Observer and the Mysteries of the {AWTEventMulticaster}", journal = j-JAVAWORLD, volume = "4", number = "3", pages = "??--??", month = mar, year = "1999", CODEN = "????", ISSN = "1091-8906", bibdate = "Thu Mar 04 12:56:16 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = 
"http://www.holub.com/goodies/javaworld/jw_index.html", acknowledgement = ack-nhfb, } @Article{Jonsson:1999:NPS, author = "J. Jonsson and H. Loenn and K. G. Shin", title = "Non-preemptive Scheduling of Real-Time Threads on Multi-Level-Context Architectures", journal = j-LECT-NOTES-COMP-SCI, volume = "1586", pages = "363--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "ftp://ftp.math.utah.edu/pub/tex/bib/lncs1999a.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Karamcheti:1999:ASM, author = "Vijay Karamcheti and Andrew A. Chien", title = "Architectural Support and Mechanisms for Object Caching in Dynamic Multithreaded Computations", journal = j-J-PAR-DIST-COMP, volume = "58", number = "2", pages = "260--300", month = aug, year = "1999", CODEN = "JPDCER", DOI = "http://dx.doi.org/10.1006/jpdc.1999.1555", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:08 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1555/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1555/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1555/production/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", } @Article{Kekckler:1999:CEH, author = "S. W. Keckler and A. Chang and W. S. Lee and S. Chatterjee and W. J. 
Dally", title = "Concurrent event handling through multithreading", journal = j-IEEE-TRANS-COMPUT, volume = "48", number = "9", pages = "903--916", month = sep, year = "1999", CODEN = "ITCOB4", DOI = "http://dx.doi.org/10.1109/12.795220", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Wed Jul 6 08:46:59 MDT 2011", bibsource = "http://www.computer.org/tc/; http://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=795220", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", } @Article{Krishnan:1999:CMA, author = "V. Krishnan and J. Torrellas", title = "A chip-multiprocessor architecture with speculative multithreading", journal = j-IEEE-TRANS-COMPUT, volume = "48", number = "9", pages = "866--880", month = sep, year = "1999", CODEN = "ITCOB4", DOI = "http://dx.doi.org/10.1109/12.795218", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Wed Jul 6 08:46:59 MDT 2011", bibsource = "http://www.computer.org/tc/; http://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=795218", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", } @Article{Kusakabe:1999:INS, author = "S. Kusakabe and K. Inenaga and M. Amamiya and X. 
Tang", title = "Implementing a Non-strict Functional Programming Language on a Threaded Architecture", journal = j-LECT-NOTES-COMP-SCI, volume = "1586", pages = "138--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "ftp://ftp.math.utah.edu/pub/tex/bib/lncs1999a.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Kwak:1999:EMC, author = "H. Kwak and B. Lee and A. R. Hurson and Suk-Han Yoon and Woo-Jong Hahn", title = "Effects of multithreading on cache performance", journal = j-IEEE-TRANS-COMPUT, volume = "48", number = "2", pages = "176--184", month = feb, year = "1999", CODEN = "ITCOB4", DOI = "http://dx.doi.org/10.1109/12.752659", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Wed Jul 6 08:46:56 MDT 2011", bibsource = "http://www.computer.org/tc/; http://www.math.utah.edu/pub/tex/bib/ieeetranscomput1990.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=752659", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", } @Article{Lundberg:1999:PBS, author = "Lars Lundberg", title = "Predicting and Bounding the Speedup of Multithreaded {Solaris} Programs", journal = j-J-PAR-DIST-COMP, volume = "57", number = "3", pages = "322--333", month = jun, year = "1999", CODEN = "JPDCER", DOI = "http://dx.doi.org/10.1006/jpdc.1999.1536", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:07 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1536/production; 
http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1536/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1536/production/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", } @Article{Manley:1999:IPT, author = "Kevin Manley", title = "Improving Performance with Thread-Private Heaps", journal = j-CCCUJ, volume = "17", number = "9", pages = "50--??", month = sep, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:22 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9909/9909toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Threads interact in the darndest ways, but conflicts with a common heap are particularly pernicious. Luckily they can be avoided.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Marcuello:1999:EST, author = "P. Marcuello and A. Gonzalez", title = "Exploiting Speculative Thread-Level Parallelism on a {SMT} Processor", journal = j-LECT-NOTES-COMP-SCI, volume = "1593", pages = "754--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "ftp://ftp.math.utah.edu/pub/tex/bib/lncs1999a.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Masney:1999:IMT, author = "Brian Masney", title = "Introduction to Multi-Threaded Programming", journal = j-LINUX-J, volume = "61", pages = "??--??", month = may, year = "1999", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Thu Jun 3 06:34:02 MDT 1999", bibsource = "http://www.linuxjournal.com/issue61/index.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A description of thread programming basics.", acknowledgement = ack-nhfb, fjournal = "Linux journal", } 
@InProceedings{Mitchell:1999:ILP, author = "Nicholas Mitchell and Larry Carter and Jeanne Ferrante and Dean Tullsen", title = "Instruction-level Parallelism vs. Thread-level Parallelism on Simultaneous Multi-threading Processors", crossref = "ACM:1999:SPO", pages = "??--??", year = "1999", bibdate = "Thu Feb 24 09:02:57 2000", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sc99.org/techpapers/", acknowledgement = ack-nhfb, } @Article{Moody:1999:STT, author = "Scott Arthur Moody and Samuel Kwok and Dale Karr", title = "{SimpleGraphics}: {Tcl\slash Tk} visualization of real-time multi-threaded and distributed applications", journal = j-SIGADA-LETTERS, volume = "19", number = "2", pages = "60--66", month = jun, year = "1999", CODEN = "AALEE5", ISSN = "0736-721X", ISSN-L = "0736-721X", bibdate = "Sat Aug 9 09:06:06 MDT 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGAda Ada Letters", } @Article{Narlikar:1999:SES, author = "Girija J. Narlikar and Guy E. Blelloch", title = "Space-Efficient Scheduling of Nested Parallelism", journal = j-TOPLAS, volume = "21", number = "1", pages = "138--173", month = jan, year = "1999", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Sep 26 10:12:58 MDT 2000", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/toplas/1999-21-1/p138-narlikar/", abstract = "Many of today's high-level parallel languages support dynamic, fine-grained parallelism. These languages allow the user to expose all the parallelism in the program, which is typically of a much higher degree than the number of processors. Hence an efficient scheduling algorithm is required to assign computations to processors at runtime. 
Besides having low overheads and good load balancing, it is important for the scheduling algorithm to minimize the space usage of the parallel program. This article presents an on-line scheduling algorithm that is provably space efficient and time efficient for nested-parallel languages. For a computation with depth $D$ and serial space requirement $S_1$, the algorithm generates a schedule that requires at most $S_1 + O(K\cdot D\cdot p)$ space (including scheduler space) on $p$ processors. Here, $K$ is a user-adjustable runtime parameter specifying the net amount of memory that a thread may allocate before it is preempted by the scheduler. Adjusting the value of $K$ provides a trade-off between the running time and the memory requirement of a parallel computation. To allow the scheduler to scale with the number of processors we also parallelize the scheduler and analyze the space and time bounds of the computation to include scheduling costs. In addition to showing that the scheduling algorithm is space and time efficient in theory, we demonstrate that it is effective in practice. We have implemented a runtime system that uses our algorithm to schedule lightweight parallel threads. 
The results of executing parallel programs on this system show that our scheduling algorithm significantly reduces memory usage compared to previous techniques, without compromising performance.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", generalterms = "Algorithms; Languages; Performance", keywords = "dynamic scheduling; multithreading; nested parallelism; parallel language implementation; space efficiency", subject = "Software --- Programming Techniques --- Concurrent Programming (D.1.3): {\bf Parallel programming}; Software --- Programming Languages --- Processors (D.3.4): {\bf Run-time environments}; Theory of Computation --- Analysis of Algorithms and Problem Complexity --- General (F.2.0)", } @Article{Nemeth:1999:MLK, author = "Z. Nemeth and H. Tomiyasu and P. Kacsuk and M. Amamiya", title = "Multithreaded {LOGFLOW} on {KUMP\slash D}", journal = j-LECT-NOTES-COMP-SCI, volume = "1615", pages = "320--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "ftp://ftp.math.utah.edu/pub/tex/bib/lncs1999b.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Book{Oaks:1999:JT, author = "Scott Oaks and Henry Wong", title = "{Java} threads", publisher = pub-ORA, address = pub-ORA:adr, edition = "Second", pages = "xiii + 319", year = "1999", ISBN = "1-56592-418-5", ISBN-13 = "978-1-56592-418-5", LCCN = "QA76.73.J38 O25 1999", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = "Java series", acknowledgement = ack-nhfb, keywords = "Java (computer program language); threads (computer programs)", } @Article{Pant:1999:TCP, author = "Lalit Pant", title = "Thread Communication In Parallel Algorithms: Enabling efficient interaction between threads", journal
= j-DDJ, volume = "24", number = "4", pages = "32, 34, 36, 38--39", month = apr, year = "1999", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Wed Mar 3 06:30:11 MST 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/1999/1999_04/parallel.txt", abstract = "With the increasing availability of multiprocessing hardware, thread-based parallel algorithms are becoming more and more important. Lalit presents thread communication mechanisms for use within parallel algorithms. Additional resources include parallel.txt (listings).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Book{Pham:1999:MPW, author = "Thuan Q. Pham and Pankaj K. Garg", title = "Multithreaded Programming with {Win32}", publisher = pub-PHPTR, address = pub-PHPTR:adr, pages = "xix + 219", year = "1999", ISBN = "0-13-010912-6", ISBN-13 = "978-0-13-010912-5", LCCN = "QA76.642.P518 1998", bibdate = "Thu Jan 21 18:58:23 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Includes CD-ROM.", URL = "http://www.phptr.com/ptrbooks/ptr_0130109126.html", acknowledgement = ack-nhfb, publishersnote = "If you want to deliver NT applications with maximum performance, efficiency and robustness, you need to master multithreading. Multithreaded Programming with Win32 brings together every Win32 multithreading technique and concept you must know --- all brilliantly explained with practical examples and sample code.", xxnote = "Check pages and year??", } @Article{Plauger:1999:SCCg, author = "P. J. 
Plauger", title = "{Standard C/C++}: {A} Better Red-Black Tree", journal = j-CCCUJ, volume = "17", number = "7", pages = "10--??", month = jul, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:21 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9907/9907toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The C++ Standard is silent about issues such as thread safety and DLL safety, but customers and reviewers certainly aren't.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Richards:1999:ALT, author = "Etienne Richards", title = "Adding Level-2 Thread Safety to Existing Objects", journal = j-CCCUJ, volume = "17", number = "2", pages = "??--??", month = feb, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:19 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9902/9902toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The code required to share an object among multiple threads is tedious and error prone. But it can be neatly encapsulated.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Ringle:1999:SCT, author = "Jonathan Ringle", title = "Singleton Creation the Thread-safe Way", journal = j-CCCUJ, volume = "17", number = "10", pages = "??--??", month = oct, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:23 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9910/9910toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Singletons avoid problems with order of construction, at the cost of more problems for multithreading.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Rodgers:1999:TSN, author = "Jeremy B. Rodgers and Rhonda Kay Gaede and Jeffrey H. 
Kulick", title = "{IN-Tune}: an {In-Situ} non-invasive performance tuning tool for multi-threaded {Linux} on symmetric multiprocessing {Pentium} workstations", journal = j-SPE, volume = "29", number = "9", pages = "775--792", day = "25", month = jul, year = "1999", CODEN = "SPEXBL", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Thu Jul 29 15:12:27 MDT 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=62501865; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=62501865&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, fjournal = "Software---Practice and Experience", } @TechReport{Roe:1999:PMI, author = "Kevin Roe and Piyush Mehrotra", title = "Parallelization of a multigrid incompressible viscous cavity flow solver using {OpenMP}", type = "{NASA} contractor report", number = "NASA\slash CR-1999-209551", institution = inst-NLRC, address = inst-NLRC:adr, pages = "????", year = "1999", bibdate = "Thu Mar 16 07:20:02 2000", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Also ICASE report 99-36.", acknowledgement = ack-nhfb, } @Article{Ronsse:1999:RFI, author = "Michiel Ronsse and Koen De Bosschere", title = "{RecPlay}: a fully integrated practical record\slash replay system", journal = j-TOCS, volume = "17", number = "2", pages = "133--152", month = may, year = "1999", CODEN = "ACSYEC", ISSN = "0734-2071", ISSN-L = "0734-2071", bibdate = "Tue Sep 26 07:54:31 MDT 2000", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/tocs/1999-17-2/p133-ronsse/", abstract = "This article presents a practical solution for the cyclic debugging of nondeterministic parallel programs.
The solution consists of a combination of record\slash replay with automatic on-the-fly data race detection. This combination enables us to limit the record phase to the more efficient recording of the synchronization operations, while deferring the time-consuming data race detection to the replay phase. As the record phase is highly efficient, there is no need to switch it off, hereby eliminating the possibility of Heisenbugs because tracing can be left on all the time. This article describes an implementation of the tools needed to support RecPlay.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", generalterms = "Algorithms; Experimentation; Reliability", keywords = "binary code modification; multithreaded programming; race detection", subject = "Software --- Programming Techniques --- Concurrent Programming (D.1.3): {\bf Parallel programming}; Software --- Software Engineering --- Testing and Debugging (D.2.5): {\bf Debugging aids}; Software --- Software Engineering --- Testing and Debugging (D.2.5): {\bf Monitors}; Software --- Software Engineering --- Testing and Debugging (D.2.5): {\bf Tracing}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Concurrency}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Deadlocks}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Multiprocessing/multiprogramming/multitasking}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Mutual exclusion}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Synchronization}", } @Article{Rugina:1999:PAM, author = "Radu Rugina and Martin Rinard", title = "Pointer Analysis for Multithreaded Programs", journal = j-SIGPLAN, volume = "34", number = "5", pages = "77--90", month = may, year = "1999", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:03 MST 2003", bibsource = 
"http://portal.acm.org/; http://www.acm.org/pubs/contents/proceedings/pldi/301122/index.html; http://www.acm.org/pubs/contents/proceedings/pldi/301618/index.html; http://www.cs.rutgers.edu/pldi99/program.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "See PLDI'99 proceedings \cite{ACM:1999:PASa}.", URL = "http://www.acm.org:80/pubs/citations/proceedings/pldi/301122/p77-rugina/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Saito:1999:MRS, author = "H. Saito and N. Stavrakos and C. Polychronopoulos", title = "Multithreading Runtime Support for Loop and Functional Parallelism", journal = j-LECT-NOTES-COMP-SCI, volume = "1615", pages = "133--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "ftp://ftp.math.utah.edu/pub/tex/bib/lncs1999b.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @MastersThesis{Samorodin:1999:SFS, author = "Steven Howard Samorodin", title = "Supporting flexible safety and sharing in multi-threaded environments", type = "Thesis ({M.S.})", school = "Computer Science Department, University of California, Davis", address = "Davis, CA, USA", pages = "39", year = "1999", bibdate = "Sat Apr 20 11:17:26 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Scherer:1999:TAP, author = "Alex Scherer and Honghui Lu and Thomas Gross and Willy Zwaenepoel", title = "Transparent adaptive parallelism on {NOWs} using {OpenMP}", journal = j-SIGPLAN, volume = "34", number = "8", pages = "96--106", month = aug, year = "1999", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:06 MST 2003", bibsource = "http://portal.acm.org/; 
http://www.acm.org/pubs/contents/proceedings/ppopp/301104/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p96-scherer/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @InProceedings{Shen:1999:ATL, author = "Kai Shen and Hong Tang and Tao Yang", title = "Adaptive Two-level Thread Management for Fast {MPI} Execution on Shared Memory Machines", crossref = "ACM:1999:SPO", pages = "??--??", year = "1999", bibdate = "Thu Feb 24 09:02:57 2000", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sc99.org/techpapers/", acknowledgement = ack-nhfb, } @Article{Sinharoy:1999:COI, author = "Balaram Sinharoy", title = "Compiler optimization to improve data locality for processor multithreading", journal = j-SCI-PROG, volume = "7", number = "1", pages = "21--37", month = "????", year = "1999", CODEN = "SCIPEV", ISSN = "1058-9244 (print), 1875-919X (electronic)", ISSN-L = "1058-9244", bibdate = "Thu Mar 28 12:27:27 MST 2002", bibsource = "Compendex database; http://www.iospress.nl/site/html/10589244.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; OCLC Article1st database", URL = "http://iospress.metapress.com/app/home/contribution.asp%3Fwasp=64cr5a4mg33tuhcbdr02%26referrer=parent%26backto=issue%2C2%2C7%3Bjournal%2C8%2C9%3Blinkingpublicationresults%2C1%2C1", acknowledgement = ack-nhfb, fjournal = "Scientific Programming", } @Article{Sutter:1999:OAM, author = "Herb Sutter", title = "Optimizations That Aren't (In a Multithreaded World)", journal = j-CCCUJ, volume = "17", number = "6", pages = "??--??", month = jun, year = "1999", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:21 MDT 2002", bibsource = "http://www.cuj.com/articles/1999/9906/9906toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "An ``obvious'' optimization can really lose ground when thread safety has to be ensured as 
well.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Tang:1999:APT, author = "Xinan Tang and Guang R. Gao", title = "Automatically Partitioning Threads for Multithreaded Architectures", journal = j-J-PAR-DIST-COMP, volume = "58", number = "2", pages = "159--189", month = aug, year = "1999", CODEN = "JPDCER", DOI = "http://dx.doi.org/10.1006/jpdc.1999.1551", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Thu Mar 9 09:19:08 MST 2000", bibsource = "http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1551/production; http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1551/production/pdf; http://www.idealibrary.com/links/doi/10.1006/jpdc.1999.1551/production/ref", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", } @Article{Tang:1999:CRT, author = "Hong Tang and Kai Shen and Tao Yang", title = "Compile\slash run-time support for threaded {MPI} execution on multiprogrammed shared memory machines", journal = j-SIGPLAN, volume = "34", number = "8", pages = "107--118", month = aug, year = "1999", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:06 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/contents/proceedings/ppopp/301104/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p107-tang/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Taura:1999:SMI, author = "Kenjiro Taura and Kunio Tabata and Akinori Yonezawa", title = "{StackThreads\slash MP}: integrating futures into calling standards", journal = j-SIGPLAN, volume = "34", number = "8", pages = "60--71", month = aug, year = "1999", CODEN = "SINODQ", ISSN = 
"0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:06 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/contents/proceedings/ppopp/301104/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p60-taura/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Throop:1999:SOS, author = "Joe Throop", title = "Standards: {OpenMP}: Shared-Memory Parallelism from the Ashes", journal = j-COMPUTER, volume = "32", number = "5", pages = "108--109", month = may, year = "1999", CODEN = "CPTRB4", ISSN = "0018-9162 (print), 1558-0814 (electronic)", ISSN-L = "0018-9162", bibdate = "Thu May 6 06:17:23 MDT 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://dlib.computer.org/co/books/co1999/pdf/r5108.pdf", acknowledgement = ack-nhfb, fjournal = "Computer", } @Article{Torrant:1999:SMS, author = "Marc Torrant and Muhammad Shaaban and Roy Czernikowski and Ken Hsu", title = "A simultaneous multithreading simulator", journal = j-COMP-ARCH-NEWS, volume = "27", number = "5", pages = "1--5", month = dec, year = "1999", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Vlassov:1999:QMM, author = "V. Vlassov and A. 
Kraynikov", title = "A Queuing Model of a Multi-threaded Architecture: {A} Case Study", journal = j-LECT-NOTES-COMP-SCI, volume = "1662", pages = "306--??", year = "1999", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Mon Sep 13 16:57:02 MDT 1999", bibsource = "ftp://ftp.math.utah.edu/pub/tex/bib/lncs1999b.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Weissman:1999:HPT, author = "B. Weissman and B. Gomes", title = "High Performance Thread Migration on Clusters of {SMPs}", journal = j-PARALLEL-DIST-COMP-PRACT, volume = "2", number = "2", pages = "??--??", month = "????", year = "1999", CODEN = "????", ISSN = "1097-2803", bibdate = "Fri Dec 19 08:14:13 MST 2003", bibsource = "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no2.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no2abs.html#boris", acknowledgement = ack-nhfb, fjournal = "PDCP: Parallel and Distributed Computing Practices", } @Article{Xu:1999:DIT, author = "Zhichen Xu and Barton P. 
Miller and Oscar Naim", title = "Dynamic instrumentation of threaded applications", journal = j-SIGPLAN, volume = "34", number = "8", pages = "49--59", month = aug, year = "1999", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:06 MST 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/contents/proceedings/ppopp/301104/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/proceedings/ppopp/301104/p49-xu/", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Anonymous:2000:NPAa, author = "Anonymous", title = "New Products: {AVP for Linux/FreeBSD UNIX, Kaspersky Lab Ltd.; API PowerRAC Chassis 320, Alpha Processor Inc.; ODBC-ODBC Bridge, Easysoft Ltd.; LinkScan 6.1, Electronic Software Publishing Corporation; Metro-X Enhanced Server CD, Metro Link, Inc.; P-STAT Statistical Software, P-STAT, Inc.; System Manager in a Box v1.0, PegaSoft Canada; PGI Workstation 3.1, PGI; Quick Restore 2.6, Workstation Solutions, Inc.; Threads.h++ and Tools.h++ Professional, Rogue Wave Software; Scriptics Connect 1.0, 1.1, Scriptics Corporation; TapeWare 6.2 Backup Software, Yosemite Technologies, Inc.; DoubleVision for Linux Systems, Tridia Corporation}", journal = j-LINUX-J, volume = "71", pages = "??--??", month = mar, year = "2000", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Thu Sep 21 07:44:12 MDT 2000", bibsource = "http://noframes.linuxjournal.com/lj-issues/issue71/index.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Linux journal", } @Article{Anonymous:2000:SLT, author = "Anonymous", title = "Strictly On-Line: {T/TCP: TCP for Transactions by Mark Stacey, Ivan Griffin and John Nelson; POSIX Thread Libraries by Felix Garcia and Javier Fernandez; Linux and Open-Source Applications by Peter Jones 
and M. B. Jorgenson; Laptops for Linux! by Jason Kroll}", journal = j-LINUX-J, volume = "70", pages = "??--??", month = feb, year = "2000", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Thu Sep 21 16:32:31 MDT 2000", bibsource = "http://noframes.linuxjournal.com/lj-issues/issue70/index.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://noframes.linuxjournal.com/lj-issues/issue70/3075.html; http://noframes.linuxjournal.com/lj-issues/issue70/3184.html; http://noframes.linuxjournal.com/lj-issues/issue70/3683.html; http://noframes.linuxjournal.com/lj-issues/issue70/3766.html", acknowledgement = ack-nhfb, fjournal = "Linux journal", } @Article{Antoniu:2000:CDP, author = "G. Antoniu and L. Boug{\'e} and R. Namyst and C. P{\'e}rez", title = "Compiling Data-Parallel Programs to a Distributed Runtime Environment with Thread Isomigration", journal = j-PARALLEL-PROCESS-LETT, volume = "10", number = "2/3", pages = "201--??", month = sep, year = "2000", CODEN = "PPLTEE", ISSN = "0129-6264", bibdate = "Wed Apr 18 07:29:37 2001", bibsource = "http://ejournals.wspc.com.sg/ppl/10/1002_03/S01296264001002_03.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ejournals.wspc.com.sg/ppl/10/1002_03/S0129626400000202.html", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", } @Article{Becker:2000:JSU, author = "Pete Becker", title = "The Journeyman's Shop: Unraveling Multithreading", journal = j-CCCUJ, volume = "18", number = "8", pages = "71--??", month = aug, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:27 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0008/0008toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Sometimes you have to spend a lot of time on just a little bit of code, to avoid spending much more time not knowing where to begin debugging.", acknowledgement = 
ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Berger:2000:HSMa, author = "Emery D. Berger and Kathryn S. McKinley and Robert D. Blumofe and Paul R. Wilson", title = "{Hoard}: a scalable memory allocator for multithreaded applications", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "117--128", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Berger:2000:HSMb, author = "Emery D. Berger and Kathryn S. McKinley and Robert D. Blumofe and Paul R. Wilson", title = "{Hoard}: {A} Scalable Memory Allocator for Multithreaded Applications", journal = j-SIGPLAN, volume = "35", number = "11", pages = "117--128", month = nov, year = "2000", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:19 MST 2003", bibsource = "http://foothill.lcs.mit.edu/asplos2k/program.html; http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Berger:2000:HSMc, author = "Emery D. Berger and Kathryn S. McKinley and Robert D. Blumofe and Paul R. Wilson", title = "{Hoard}: a scalable memory allocator for multithreaded applications", journal = j-OPER-SYS-REV, volume = "34", number = "5", pages = "117--128", month = dec, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Borkenhagen:2000:MPP, author = "J. M. Borkenhagen and R. J. Eickemeyer and R. N. 
Kalla and S. R. Kunkel", title = "A multithreaded {PowerPC} processor for commercial servers", journal = j-IBM-JRD, volume = "44", number = "6", pages = "885--898", month = nov, year = "2000", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Sat Feb 24 09:44:45 MST 2001", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/rd/446/borkenhagen.html", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", ordernumber = "G322-0224", } @Article{Boussinot:2000:JTS, author = "Fr{\'e}d{\'e}ric Boussinot and Jean-Ferdy Susini", title = "{Java} threads and {SugarCubes}", journal = j-SPE, volume = "30", number = "5", pages = "545--566", day = "25", month = apr, year = "2000", CODEN = "SPEXBL", DOI = "http://dx.doi.org/10.1002/(SICI)1097-024X(20000425)30:5<545::AID-SPE308>3.0.CO;2-Q", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Tue Mar 13 06:45:44 2001", bibsource = "http://www.interscience.wiley.com/jpages/0038-0644; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", URL = "http://www3.interscience.wiley.com/cgi-bin/abstract/71004433/START; http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=71004433&PLACEBO=IE.pdf", acknowledgement = ack-nhfb, fjournal = "Software---Practice and Experience", } @Article{Bova:2000:DLP, author = "Steve W. Bova and Clay P. Breshears and Christine E. Cuicchi and Zeki Demirbilek and Henry A. 
Gabb", title = "Dual-Level Parallel Analysis of Harbor Wave Response Using {MPI} and {OpenMP}", journal = j-IJHPCA, volume = "14", number = "1", pages = "49--64", month = "Spring", year = "2000", CODEN = "IHPCFL", ISSN = "1094-3420 (print), 1741-2846 (electronic)", ISSN-L = "1094-3420", bibdate = "Tue Sep 12 12:39:11 2000", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @InCollection{Cahir:2000:PMM, author = "Margaret Cahir and Robert Moench and Alice E. Koniges", title = "Programming Models and Methods", crossref = "Koniges:2000:ISP", chapter = "3", pages = "27--54", year = "2000", bibdate = "Fri Feb 04 18:32:51 2000", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Discusses PVM, MPI, SHMEM, High-Performance Fortran, and POSIX threads.", acknowledgement = ack-nhfb, } @Article{Cahoon:2000:EPD, author = "Brendon Cahoon and Kathryn S. McKinley and Zhihong Lu", title = "Evaluating the performance of distributed architectures for information retrieval using a variety of workloads", journal = j-TOIS, volume = "18", number = "1", pages = "1--43", month = jan, year = "2000", CODEN = "ATISET", ISSN = "1046-8188", ISSN-L = "0734-2047", bibdate = "Tue Sep 26 09:34:01 MDT 2000", bibsource = "http://www.acm.org/pubs/contents/journals/tois/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/tois/2000-18-1/p1-cahoon/", abstract = "The information explosion across the Internet and elsewhere offers access to an increasing number of document collections. In order for users to effectively access these collections, information retrieval (IR) systems must provide coordinated, concurrent, and distributed access. In this article, we explore how to achieve scalable performance in a distributed system for collection sizes ranging from 1GB to 128GB. 
We implement a fully functional distributed IR system based on a multithreaded version of the Inquery simulation model. We measure performance as a function of system parameters such as client command rate, number of document collections, terms per query, query term frequency, number of answers returned, and command mixture. Our results show that it is important to model both query and document commands because the heterogeneity of commands significantly impacts performance. Based on our results, we recommend simple changes to the prototype and evaluate the changes using the simulator. Because of the significant resource demands of information retrieval, it is not difficult to generate workloads that overwhelm system resources regardless of the architecture. However under some realistic workloads, we demonstrate system organizations for which response time gracefully degrades as the workload increases and performance scales with the number of processors. This scalable architecture includes a surprisingly small number of brokers through which a large number of clients and servers communicate.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Information Systems", keywords = "distributed information retrieval architectures", subject = "Computer Systems Organization --- Computer-Communication Networks --- Distributed Systems (C.2.4); Computer Systems Organization --- Performance of Systems (C.4); Computer Systems Organization --- Performance of Systems (C.4): {\bf Performance attributes}; Information Systems --- Information Storage and Retrieval --- Systems and Software (H.3.4)", } @Article{Calkins:2000:ITT, author = "Charles Calkins", title = "Integrating Threads with Template Classes", journal = j-CCCUJ, volume = "18", number = "5", pages = "32--??", month = may, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:26 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles;
http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "It's obviously a good idea to encapsulate a thread as an object. It is less obvious how to get all the interfaces right.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{ChassindeKergommeaux:2000:PIV, author = "J. {Chassin de Kergommeaux} and B. Stein and P. E. Bernard", title = "{Paj{\'e}}, an interactive visualization tool for tuning multi-threaded parallel applications", journal = j-PARALLEL-COMPUTING, volume = "26", number = "10", pages = "1253--1274", day = "15", month = aug, year = "2000", CODEN = "PACOEJ", ISSN = "0167-8191", ISSN-L = "0167-8191", bibdate = "Sat Oct 28 17:44:14 MDT 2000", bibsource = "http://www.elsevier.com/locate/issn/01678191; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.nl/gej-ng/10/35/21/42/31/24/abstract.html; http://www.elsevier.nl/gej-ng/10/35/21/42/31/24/article.pdf", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", } @Book{Christopher:2000:HPJ, author = "Thomas Christopher and George Thiruvathukal", title = "High Performance {Java} Platform Computing: Multithreaded and Networked Programming", publisher = pub-PH, address = pub-PH:adr, pages = "xxii + 409", year = "2000", ISBN = "0-13-016164-0", ISBN-13 = "978-0-13-016164-2", LCCN = "????", bibdate = "Tue Feb 20 18:03:50 2001", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$49.99", URL = "http://www.sun.com/books/catalog/christopher/", acknowledgement = ack-nhfb, } @Article{Corbett:2000:USA, author = "James C. 
Corbett", title = "Using shape analysis to reduce finite-state models of concurrent {Java} programs", journal = j-TOSEM, volume = "9", number = "1", pages = "51--93", month = jan, year = "2000", CODEN = "ATSMER", ISSN = "1049-331X (print), 1557-7392 (electronic)", ISSN-L = "1049-331X", bibdate = "Fri Apr 20 08:21:35 MDT 2001", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/articles/journals/tosem/2000-9-1/p51-corbett/p51-corbett.pdf; http://www.acm.org/pubs/citations/journals/tosem/2000-9-1/p51-corbett/", abstract = "Finite-state verification (e.g., model checking) provides a powerful means to detect concurrency errors, which are often subtle and difficult to reproduce. Nevertheless, widespread use of this technology by developers is unlikely until tools provide automated support for extracting the required finite-state models directly from program source. Unfortunately, the dynamic features of modern languages such as Java complicate the construction of compact finite-state models for verification. In this article, we show how shape analysis, which has traditionally been used for computing alias information in optimizers, can be used to greatly reduce the size of finite-state models of concurrent Java programs by determining which heap-allocated variables are accessible only by a single thread, and which shared variables are protected by locks. We also provide several other state-space reductions based on the semantics of Java monitors. A prototype of the reductions demonstrates their effectiveness.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Software Engineering and Methodology", keywords = "concurrent systems; finite-state verification; Java; model extraction; modeling; shape analysis; state-space reductions", subject = "Software --- Software Engineering --- Software/Program Verification (D.2.4)", } @Article{Duda:2000:BVT, author = "Kenneth J. Duda and David R. 
Cheriton", title = "Borrowed-virtual-time {(BVT)} scheduling: supporting latency-sensitive threads in a general-purpose scheduler", journal = j-OPER-SYS-REV, volume = "34", number = "2", pages = "27--28", month = apr, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:42 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Flautner:2000:TLPa, author = "Kriszti{\'a}n Flautner and Rich Uhlig and Steve Reinhardt and Trevor Mudge", title = "Thread-level parallelism and interactive performance of desktop applications", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "129--138", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Flautner:2000:TLPb, author = "Kriszti{\'a}n Flautner and Rich Uhlig and Steve Reinhardt and Trevor Mudge", title = "Thread Level Parallelism and Interactive Performance of Desktop Applications", journal = j-SIGPLAN, volume = "35", number = "11", pages = "129--138", month = nov, year = "2000", CODEN = "SINODQ", DOI = "http://dx.doi.org/10.1145/356989.357001", ISBN = "1-58113-317-0", ISBN-13 = "978-1-58113-317-2", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:19 MST 2003", bibsource = "http://foothill.lcs.mit.edu/asplos2k/program.html; http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://delivery.acm.org/10.1145/360000/357001/p129-flautner.pdf", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "IA-64", }
@Article{Flautner:2000:TLPc, author = "Kriszti{\'a}n Flautner and Rich Uhlig and Steve Reinhardt and Trevor Mudge", title = "Thread-level parallelism and interactive performance of desktop applications", journal = j-OPER-SYS-REV, volume = "34", number = "5", pages = "129--138", month = dec, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Garcia:2000:PTL, author = "Felix Garcia and Javier Fernandez", title = "{POSIX} Thread Libraries", journal = j-LINUX-J, volume = "70", pages = "??--??", month = feb, year = "2000", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Thu Sep 21 16:46:44 MDT 2000", bibsource = "http://noframes.linuxjournal.com/lj-issues/issue70/index.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://noframes.linuxjournal.com/lj-issues/issue/3184.html", acknowledgement = ack-nhfb, fjournal = "Linux Journal", } @Article{Gontmakher:2000:JCN, author = "Alex Gontmakher and Assaf Schuster", title = "{Java} consistency: nonoperational characterizations for {Java} memory behavior", journal = j-TOCS, volume = "18", number = "4", pages = "333--386", year = "2000", CODEN = "ACSYEC", ISSN = "0734-2071", ISSN-L = "0734-2071", bibdate = "Wed Jul 18 10:18:45 MDT 2001", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/articles/journals/tocs/2000-18-4/p333-gontmakher/p333-gontmakher.pdf; http://www.acm.org/pubs/citations/journals/tocs/2000-18-4/p333-gontmakher/", abstract = "The Java Language Specification (JLS) [Gosling et al. 1996] provides an operational definition for the consistency of shared variables.
The definition remains unchanged in the JLS 2nd edition, currently under peer review, which relies on a specific abstract machine as its underlying model, is very complicated. Several subsequent works have tried to simplify and formalize it. However, these revised definitions are also operational, and thus have failed to highlight the intuition behind the original specification. In this work we provide a complete nonoperational specification for Java and for the JVM, excluding synchronized operations. We provide a simpler definition, in which we clearly distinguish the consistency model that is promised to the programmer from that which should be implemented in the JVM. This distinction, which was implicit in the original definition, is crucial for building the JVM. We find that the programmer model is strictly weaker than that of the JVM, and precisely define their discrepancy. Moreover, our definition is independent of any specific (or even abstract) machine, and can thus be used to verify JVM implementations and compiler optimizations on any platform. Finally, we show the precise range of consistency relaxations obtainable for the Java memory model when a certain compiler optimization-- called {\em prescient stores\/} in JLS--is applicable.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Computer Systems", generalterms = "Verification", keywords = "Java memory models; multithreading; nonoperational specification", subject = "Hardware --- Memory Structures --- Performance Analysis and Design Aids** (B.3.3): {\bf Formal models**}", } @Book{Holub:2000:TJT, author = "Allen I. 
Holub", title = "Taming {Java} Threads", publisher = pub-APRESS, address = pub-APRESS:adr, pages = "x + 300", year = "2000", ISBN = "1-893115-10-0", ISBN-13 = "978-1-893115-10-1", LCCN = "QA76.73.J38 H635 2000", bibdate = "Fri May 10 12:18:17 MDT 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www1.fatbrain.com/asp/bookinfo/bookinfo.asp?theisbn=1893115100&from=NCN454", price = "US\$34.95", acknowledgement = ack-nhfb, keywords = "Java (computer program language); threads (computer programs)", } @Article{Horwood:2000:DMA, author = "Peter Horwood and Shlomo Wygodny and Martin Zardecki", title = "Debugging Multithreaded Applications", journal = j-DDJ, volume = "25", number = "3", pages = "32, 34--37", month = mar, year = "2000", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Nov 9 08:25:14 MST 2000", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2000/2000_03/dbgmulti.txt", abstract = "It is often significantly harder to locate and test for bugs in multithreaded and multiprocess applications than for nonthreaded, single process situations. Our authors describe some of the problems with multithreaded applications and discuss common debugging techniques. Additional resources include dbgmulti.txt (listings).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Howard:2000:UPW, author = "David M. Howard", title = "Using Predicate Waits with {Win32} Threads", journal = j-CCCUJ, volume = "18", number = "5", pages = "18--??", month = may, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:26 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Most Win32 synchronization primitives are just that --- primitive. 
But you can use them to build queues that are safe and easy to use.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Book{Hyde:2000:JTP, author = "Paul Hyde", title = "{Java} thread programming", publisher = pub-SAMS, address = pub-SAMS:adr, pages = "iv + 510", year = "2000", ISBN = "0-672-31585-8", ISBN-13 = "978-0-672-31585-5", LCCN = "QA76.73.J38 H93 1999", bibdate = "Wed Feb 21 06:02:14 2001", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Kleber:2000:TSA, author = "Jeff Kleber", title = "Thread-Safe Access to Collections", journal = j-CCCUJ, volume = "18", number = "5", pages = "36--??", month = may, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:26 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The best place to store a thread lock for a shared container is somewhere inside the container --- deep inside.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Lafreniere:2000:SMD, author = "David Lafreniere", title = "State Machine Design in {C++}", journal = j-CCCUJ, volume = "18", number = "5", pages = "58--??", month = may, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:26 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0005/0005toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "It's not all that hard to implement a finite-state machine, unless it's very large, and you have to worry about multithreading, and \ldots{}.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Book{Lewis:2000:MPJ, author = "Bil Lewis and Daniel J. 
Berg", title = "Multithreaded Programming with {Java} Technology", publisher = pub-SUN-MICROSYSTEMS-PRESS, address = pub-SUN-MICROSYSTEMS-PRESS:adr, pages = "xxv + 461", year = "2000", ISBN = "0-13-017007-0", ISBN-13 = "978-0-13-017007-1", LCCN = "QA76.73.J38 L488 2000", bibdate = "Fri Apr 11 15:58:52 2003", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$39.99", series = "Sun BluePrints Program", URL = "http://www.sun.com/books/catalog/lewis3/index.html", acknowledgement = ack-nhfb, } @Article{Ling:2000:AOT, author = "Yibei Ling and Tracy Mullen and Xiaola Lin", title = "Analysis of optimal thread pool size", journal = j-OPER-SYS-REV, volume = "34", number = "2", pages = "42--55", month = apr, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:42 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Lowy:2000:MPO, author = "Juval Lowy", title = "Making Primitive Objects Thread Safe", journal = j-CCCUJ, volume = "18", number = "3", pages = "85--??", month = mar, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:25 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0003/0003toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "All sorts of things need thread locks. 
A fairly simple template or two can do the job.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @InProceedings{Matsushita:2000:MSC, author = "Satoshi Matsushita and Sunao Torii and Masahiko Nomura and Toshiaki Inoue and Atsufumi Shibayama and Sachiko Shimada and Taku Osawa and Hiroaki Inoue and Kouichiro Minami and Junji Sakai and Yoshiyuki Ito and Yuichi Nakamura and Masato Edahiro and Naoki Nishi and Masakazu Yamashina", title = "{Merlot}: {A} Single-Chip Tightly Coupled Four-Way Multi-Thread Processor", crossref = "Anonymous:2000:CCI", pages = "??--??", year = "2000", bibdate = "Mon Jan 08 05:28:04 2001", bibsource = "http://www.coolchips.org/index-cool3.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We developed an on-chip four-way multiprocessor, MP98 version 1, code-named Merlot. It is fabricated with a 0.15 $\mu$m process and has a die size of 110 mm2. Merlot is a high performance embedded processor for intelligent appliances. We extract a higher degree of parallelism with low voltage operation. In our presentation, we describe our multi-threading model. Then, we explain Merlot's pipeline architecture, focusing on fast thread creation and memory renaming. We also describe our on-chip SDRAM interface which has a throughput greater than 1 GB/sec and cache miss penalty less than 100 ns. Finally, we show a performance estimation for speech recognition and MPEG2 code, power dissipation, and average memory latency. 
Restructured speech recognition code was compiled with directives, and IPC of 2.72 is estimated.", acknowledgement = ack-nhfb, } @Article{Mount:2000:ADP, author = "John Mount", title = "Automatic Detection Of Potential Deadlock", journal = j-DDJ, volume = "25", number = "12", pages = "64, 66--70, 72", month = dec, year = "2000", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Wed Nov 8 15:09:25 MST 2000", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2000/2000_12/deadlock.txt; http://www.ddj.com/ftp/2000/2000_12/deadlock.zip", abstract = "Deadlock can occur when a number of consumers (typically threads) access a set of resources in an unacceptable pattern. To combat it, John presents a solution based on run-time lock analysis that analyzes all transactions. Additional resources include deadlock.txt (listings) and deadlock.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Nemeth:2000:AMD, author = "Zsolt N{\'e}meth", title = "Abstract machine design on a multithreaded architecture", journal = j-FUT-GEN-COMP-SYS, volume = "16", number = "6", pages = "705--716", month = apr, year = "2000", CODEN = "FGSEVI", ISSN = "0167-739X (print), 1872-7115 (electronic)", ISSN-L = "0167-739X", bibdate = "Wed Feb 27 12:41:20 MST 2002", bibsource = "http://www.elsevier.com/locate/issn/0167739X; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.com/gej-ng/10/19/19/41/29/36/abstract.html", acknowledgement = ack-nhfb, fjournal = "Future Generation Computer Systems", } @Article{Peterson:2000:CCT, author = "Mark Peterson", title = "{C/C++} Tips: Tip \#4: Self Destructing Threads", journal = j-CCCUJ, volume = "18", number = "12", pages = "44--??", month = dec, year = "2000", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:29 MDT 2002", bibsource = "http://www.cuj.com/articles/2000/0012/0012toc.htm?topic=articles; 
http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A way to make threads easier to manage.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Pulleyn:2000:EPM, author = "Ivan Pulleyn", title = "Embedding {Python} in Multi-Threaded {C\slash C++} Applications", journal = j-LINUX-J, volume = "73", pages = "??--??", month = may, year = "2000", CODEN = "LIJOFX", ISSN = "1075-3583 (print), 1938-3827 (electronic)", ISSN-L = "1075-3583", bibdate = "Thu Sep 21 07:44:12 MDT 2000", bibsource = "http://noframes.linuxjournal.com/lj-issues/issue73/index.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Python provides a clean intuitive interface to complex, threaded applications.", acknowledgement = ack-nhfb, fjournal = "Linux Journal", } @Article{Redstone:2000:AOSa, author = "Joshua A. Redstone and Susan J. Eggers and Henry M. Levy", title = "An analysis of operating system behavior on a simultaneous multithreaded architecture", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "245--256", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Redstone:2000:AOSb, author = "Joshua A. Redstone and Susan J. Eggers and Henry M.
Levy", title = "An Analysis of Operating System Behavior on a Simultaneous Multithreaded Architecture", journal = j-SIGPLAN, volume = "35", number = "11", pages = "245--256", month = nov, year = "2000", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:19 MST 2003", bibsource = "http://foothill.lcs.mit.edu/asplos2k/program.html; http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Redstone:2000:AOSc, author = "Joshua A. Redstone and Susan J. Eggers and Henry M. Levy", title = "An analysis of operating system behavior on a simultaneous multithreaded architecture", journal = j-OPER-SYS-REV, volume = "34", number = "5", pages = "245--256", month = dec, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Reinhardt:2000:TFD, author = "Steven K. Reinhardt and Shubhendu S. 
Mukherjee", title = "Transient fault detection via simultaneous multithreading", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "25--36", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Shinjo:2000:DCEa, author = "Yasushi Shinjo and Calton Pu", title = "Developing correct and efficient multithreaded programs with thread-specific data and a partial evaluator", journal = j-OPER-SYS-REV, volume = "34", number = "2", pages = "33--33", month = apr, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:42 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Shinjo:2000:DCEb, author = "Yasushi Shinjo", title = "Developing correct and efficient multithreaded programs with thread-specific data and a partial evaluator", journal = j-OPER-SYS-REV, volume = "34", number = "2", pages = "40--40", month = apr, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:42 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Snavely:2000:SJSa, author = "Allan Snavely and Dean M. 
Tullsen", title = "Symbiotic job scheduling for a simultaneous multithreaded processor", journal = j-COMP-ARCH-NEWS, volume = "28", number = "5", pages = "234--244", month = dec, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Snavely:2000:SJSb, author = "Allan Snavely and Dean M. Tullsen", title = "Symbiotic Jobscheduling for a Simultaneous Multithreading Processor", journal = j-SIGPLAN, volume = "35", number = "11", pages = "234--244", month = nov, year = "2000", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Sun Dec 14 09:18:19 MST 2003", bibsource = "http://foothill.lcs.mit.edu/asplos2k/program.html; http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Snavely:2000:SJSc, author = "Allan Snavely and Dean M. Tullsen", title = "Symbiotic jobscheduling for a simultaneous multithreaded processor", journal = j-OPER-SYS-REV, volume = "34", number = "5", pages = "234--244", month = dec, year = "2000", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Steffan:2000:SAT, author = "J. Gregory Steffan and Christopher B. Colohan and Antonia Zhai and Todd C.
Mowry", title = "A scalable approach to thread-level speculation", journal = j-COMP-ARCH-NEWS, volume = "28", number = "2", pages = "1--12", month = may, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:49 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Tan:2000:PEN, author = "Kian-Lee Tan and Cheng Hian Goh and Beng Chin Ooi", title = "Progressive evaluation of nested aggregate queries", journal = j-VLDB-J, volume = "9", number = "3", pages = "261--278", month = dec, year = "2000", CODEN = "VLDBFR", ISSN = "1066-8888 (print), 0949-877X (electronic)", ISSN-L = "1066-8888", bibdate = "Mon Jun 23 10:50:54 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In many decision-making scenarios, decision makers require rapid feedback to their queries, which typically involve aggregates. The traditional {\em blocking execution model\/} can no longer meet the demands of these users. One promising approach in the literature, called {\em online aggregation}, evaluates an aggregation query progressively as follows: as soon as certain data have been evaluated, approximate answers are produced with their respective running confidence intervals; as more data are examined, the answers and their corresponding running confidence intervals are refined. In this paper, we extend this approach to handle nested queries with aggregates (i.e., at least one inner query block is an aggregate query) by providing users with (approximate) answers progressively as the inner aggregation query blocks are evaluated. We address the new issues posed by nested queries.
In particular, the answer space begins with a superset of the final answers and is refined as the aggregates from the inner query blocks are refined. For the intermediary answers to be meaningful, they have to be interpreted with the aggregates from the inner queries. We also propose a {\em multi-threaded model\/} in evaluating such queries: each query block is assigned to a thread, and the threads can be evaluated concurrently and independently. The time slice across the threads is {\em nondeterministic\/} in the sense that the user controls the relative rate at which these subqueries are being evaluated. For {\em enumerative\/} nested queries, we propose a priority-based evaluation strategy to present answers that are certainly in the final answer space first, before presenting those whose validity may be affected as the inner query aggregates are refined. We implemented a prototype system using Java and evaluated our system. Results for nested queries with a level and multiple levels of nesting are reported. 
Our results show the effectiveness of the proposed mechanisms in providing progressive feedback that reduces the initial waiting time of users significantly without sacrificing the quality of the answers.", acknowledgement = ack-nhfb, fjournal = "VLDB Journal: Very Large Data Bases", keywords = "approximate answers; multi-threading; nested aggregate queries; online aggregation; progressive query processing", } @Article{Tang:2000:PTR, author = "Hong Tang and Kai Shen and Tao Yang", title = "Program transformation and runtime support for threaded {MPI} execution on shared-memory machines", journal = j-TOPLAS, volume = "22", number = "4", pages = "673--700", year = "2000", CODEN = "ATPSDT", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Apr 17 10:05:24 MDT 2001", bibsource = "http://www.acm.org/pubs/toc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.acm.org/pubs/citations/journals/toplas/2000-22-4/p673-tang/", abstract = "Parallel programs written in MPI have been widely used for developing high-performance applications on various platforms. Because of a restriction of the MPI computation model, conventional MPI implementations on shared-memory machines map each MPI node to an OS process, which can suffer serious performance degradation in the presence of multiprogramming. This paper studies compile-time and runtime techniques for enhancing performance portability of MPI code running on multiprogrammed shared-memory machines. The proposed techniques allow MPI nodes to be executed safely and efficiently as threads. Compile-time transformation eliminates global and static variables in C code using node-specific data. The runtime support includes an efficient and provably correct communication protocol that uses lock-free data structure and takes advantage of address space sharing among threads.
The experiments on SGI Origin 2000 show that our MPI prototype called TMPI using the proposed techniques is competitive with SGI's native MPI implementation in a dedicated environment, and that it has significant performance advantages in a multiprogrammed environment.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Programming Languages and Systems", generalterms = "Algorithms; Design; Experimentation; Languages; Performance", keywords = "lock-free synchronization; MPI; multiprogrammed environments; program transformation; shared-memory machines; threaded execution", subject = "Hardware --- Memory Structures --- Design Styles (B.3.2): {\bf Shared memory}; Software --- Programming Techniques --- Concurrent Programming (D.1.3): {\bf Parallel programming}; Software --- Programming Languages --- Language Classifications (D.3.2): {\bf Concurrent, distributed, and parallel languages}; Software --- Programming Languages --- Processors (D.3.4): {\bf Preprocessors}; Software --- Programming Languages --- Processors (D.3.4): {\bf Run-time environments}; Software --- Operating Systems --- Process Management (D.4.1): {\bf Multiprocessing/multiprogramming/multitasking}; Data --- Data Structures (E.1): {\bf Lists, stacks, and queues}", } @Article{Unger:2000:CCA, author = "A. Unger and E. Zehendner and Th. 
Ungerer", title = "A combined compiler and architecture technique to control multithreaded execution of branches and loop iterations", journal = j-COMP-ARCH-NEWS, volume = "28", number = "1", pages = "53--61", month = mar, year = "2000", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:36 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Vishkin:2000:ELR, author = "Dascal Vishkin and Uzi Vishkin", title = "Experiments with list ranking for explicit multi-threaded {(XMT)} instruction parallelism", journal = j-ACM-J-EXP-ALGORITHMICS, volume = "5", pages = "10:1--10:??", month = "????", year = "2000", CODEN = "????", DOI = "http://doi.acm.org/10.1145/351827.384252", ISSN = "1084-6654", bibdate = "Mon Oct 6 16:03:09 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Algorithms for the problem of list ranking are empirically studied with respect to the Explicit Multi-Threaded (XMT) platform for instruction-level parallelism (ILP). The main goal of this study is to understand the differences between XMT and more traditional parallel computing implementation platforms/models as they pertain to the well studied list ranking problem. The main two findings are: (i) good speedups for much smaller inputs are possible and (ii) in part, the first finding is based on a new variant of a 1984 algorithm, called the No-Cut algorithm. The paper incorporates analytic (non-asymptotic) performance analysis into experimental performance analysis for relatively small inputs. This provides an interesting example where experimental research and theoretical analysis complement one another. Explicit Multi-Threading (XMT) is a fine-grained computation framework introduced in our SPAA'98 paper. 
Building on some key ideas of parallel computing, XMT covers the spectrum from algorithms through architecture to implementation; the main implementation related innovation in XMT was through the incorporation of low-overhead hardware and software mechanisms (for more effective fine-grained parallelism). The reader is referred to that paper for detail on these mechanisms. The XMT platform aims at faster single-task completion time by way of ILP.",
  acknowledgement = ack-nhfb,
  articleno = "10",
  fjournal = "Journal of Experimental Algorithmics (JEA)",
}

@Book{Walmsley:2000:MTP,
  author = "Mark Walmsley",
  title = "Multi-threaded programming in {C++}",
  publisher = pub-SV,
  address = pub-SV:adr,
  pages = "x + 223",
  year = "2000",
  ISBN = "1-85233-146-1",
  ISBN-13 = "978-1-85233-146-7",
  LCCN = "QA76.73.C153 W3148 2000",
  bibdate = "Sat Apr 20 11:14:00 2002",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  price = "US\$49.95",
  acknowledgement = ack-nhfb,
}

@Article{Wilson:2000:PBC,
  author = "Gregory V. Wilson",
  title = "Programmer's Bookshelf: Classics Old and New",
  journal = j-DDJ,
  volume = "25",
  number = "11",
  pages = "159--160",
  month = nov,
  year = "2000",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Wed Nov 8 15:09:25 MST 2000",
  bibsource = "http://www.ddj.com/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "This month Greg looks at Programming Pearls, Second Edition, by Jon Bentley; Foundations of Multithreaded, Parallel, and Distributed Programming, by Gregory R. Andrews; GUI Bloopers, by Jeff Johnson; The Humane Interface, by Jef Raskin; Legal Battles That Shaped the Software Industry, by Lawrence D. Graham; The World of Scripting Languages, by David Barron; C for Java Programmers, by Tomasz Muldner; and XML Elements of Style, by Simon St. Laurent.",
  acknowledgement = ack-nhfb,
  fjournal = "Dr.
Dobb's Journal of Software Tools", } @Article{Zhang:2000:WMH, author = "Peter Zhang", title = "{Webrelay}: {A} Multithreaded {HTTP} Relay Server", journal = j-DDJ, volume = "25", number = "2", pages = "86, 88, 90--94, 96", month = feb, year = "2000", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Nov 9 08:25:13 MST 2000", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2000/2000_02/webrelay.txt; http://www.ddj.com/ftp/2000/2000_02/webrelay.zip", abstract = "Webrelay is a freely available multithreaded HTTP relay server that authenticates that clients are legitimate users before they are connected to vendor web servers. Additional resources include webrelay.txt (listings) and webrelay.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Antoniu:2001:HSC, author = "Gabriel Antoniu and others", title = "The {Hyperion} system: {Compiling} multithreaded {Java} bytecode for distributed execution", journal = j-PARALLEL-COMPUTING, volume = "27", number = "10", pages = "1279--1297", month = sep, year = "2001", CODEN = "PACOEJ", ISSN = "0167-8191", ISSN-L = "0167-8191", bibdate = "Fri Feb 22 16:52:42 MST 2002", bibsource = "http://www.elsevier.com/locate/issn/01678191; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.com/gej-ng/10/35/21/47/40/27/abstract.html; http://www.elsevier.nl/gej-ng/10/35/21/47/40/27/article.pdf", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", } @Article{Attali:2001:GVJ, author = "Isabelle Attali and Denis Caromel and Marjorie Russo", title = "Graphical Visualization of {Java} Objects, Threads, and Locks", journal = j-IEEE-DISTRIB-SYST-ONLINE, volume = "2", number = "1", year = "2001", ISSN = "1541-4922 (print), 1558-1683 (electronic)", ISSN-L = "1541-4922", bibdate = "Wed Oct 23 17:47:56 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = 
"http://dsonline.computer.org/0101/features/att0101_print.htm", acknowledgement = ack-nhfb, fjournal = "IEEE Distributed Systems Online", } @Article{Becker:2001:SMW, author = "Thomas Becker", title = "Synchronization Monitors For {Win32}", journal = j-DDJ, volume = "26", number = "12", pages = "46, 48, 50--52, 54", month = dec, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Feb 12 05:21:41 MST 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2001/2001_12/monitor.txt; http://www.ddj.com/ftp/2001/2001_12/monitor.zip", abstract = "Thomas presents a Java-style synchronization monitor for multithreaded Win32 development. Additional resources include {\tt monitor.txt} (listings) and {\tt monitor.zip} (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Bull:2001:MSO, author = "J. Mark Bull and Darragh O'Neill", title = "A microbenchmark suite for {OpenMP 2.0}", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "41--48", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Book{Chandra:2001:PPO, author = "Rohit Chandra and Leonardo Dagum and David Kohr and Dror Maydan and Jeff McDonald and Ramesh Menon", title = "Parallel Programming in {OpenMP}", publisher = pub-MORGAN-KAUFMANN, address = pub-MORGAN-KAUFMANN:adr, pages = "xvi + 230", year = "2001", ISBN = "1-55860-671-8", ISBN-13 = "978-1-55860-671-5", LCCN = "QA76.642 .P38 2001", bibdate = "Thu Jul 14 11:09:17 2005", bibsource = "ftp://ftp.math.utah.edu/pub/tex/bib/multithreading.bib; ftp://ftp.math.utah.edu/pub/tex/bib/pvm.bib; ftp://ftp.math.utah.edu/pub/tex/bib/unix.bib; 
http://www.math.utah.edu/pub/tex/bib/multithreading.bib", price = "US\$39.95", URL = "http://www.mkp.com/books_catalog/catalog.asp?ISBN=1-55860-671-8", acknowledgement = ack-nhfb, keywords = "parallel programming (computer science)", } @Article{Duncan:2001:LPD, author = "Ray Duncan and Duncan Harris and Douglas Reilly and Craig Rodrigues and Michael Birken and Paul S. Person", title = "Letters: Plug-in Desupport; Threading and the {.Net} Framework; {CORBA} Interoperability; Game Over for {Java}; Totally Wired", journal = j-DDJ, volume = "26", number = "11", pages = "10, 12", month = nov, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Tue Feb 12 05:21:40 MST 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @InProceedings{Edelstein:2001:MJP, author = "Orit Edelstein and Eitan Farchi and Yarden Nir and Gil Ratsaby and Shmuel Ur", title = "Multithreaded {Java} Program Test Generation", crossref = "ACM:2001:PAJ", pages = "181--181", year = "2001", bibdate = "Mon May 06 09:31:01 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.philippsen.com/JGI2001/camerareadyabstracts/18.html; http://www.philippsen.com/JGI2001/finalpapers/18500181.ps", acknowledgement = ack-nhfb, keywords = "Java", } @Article{Evripidou:2001:MDD, author = "Paraskevas Evripidou", title = "{$D^3$-Machine}: {A} decoupled data-driven multithreaded architecture with variable resolution support", journal = j-PARALLEL-COMPUTING, volume = "27", number = "9", pages = "1197--1225", month = aug, year = "2001", CODEN = "PACOEJ", ISSN = "0167-8191", ISSN-L = "0167-8191", bibdate = "Wed Jul 18 06:31:16 MDT 2001", bibsource = "http://www.elsevier.com/locate/issn/01678191; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.nl/gej-ng/10/35/21/47/35/25/abstract.html; 
http://www.elsevier.nl/gej-ng/10/35/21/47/35/25/article.pdf",
  acknowledgement = ack-nhfb,
  fjournal = "Parallel Computing",
}

@Article{Garber:2001:NBT,
  author = "Lee Garber",
  title = "News Briefs: Is Tech Downturn Changing Education and Employment Trends; {HTMT} Promises High-Performance Computing; Controversial Software Law [{UCITA}] Hits Resistance",
  journal = j-COMPUTER,
  volume = "34",
  number = "10",
  pages = "19--21",
  month = oct,
  year = "2001",
  CODEN = "CPTRB4",
  ISSN = "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L = "0018-9162",
  bibdate = "Fri Feb 8 07:11:46 MST 2002",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://dlib.computer.org/co/books/co2001/pdf/rx019.pdf; http://www.computer.org/computer/co2001/rx019abs.htm",
  acknowledgement = ack-nhfb,
  fjournal = "Computer",
  keywords = "hybrid technology multithreaded architecture (HTMT); Uniform Computer Information Transactions Act (UCITA)",
}

@Article{Geiselbrecht:2001:NOS,
  author = "Travis K. Geiselbrecht",
  title = "The {NewOS} Operating System",
  journal = j-DDJ,
  volume = "26",
  number = "12",
  pages = "33, 35, 38, 40, 42, 44",
  month = dec,
  year = "2001",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Tue Feb 12 05:21:41 MST 2002",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note = "See correction \cite{Editors:2002:LUC}.",
  URL = "http://www.ddj.com/ftp/2001/2001_12/newos.txt; http://www.ddj.com/ftp/2001/2001_12/newos.zip",
  abstract = "NewOS is a freely available lightweight operating system written in C for platforms ranging from Intel- and AMD-based PCs to the Sega Dreamcast. Additional resources include {\tt newos.txt} (listings) and {\tt newos.zip} (source code).",
  acknowledgement = ack-nhfb,
  fjournal = "Dr.
Dobb's Journal of Software Tools", } @Article{Goeschl:2001:JTT, author = "Siegfried Goeschl", title = "The {JUnit++} Testing Tool", journal = j-DDJ, volume = "26", number = "2", pages = "34, 36--38", month = feb, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Feb 15 12:14:41 MST 2001", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2001/2001_02/junitpp.txt; http://www.ddj.com/ftp/2001/2001_02/junitpp.zip", abstract = "JUnit++ is a freely available Java unit test framework that includes a test data repository, command-line arguments, and a TestRunner class that supports a built-in repetition counter and multithreading at the command line. Additional resources include junitpp.txt (listings) and junitpp.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @InProceedings{Hanson:2001:UFI, author = "Richard J. Hanson and Clay P. Breshears and Henry A. Gabb", title = "Using a {Fortran} Interface to {POSIX} Threads", crossref = "Boisvert:2001:ASS", pages = "257--272", year = "2001", bibdate = "Sat Dec 29 09:54:37 2007", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Huber:2001:EFC, author = "Andreas Huber", title = "Elegant Function Call Wrappers", journal = j-CCCUJ, volume = "19", number = "5", pages = "8--??", month = may, year = "2001", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:31 MDT 2002", bibsource = "http://www.cuj.com/articles/2001/0105/0105toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Scheduling functions for later execution is an obvious requirement in multithreaded programs. How to do that and preserve both type safety and modularity is not so obvious. 
The author combines an old pattern and some new template techniques to pull it off rather nicely.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Kakulavarapu:2001:DLB, author = "P. Kakulavarapu and O. C. Maquelin and J. N. Amaral and G. R. Gao", title = "Dynamic Load Balancers for a Multithreaded Multiprocessor System", journal = j-PARALLEL-PROCESS-LETT, volume = "11", number = "1", pages = "169--??", month = mar, year = "2001", CODEN = "PPLTEE", ISSN = "0129-6264", bibdate = "Sat Feb 23 19:27:51 MST 2002", bibsource = "http://ejournals.wspc.com.sg/ppl/ppl.shtml; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Parallel Processing Letters", } @Article{Kienzle:2001:CTT, author = "J{\"o}rg Kienzle and Alexander Romanovsky", title = "Combining tasking and transactions, part {II}: open multithreaded transactions", journal = j-SIGADA-LETTERS, volume = "21", number = "1", pages = "67--74", month = mar, year = "2001", CODEN = "AALEE5", ISSN = "0736-721X", ISSN-L = "0736-721X", bibdate = "Sat Aug 9 09:06:10 MDT 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGAda Ada Letters", } @Article{Kienzle:2001:IEO, author = "J{\"o}rg Kienzle and Alexander Romanovsky", title = "Implementing exceptions in open multithreaded transactions based on {Ada 95} exceptions", journal = j-SIGADA-LETTERS, volume = "21", number = "3", pages = "57--63", month = sep, year = "2001", CODEN = "AALEE5", ISSN = "0736-721X", ISSN-L = "0736-721X", bibdate = "Sat Aug 9 09:06:11 MDT 2003", bibsource = "http://portal.acm.org/; http://www.acm.org/sigada/ada_letters/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGAda Ada Letters", } @Article{Lopes:2001:FGM, author = "L. Lopes and V. T. Vasconcelos and F. 
Silva", title = "Fine-grained multithreading with process calculi", journal = j-IEEE-TRANS-COMPUT, volume = "50", number = "8", pages = "852--862", month = aug, year = "2001", CODEN = "ITCOB4", DOI = "http://dx.doi.org/10.1109/12.947014", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Tue Jul 5 10:03:11 MDT 2011", bibsource = "http://www.computer.org/tc/; http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=947014", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", } @Article{Luk:2001:TML, author = "Chi-Keung Luk", title = "Tolerating memory latency through software-controlled pre-execution in simultaneous multithreading processors", journal = j-COMP-ARCH-NEWS, volume = "29", number = "2", pages = "40--51", month = may, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @InProceedings{Manson:2001:CSM, author = "Jeremy Manson and William Pugh", title = "Core Semantics of Multithreaded {Java}", crossref = "ACM:2001:PAJ", pages = "29--38", year = "2001", bibdate = "Mon May 06 09:31:01 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.philippsen.com/JGI2001/camerareadyabstracts/42.html; http://www.philippsen.com/JGI2001/finalpapers/18500029.pdf", acknowledgement = ack-nhfb, keywords = "Java", } @Article{Nagle:2001:MFV, author = "Dan Nagle", title = "Multithreading, {Fthreads}, and {Visual Fortran}", journal = j-DDJ, volume = "26", number = "7", pages = "36, 38, 40", month = jul, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jun 7 06:07:17 MDT 
2001", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2001/2001_07/fthreads.zip", abstract = "Dan presents a Fortran module that helps you write multithreaded programs for Windows-based applications. Additional resources include fthreads.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Nakhimovsky:2001:ISM, author = "Greg Nakhimovsky", title = "Improving Scalability Of Multithreaded Dynamic Memory Allocation", journal = j-DDJ, volume = "26", number = "7", pages = "44, 46, 48--50, 52, 54", month = jul, year = "2001", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Thu Jun 7 06:07:17 MDT 2001", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.ddj.com/ftp/2001/2001_07/mthot.txt; http://www.ddj.com/ftp/2001/2001_07/mthot.zip", abstract = "Multiprocessor/multithreaded environments add a new dimension to the familiar malloc facility. The ``MT-hot'' implementation Greg presents here lets multiple threads execute in parallel without major delays. Additional resources include mthot.txt (listings) and mthot.zip (source code).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Nikolopoulos:2001:EMA, author = "D. S. Nikolopoulos and E. Artiaga and E. Ayguad{\'e} and J. 
Labarta", title = "Exploiting memory affinity in {OpenMP} through schedule reuse", journal = j-COMP-ARCH-NEWS, volume = "29", number = "5", pages = "49--55", month = dec, year = "2001", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:22 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @InProceedings{Pang:2001:PSR, author = "James Pang and Gholamali Shoja and Eric Manning", title = "Providing Soft Real-time {QoS} Guarantees for {Java} Threads", crossref = "ACM:2001:PAJ", pages = "39--46", year = "2001", bibdate = "Mon May 06 09:31:01 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.philippsen.com/JGI2001/camerareadyabstracts/21.html; http://www.philippsen.com/JGI2001/finalpapers/18500039.pdf", acknowledgement = ack-nhfb, keywords = "Java", } @Article{Parcerisa:2001:ILT, author = "J.-M. Parcerisa and A. Gonzalez", title = "Improving latency tolerance of multithreading through decoupling", journal = j-IEEE-TRANS-COMPUT, volume = "50", number = "10", pages = "1084--1094", month = oct, year = "2001", CODEN = "ITCOB4", DOI = "http://dx.doi.org/10.1109/12.956093", ISSN = "0018-9340 (print), 1557-9956 (electronic)", ISSN-L = "0018-9340", bibdate = "Tue Jul 5 10:03:12 MDT 2011", bibsource = "http://www.computer.org/tc/; http://www.math.utah.edu/pub/tex/bib/ieeetranscomput2000.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=956093", acknowledgement = ack-nhfb, fjournal = "IEEE Transactions on Computers", } @Article{Plakal:2001:CGC, author = "Manoj Plakal and Charles N. 
Fischer",
  title = "Concurrent Garbage Collection Using Program Slices on Multithreaded Processors",
  journal = j-SIGPLAN,
  volume = "36",
  number = "1",
  pages = "94--100",
  month = jan,
  year = "2001",
  CODEN = "SINODQ",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Sun Dec 14 09:18:22 MST 2003",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "ACM SIGPLAN International Symposium on Memory Management (ISMM'00)",
}

@Article{Pyarali:2001:EOT,
  author = "Irfan Pyarali and Marina Spivak and Ron Cytron and Douglas C. Schmidt",
  title = "Evaluating and Optimizing Thread Pool Strategies for Real-Time {CORBA}",
  journal = j-SIGPLAN,
  volume = "36",
  number = "8",
  pages = "214--222",
  month = aug,
  year = "2001",
  CODEN = "SINODQ",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Sun Dec 14 09:18:29 MST 2003",
  bibsource = "http://portal.acm.org/; http://www.cs.wisc.edu/~bodik/om2001/program.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote = "OM'01: The First Workshop on Optimization of Middleware and Distributed Systems",
  fjournal = "ACM SIGPLAN Notices",
}

@Article{Reilly:2001:TNF,
  author = "Douglas Reilly",
  title = "Threading and the {.Net} Framework",
  journal = j-DDJ,
  volume = "26",
  number = "8",
  pages = "30, 32--33, 36, 38",
  month = aug,
  year = "2001",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Wed Jul 11 06:31:35 MDT 2001",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.ddj.com/ftp/2001/2001_08/thrednet.txt",
  abstract = "Microsoft's .NET Framework offers a number of features, such as threading, that simplify difficult tasks. Additional resources include thrednet.txt (listings).",
  acknowledgement = ack-nhfb,
  fjournal = "Dr.
Dobb's Journal of Software Tools",
}

@Article{Salcianu:2001:PEA,
  author = "Alexandru Salcianu and Martin Rinard",
  title = "Pointer and escape analysis for multithreaded programs",
  journal = j-SIGPLAN,
  volume = "36",
  number = "7",
  pages = "12--23",
  month = jul,
  year = "2001",
  CODEN = "SINODQ",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Sun Dec 14 09:18:28 MST 2003",
  bibsource = "http://portal.acm.org/; http://www.acm.org/pubs/contents/proceedings/series/ppopp/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org/pubs/articles/proceedings/ppopp/379539/p12-salcianu/p12-salcianu.pdf; http://www.acm.org/pubs/citations/proceedings/ppopp/379539/p12-salcianu/",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
}

@Article{Steensgaard:2001:TSH,
  author = "Bjarne Steensgaard",
  title = "Thread-Specific Heaps for Multi-Threaded Programs",
  journal = j-SIGPLAN,
  volume = "36",
  number = "1",
  pages = "18--24",
  month = jan,
  year = "2001",
  CODEN = "SINODQ",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Sun Dec 14 09:18:22 MST 2003",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "ACM SIGPLAN International Symposium on Memory Management (ISMM'00)",
}

@Article{Sung:2001:MDA,
  author = "Michael Sung and Ronny Krashinsky and Krste Asanovi{\'c}",
  title = "Multithreading decoupled architectures for complexity-effective general purpose computing",
  journal = j-COMP-ARCH-NEWS,
  volume = "29",
  number = "5",
  pages = "56--61",
  month = dec,
  year = "2001",
  CODEN = "CANED2",
  ISSN = "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:22 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

@Article{Adiletta:2002:NGI,
  author = "Matthew Adiletta and Mark Rosenbluth and Debra Bernstein and Gilbert Wolrich and Hugh Wilkinson",
  title = "The Next Generation of {Intel IXP} Network Processors",
  journal = j-INTEL-TECH-J,
  volume = "6",
  number = "3",
  pages = "6--18",
  day = "15",
  month = aug,
  year = "2002",
  ISSN = "1535-766X",
  bibdate = "Sun Nov 17 11:06:06 2002",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://developer.intel.com/technology/itj/2002/volume06issue03/art01_nextgenixp/p01_abstract.htm; http://developer.intel.com/technology/itj/2002/volume06issue03/art01_nextgenixp/vol6iss3_art01.pdf",
  acknowledgement = ack-nhfb,
  fjournal = "Intel Technology Journal",
  keywords = "10Gb/s; ATM; communication architecture; Ethernet; IXP; microprocessor architecture; multi-processors; multi-service switches; multi-threading; network processors; OC-192; OC-48; routing; switching",
}

@Article{Adiletta:2002:PSA,
  author = "Matthew Adiletta and Donald Hooper and Myles Wilde",
  title = "Packet over {SONET}: Achieving 10 {Gigabit}/sec Packet Processing with an {IXP2800}",
  journal = j-INTEL-TECH-J,
  volume = "6",
  number = "3",
  pages = "29--39",
  day = "15",
  month = aug,
  year = "2002",
  ISSN = "1535-766X",
  bibdate = "Sun Nov 17 11:06:06 2002",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://developer.intel.com/technology/itj/2002/volume06issue03/art05_packetoversonet/p01_abstract.htm; http://developer.intel.com/technology/itj/2002/volume06issue03/art05_packetoversonet/vol6iss3_art05.pdf",
  acknowledgement = ack-nhfb,
  fjournal = "Intel Technology Journal",
  keywords = "10Gbs; ATM; communication architecture; Ethernet; hardware-based multi-threading; IXP; microprocessor architecture; multi-processors; multi-service switches; network processors; OC-192; OC-48; routing; switching",
}

@Article{Anonymous:2002:ST,
  author = "Anonymous",
  title = "Speculative threads",
  journal = j-COMP-ARCH-NEWS,
  volume = "30",
  number = "5",
  pages = "??--??",
  month = dec,
  year = "2002",
  CODEN = "CANED2",
  ISSN = "0163-5964
(print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri May 12 09:41:23 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

@Article{Boudol:2002:NCP,
  author = "G{\'e}rard Boudol and Ilaria Castellani",
  title = "Noninterference for concurrent programs and thread systems",
  journal = j-THEOR-COMP-SCI,
  volume = "281",
  number = "1--2",
  pages = "109--130",
  month = may,
  year = "2002",
  CODEN = "TCSCDI",
  ISSN = "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L = "0304-3975",
  bibdate = "Wed Nov 20 18:08:56 MST 2002",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "Theoretical Computer Science",
}

@Article{Bouge:2002:IRE,
  author = "L. Boug{\'e} and V. Danjean and R. Namyst",
  title = "Improving Reactivity to {I/O} Events in Multithreaded Environments Using a Uniform, Scheduler-Centric {API}",
  journal = j-LECT-NOTES-COMP-SCI,
  volume = "2400",
  pages = "605--??",
  year = "2002",
  CODEN = "LNCSD9",
  ISSN = "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L = "0302-9743",
  bibdate = "Thu Sep 12 08:40:04 2002",
  bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2400.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2400/24000605.htm; http://link.springer-ny.com/link/service/series/0558/papers/2400/24000605.pdf",
  acknowledgement = ack-nhfb,
  fjournal = "Lecture Notes in Computer Science",
}

@Article{Brebner:2002:MLC,
  author = "Gordon Brebner",
  title = "Multithreading for Logic-Centric Systems",
  journal = j-LECT-NOTES-COMP-SCI,
  volume = "2438",
  pages = "5--??",
  year = "2002",
  CODEN = "LNCSD9",
  ISSN = "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L = "0302-9743",
  bibdate = "Tue Sep 10 19:10:28 MDT 2002",
  bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2438.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2438/24380005.htm; http://link.springer-ny.com/link/service/series/0558/papers/2438/24380005.pdf",
  acknowledgement = ack-nhfb,
  fjournal = "Lecture Notes in Computer Science",
}

@MastersThesis{Callaway:2002:VTR,
  author = "John Callaway",
  title = "Visualization of threads in a running {Java} program",
  type = "Thesis (M.S.)",
  school = "University of California, Santa Cruz",
  address = "Santa Cruz, CA, USA",
  year = "2002",
  LCCN = "QA76.73.J38 C36 2002",
  bibdate = "Tue May 6 05:26:58 MDT 2003",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords = "academic dissertations -- University of California, Santa Cruz -- 2002; academic dissertations -- University of California, Santa Cruz -- computer science; computer science; computer software -- development; Java (computer program language); object-oriented programming (computer science); software engineering; visualization",
}

@Article{Carothers:2002:CMP,
  author = "Christopher D. Carothers and Boleslaw K. Szymanski",
  title = "Checkpointing Multithreaded Programs",
  journal = j-DDJ,
  volume = "27",
  number = "8",
  pages = "??--??",
  month = aug,
  year = "2002",
  CODEN = "DDJOEB",
  ISSN = "1044-789X",
  bibdate = "Fri Sep 13 06:15:52 MDT 2002",
  bibsource = "http://www.ddj.com/articles/2002/0208/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.ddj.com/ftp/2002/2002_08/checkpt.txt",
  abstract = "Checkpointing is the process by which you grab snapshots of running programs. Additional resources include checkpt.txt (listings).",
  acknowledgement = ack-nhfb,
  fjournal = "Dr.
Dobb's Journal of Software Tools",
}

@Article{Cazals:2002:NID,
  author = "Fr{\'e}d{\'e}ric Cazals",
  title = "Non-Intrusive Debugging and Incremental Visualization with the Geometric Stethoscope",
  journal = j-J-GRAPHICS-TOOLS,
  volume = "7",
  number = "2",
  pages = "27--40",
  year = "2002",
  CODEN = "JGTOFD",
  ISSN = "1086-7651",
  bibdate = "Tue Dec 16 13:47:48 2003",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.acm.org/jgt/papers/Cazals02/",
  abstract = "Developing and debugging geometric applications is known to be a difficult task: The calculations and data structures can be involved, there are degenerate cases and numerical issues, etc. This paper presents a software setup aiming at easing the development, the debugging, and the maintenance of geometric applications. \par More precisely, {\em incremental visualization\/} is defined as the possibility for the programmer to visualize interactively any significant update of the geometric data structures at any time. {\em Non-intrusive debugging\/} is defined as the possibility of visualizing any geometric entity in three dimensions from a standard debugger at any time without modifying the source code. We present a setup to perform incremental visualization and non-intrusive debugging. This setup is based on multithreading and requires a three-dimensional viewer, such as Open Inventor, Vtk, or Geomview, and a standard debugger (dbx or gdb). \par An Open Inventor based C++ implementation of this setup accompanies this paper. Using it simply requires writing the functions converting the user's data structures into Open Inventor's data structures. The setup could easily be extended to accommodate other media such as sound, video, etc.",
  acknowledgement = ack-nhfb,
  fjournal = "Journal of Graphics Tools: JGT",
}

@Article{Chappell:2002:DPB,
  author = "Robert S. Chappell and Francis Tseng and Adi Yoaz and Yale N.
Patt", title = "Difficult-path branch prediction using subordinate microthreads", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "307--317", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Chaudhry:2002:PTS, author = "Puneesh Chaudhry", title = "A Per-Thread Singleton Class", journal = j-CCCUJ, volume = "20", number = "5", pages = "14--??", month = may, year = "2002", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:36 MDT 2002", bibsource = "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A refreshing look at an old pattern.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Choi:2002:EPD, author = "Jong-Deok Choi and Keunwoo Lee and Alexey Loginov and Robert O'Callahan and Vivek Sarkar and Manu Sridharan", title = "Efficient and precise datarace detection for multithreaded object-oriented programs", journal = j-SIGPLAN, volume = "37", number = "5", pages = "258--269", month = may, year = "2002", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu May 15 12:23:02 MDT 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Clark:2002:AMT, author = "Keith Clark and Peter J. 
Robinson",
  title = "Agents as Multi-threaded Logical Objects",
  journal = j-LECT-NOTES-COMP-SCI,
  volume = "2407",
  pages = "33--??",
  year = "2002",
  CODEN = "LNCSD9",
  ISSN = "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L = "0302-9743",
  bibdate = "Tue Sep 10 19:10:17 MDT 2002",
  bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2407.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://link.springer-ny.com/link/service/series/0558/bibs/2407/24070033.htm; http://link.springer-ny.com/link/service/series/0558/papers/2407/24070033.pdf",
  acknowledgement = ack-nhfb,
  fjournal = "Lecture Notes in Computer Science",
}

@InProceedings{Ding:2002:MOP,
  author = "Yun He and Chris H. Q. Ding",
  keywords = "multidimensional arrays; index reshuffle; vacancy tracking cycles; global exchange; dynamical remapping; MPI; OpenMP; hybrid MPI/OpenMP; SMP cluster",
  title = "{MPI} and {OpenMP} Paradigms on Cluster of {SMP} Architectures",
  crossref = "IEEE:2002:STI",
  pages = "??--??",
  year = "2002",
  bibdate = "Wed Nov 26 07:34:20 2003",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.sc-2002.org/paperpdfs/pap.pap325.pdf",
  abstract = "We investigate remapping multi-dimensional arrays on cluster of SMP architectures under OpenMP, MPI, and hybrid paradigms. Traditional method of array transpose needs an auxiliary array of the same size and a copy back stage. We recently developed an in-place method using vacancy tracking cycles. The vacancy tracking algorithm outperforms the traditional 2-array method as demonstrated by extensive comparisons. The independence of vacancy tracking cycles allows efficient parallelization of the in-place method on SMP architectures at node level. Performance of multi-threaded parallelism using OpenMP are tested with different scheduling methods and different number of threads. The vacancy tracking method is parallelized using several parallel paradigms.
At node level, pure OpenMP outperforms pure MPI by a factor of 2.76. Across entire cluster of SMP nodes, the hybrid MPI/OpenMP implementation outperforms pure MPI by a factor of 4.44, demonstrating the validity of the parallel paradigm of mixing MPI with OpenMP.", acknowledgement = ack-nhfb, } @Article{Donnelly:2002:LTT, author = "Austin Donnelly", title = "Lightweight Thread Tunnelling in Network Applications", journal = j-LECT-NOTES-COMP-SCI, volume = "2546", pages = "48--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Nov 30 20:58:13 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2546.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.de/link/service/series/0558/bibs/2546/25460048.htm; http://link.springer.de/link/service/series/0558/papers/2546/25460048.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Edelstein:2002:MJP, author = "O. Edelstein and E. Farchi and Y. Nir and G. Ratsaby and S. Ur", title = "Multithreaded {Java} program test generation", journal = j-IBM-SYS-J, volume = "41", number = "1", pages = "111--125", month = "????", year = "2002", CODEN = "IBMSA7", ISSN = "0018-8670", bibdate = "Tue Feb 12 17:23:05 MST 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/sj/411/edelstein.html; http://www.research.ibm.com/journal/sj/411/edelstein.pdf", acknowledgement = ack-nhfb, fjournal = "IBM Systems Journal", ordernumber = "G321-0144", } @Article{Editors:2002:LUC, author = "{The Editors} and Kim Reidar Lantz and Ze'ev Atlas and Pete Nelson and Gus J. 
Grubba",
  title =        "Letters: {URL} Correction
                 [``{The NewOS Operating System}'']; Passing Context to
                 Threads; Compiling {Perl\slash Tk} Scripts; Standing by
                 {Al}'s Principles; Understanding Photomosaics",
  journal =      j-DDJ,
  volume =       "27",
  number =       "1",
  pages =        "10, 12",
  month =        jan,
  year =         "2002",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Tue Feb 12 05:21:41 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "See \cite{Geiselbrecht:2001:NOS}.",
  URL =          "http://www.ddj.com/",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@InProceedings{El-Ghazawi:2002:UPP,
  author =       "Tarek El-Ghazawi and Fran{\c{c}}ois Cantonnet",
  title =        "{UPC} Performance and Potential: {A} {NPB}
                 Experimental Study",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap316.pdf",
  abstract =     "UPC, or Unified Parallel C, is a parallel extension of
                 ANSI C. UPC follows a distributed shared memory
                 programming model aimed at leveraging the ease of
                 programming of the shared memory paradigm, while
                 enabling the exploitation of data locality. UPC
                 incorporates constructs that allow placing data near
                 the threads that manipulate them to minimize remote
                 accesses. This paper gives an overview of the concepts
                 and features of UPC and establishes, through extensive
                 performance measurements of NPB workloads, the
                 viability of the UPC programming language compared to
                 the other popular paradigms. Further, through
                 performance measurements we identify the challenges,
                 the remaining steps and the priorities for UPC. It will
                 be shown that with proper hand tuning and optimized
                 collective operations libraries, UPC performance will
                 be comparable to that of MPI. Furthermore, by
                 incorporating such improvements into automatic compiler
                 optimizations, UPC will compare quite favorably to
                 message passing in ease of programming.",
  acknowledgement = ack-nhfb,
  keywords =     "NPB (NAS Parallel Benchmark)",
}

@Article{Feuerstein:2002:LMT,
  author =       "E. Feuerstein and A. Strejilevich de Loma",
  title =        "On-Line Multi-Threaded Paging",
  journal =      j-ALGORITHMICA,
  volume =       "32",
  number =       "1",
  pages =        "36--60",
  month =        jan,
  year =         "2002",
  CODEN =        "ALGOEJ",
  DOI =          "http://www.springerlink.com/openurl.asp?genre=article&id=doi:10.1007/s00453-001-0073-z",
  ISSN =         "0178-4617 (print), 1432-0541 (electronic)",
  ISSN-L =       "0178-4617",
  MRclass =      "68N25 (68Q10 68W05)",
  MRnumber =     "MR1867023 (2002h:68033)",
  bibdate =      "Fri Jan 6 11:38:14 MST 2006",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0178-4617&volume=32&issue=1;
                 http://www.math.utah.edu/pub/tex/bib/index-table-a.html#algorithmica;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 MathSciNet database",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0178-4617&volume=32&issue=1&spage=36",
  acknowledgement = ack-nhfb,
  fjournal =     "Algorithmica",
}

@Article{Flanagan:2002:MCM,
  author =       "Cormac Flanagan and Shaz Qadeer and Sanjit A. Seshia",
  title =        "A Modular Checker for Multithreaded Programs",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2404",
  pages =        "180--??",
  year =         "2002",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Sat Nov 30 20:57:05 MST 2002",
  bibsource =    "http://link.springer-ny.com/link/service/series/0558/tocs/t2404.htm;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://link.springer.de/link/service/series/0558/bibs/2404/24040180.htm;
                 http://link.springer.de/link/service/series/0558/papers/2404/24040180.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Book{Garg:2002:TOA,
  author =       "Rajat P.
Garg and Ilya Sharapov", title = "Techniques for optimizing applications: high performance computing", publisher = pub-SUN-MICROSYSTEMS-PRESS, address = pub-SUN-MICROSYSTEMS-PRESS:adr, pages = "xliii + 616", year = "2002", ISBN = "0-13-093476-3", ISBN-13 = "978-0-13-093476-5", LCCN = "QA76.88 .G37 2002", bibdate = "Fri Apr 11 08:26:42 2003", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sun.com/blueprints/", series = "Sun BluePrints Program", URL = "http://www.sun.com/books/catalog/garg.html/index.html", acknowledgement = ack-nhfb, annote = "From the Web site: The \verb=HPC_code_examples.tar.Z= tar-file contains the source code, makefiles, and shell scripts required to compile, link, and run the example programs discussed in the book.", keywords = "Forte Developer; MPI; OpenMP; Sun ClusterTools; Sun Solaris", } @Article{Haggar:2002:JQD, author = "Peter Haggar", title = "{Java Q\&A}: Does {Java} Guarantee Thread Safety?", journal = j-DDJ, volume = "27", number = "6", pages = "91--83", month = jun, year = "2002", CODEN = "DDJOEB", ISSN = "1044-789X", bibdate = "Wed May 1 15:43:59 MDT 2002", bibsource = "http://www.ddj.com/articles/2002/0206/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "Comments on lack of atomic-update guarantee in Java for objects larger than 32 bits, such as {\tt long} and {\tt double}, with sample code to exhibit the failure.", URL = "http://www.ddj.com/ftp/2002/2002_06/jqa0602.txt", abstract = "Additional resources include jqa0602.txt (listings).", acknowledgement = ack-nhfb, fjournal = "Dr. Dobb's Journal of Software Tools", } @Article{Hanson:2002:AFI, author = "Richard J. Hanson and Clay P. Breshears and Henry A. 
Gabb", title = "{Algorithm 821}: {A} {Fortran} interface to {POSIX} threads", journal = j-TOMS, volume = "28", number = "3", pages = "354--371", month = sep, year = "2002", CODEN = "ACMSCU", DOI = "http://doi.acm.org/10.1145/569147.569152", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Sat Nov 9 11:16:50 MST 2002", bibsource = "http://www.acm.org/pubs/contents/journals/toms/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Pthreads is the library of POSIX standard functions for concurrent, multithreaded programming. The POSIX standard only defines an application programming interface (API) to the C programming language, not to Fortran. Many scientific and engineering applications are written in Fortran. Also, many of these applications exhibit functional, or task-level, concurrency. They would benefit from multithreading, especially on symmetric multiprocessors (SMP). We present here an interface to that part of the Pthreads library that is compatible with standard Fortran. The contribution consists of two primary source files: a Fortran module and a collection of C wrappers to Pthreads functions. The Fortran module defines the data structures, interface and initialization routines used to manage threads. The stability and portability of the Fortran API to Pthreads is demonstrated using common mathematical computations on three different systems.", acknowledgement = ack-nhfb, fjournal = "ACM Transactions on Mathematical Software", } @InProceedings{Karniadakis:2002:DLP, author = "Suchuan Dong and George Em. 
Karniadakis",
  title =        "Dual-Level Parallelism for Deterministic and
                 Stochastic {CFD} Problems",
  crossref =     "IEEE:2002:STI",
  pages =        "??--??",
  year =         "2002",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-2002.org/paperpdfs/pap.pap137.pdf",
  abstract =     "A hybrid two-level parallelism using MPI/OpenMP is
                 implemented in the general-purpose spectral/hp element
                 CFD code NekTar to take advantage of the hierarchical
                 structures arising in deterministic and stochastic CFD
                 problems. We take a coarse grain approach to
                 shared-memory parallelism with OpenMP and employ a
                 workload-splitting scheme that can reduce the OpenMP
                 synchronizations to the minimum. The hybrid
                 implementation shows good scalability with respect to
                 both the problem size and the number of processors in
                 case of a fixed problem size. With the same number of
                 processors, the hybrid model with 2 (or 4) OpenMP
                 threads per MPI process is observed to perform better
                 than pure MPI and pure OpenMP on the NCSA SGI Origin
                 2000, while the pure MPI model performs the best on the
                 IBM SP3 at SDSC and on the Compaq Alpha cluster at PSC.
                 A key new result is that the use of threads facilitates
                 effectively p-refinement, which is crucial to adaptive
                 discretization using high-order methods.",
  acknowledgement = ack-nhfb,
}

@Article{Kempf:2002:BTL,
  author =       "Bill Kempf",
  title =        "The {Boost.Threads} Library",
  journal =      j-CCCUJ,
  volume =       "20",
  number =       "5",
  pages =        "6--??",
  month =        may,
  year =         "2002",
  CODEN =        "CCUJEX",
  ISSN =         "1075-2838",
  bibdate =      "Tue May 14 18:09:36 MDT 2002",
  bibsource =    "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Standard C++ threads are imminent.
CUJ predicts they will derive from the Boost.Threads library, explored here by the eminent author.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Laneve:2002:TSJ, author = "Cosimo Laneve", title = "A type system for {JVM} threads", journal = j-THEOR-COMP-SCI, volume = "290", number = "1", pages = "741--778", month = oct, year = "2002", CODEN = "TCSCDI", ISSN = "0304-3975 (print), 1879-2294 (electronic)", ISSN-L = "0304-3975", bibdate = "Wed Nov 20 18:15:29 MST 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Theoretical Computer Science", } @Article{Leman:2002:EFT, author = "Dmitri Leman", title = "An Efficient and Flexible Tracing Technique", journal = j-CCCUJ, volume = "20", number = "4", pages = "24--??", month = apr, year = "2002", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:36 MDT 2002", bibsource = "http://www.cuj.com/articles/2002/0204/0204toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This extensible tracing framework tames the dreaded multithreaded debugging demon.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Mahinthakumar:2002:HMO, author = "G. Mahinthakumar and F. Saied", title = "A Hybrid {MPI-OpenMP} Implementation of an Implicit Finite-Element Code on Parallel Architectures", journal = j-IJHPCA, volume = "16", number = "4", pages = "371--393", month = "Winter", year = "2002", CODEN = "IHPCFL", ISSN = "1094-3420 (print), 1741-2846 (electronic)", ISSN-L = "1094-3420", bibdate = "Fri Nov 28 06:52:13 2003", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Article{Martinez:2002:SSAa, author = "Jos{\'e} F. 
Mart{\'\i}nez and Josep Torrellas", title = "Speculative synchronization: applying thread-level speculation to explicitly parallel applications", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "18--29", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Martinez:2002:SSAb, author = "Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas", title = "Speculative synchronization: applying thread-level speculation to explicitly parallel applications", journal = j-SIGPLAN, volume = "37", number = "10", pages = "18--29", month = oct, year = "2002", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu May 15 12:23:09 MDT 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Martinez:2002:SSAc, author = "Jos{\'e} F. Mart{\'\i}nez and Josep Torrellas", title = "Speculative synchronization: applying thread-level speculation to explicitly parallel applications", journal = j-OPER-SYS-REV, volume = "36", number = "5", pages = "18--29", month = dec, year = "2002", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Mauer:2002:FST, author = "Carl J. Mauer and Mark D. Hill and David A. 
Wood", title = "Full-system timing-first simulation", journal = j-SIGMETRICS, volume = "30", number = "1", pages = "108--116", month = jun, year = "2002", CODEN = "????", DOI = "http://doi.acm.org/10.1145/511334.511349", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:38:22 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Computer system designers often evaluate future design alternatives with detailed simulators that strive for {\em functional fidelity\/} (to execute relevant workloads) and {\em performance fidelity\/} (to rank design alternatives). Trends toward multi-threaded architectures, more complex micro-architectures, and richer workloads, make authoring detailed simulators increasingly difficult. To manage simulator complexity, this paper advocates decoupled simulator organizations that separate functional and performance concerns. Furthermore, we define an approach, called {\em timing-first simulation}, that uses an augmented timing simulator to execute instructions important to performance in conjunction with a functional simulator to insure correctness. This design simplifies software development, leverages existing simulators, and can model micro-architecture timing in detail. We describe the timing-first organization and our experiences implementing TFsim, a full-system multiprocessor performance simulator. TFsim models a pipelined, out-of-order micro-architecture in detail, was developed in less than one person-year, and performs competitively with previously-published simulators. TFsim's timing simulator implements dynamically common instructions (99.99\% of them), while avoiding the vast and exacting implementation efforts necessary to run unmodified commercial operating systems and workloads. 
Virtutech Simics, a full-system functional simulator, checks and corrects the timing simulator's execution, contributing 18-36\% to the overall run-time. TFsim's mostly correct functional implementation introduces a worst-case performance error of 4.8\% for our commercial workloads. Some additional simulator performance is gained by verifying functional correctness less often, at the cost of some additional performance error.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", } @Article{Mukherjee:2002:DDE, author = "Shubhendu S. Mukherjee and Michael Kontz and Steven K. Reinhardt", title = "Detailed design and evaluation of redundant multithreading alternatives", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "99--110", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Oplinger:2002:ESRa, author = "Jeffrey Oplinger and Monica S. Lam", title = "Enhancing software reliability with speculative threads", journal = j-COMP-ARCH-NEWS, volume = "30", number = "5", pages = "184--196", month = dec, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:41:23 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Oplinger:2002:ESRb, author = "Jeffrey Oplinger and Monica S. 
Lam", title = "Enhancing software reliability with speculative threads", journal = j-SIGPLAN, volume = "37", number = "10", pages = "184--196", month = oct, year = "2002", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu May 15 12:23:09 MDT 2003", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Oplinger:2002:ESRc, author = "Jeffrey Oplinger and Monica S. Lam", title = "Enhancing software reliability with speculative threads", journal = j-OPER-SYS-REV, volume = "36", number = "5", pages = "184--196", month = dec, year = "2002", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Plachetka:2002:QTS, author = "Tomas Plachetka", title = "(Quasi-) Thread-Safe {PVM} and (Quasi-) Thread-Safe {MPI} without Active Polling", journal = j-LECT-NOTES-COMP-SCI, volume = "2474", pages = "296--??", year = "2002", CODEN = "LNCSD9", ISSN = "0302-9743 (print), 1611-3349 (electronic)", ISSN-L = "0302-9743", bibdate = "Sat Nov 30 20:57:35 MST 2002", bibsource = "http://link.springer-ny.com/link/service/series/0558/tocs/t2474.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://link.springer.de/link/service/series/0558/bibs/2474/24740296.htm; http://link.springer.de/link/service/series/0558/papers/2474/24740296.pdf", acknowledgement = ack-nhfb, fjournal = "Lecture Notes in Computer Science", } @Article{Sato:2002:SJL, author = "Y. 
Sato", title = "A Study of {Java} Language for Effective Thread Migration", journal = "Record of Electrical and Communication Engineering Conversazione Tohoku University", volume = "71", number = "1", publisher = "Tohoku Daigaku Dentsu Danwakai", pages = "597--598", year = "2002", CODEN = "????", ISSN = "0385-7719", bibdate = "Tue Dec 24 07:09:37 MST 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, } @Article{Snavely:2002:SJP, author = "Allan Snavely and Dean M. Tullsen and Geoff Voelker", title = "Symbiotic jobscheduling with priorities for a simultaneous multithreading processor", journal = j-SIGMETRICS, volume = "30", number = "1", pages = "66--76", month = jun, year = "2002", CODEN = "????", DOI = "http://doi.acm.org/10.1145/511399.511343", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Thu Jun 26 11:38:22 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Simultaneous Multithreading machines benefit from jobscheduling software that monitors how well coscheduled jobs share CPU resources, and coschedules jobs that interact well to make more efficient use of those resources. As a result, informed coscheduling can yield significant performance gains over naive schedulers. However, prior work on coscheduling focused on equal-priority job mixes, which is an unrealistic assumption for modern operating systems. This paper demonstrates that a scheduler for an SMT machine can both satisfy process priorities and symbiotically schedule low and high priority threads to increase system throughput. Naive priority schedulers dedicate the machine to high priority jobs to meet priority goals, and as a result decrease opportunities for increased performance from multithreading and coscheduling. 
More informed schedulers, however, can dynamically monitor the progress and resource utilization of jobs on the machine, and dynamically adjust the degree of multithreading to improve performance while still meeting priority goals. Using detailed simulation of an SMT architecture, we introduce and evaluate a series of five software and hardware-assisted priority schedulers. Overall, our results indicate that coscheduling priority jobs can significantly increase system throughput by as much as 40\%, and that (1) the benefit depends upon the relative priority of the coscheduled jobs, and (2) more sophisticated schedulers are more effective when the differences in priorities are greatest. We show that our priority schedulers can decrease average turnaround times for a random job mix by as much as 33\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", keywords = "job scheduling; priorities; simultaneous multithreading", } @Article{Sodan:2002:AMA, author = "Angela C. 
Sodan", title = "Applications on a multithreaded architecture: {A} case study with {EARTH-MANNA}", journal = j-PARALLEL-COMPUTING, volume = "28", number = "1", pages = "3--33", month = jan, year = "2002", CODEN = "PACOEJ", ISSN = "0167-8191", ISSN-L = "0167-8191", bibdate = "Fri Feb 22 16:52:43 MST 2002", bibsource = "http://www.elsevier.com/locate/issn/01678191; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.elsevier.com/gej-ng/10/35/21/60/27/28/abstract.html; http://www.elsevier.nl/gej-ng/10/35/21/60/27/28/00001684.pdf", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", } @Article{Solihin:2002:UUL, author = "Yan Solihin and Jaejin Lee and Josep Torrellas", title = "Using a user-level memory thread for correlation prefetching", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "171--182", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @InProceedings{Sterling:2002:GMP, author = "Thomas L. Sterling and Hans P. Zima", title = "{Gilgamesh}: {A} Multithreaded Processor-In-Memory Architecture for Petaflops Computing", crossref = "IEEE:2002:STI", pages = "??--??", year = "2002", bibdate = "Wed Nov 26 07:34:20 2003", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.sc-2002.org/paperpdfs/pap.pap105.pdf", abstract = "Processor-in-Memory (PIM) architectures avoid the von Neumann bottleneck in conventional machines by integrating high-density DRAM and CMOS logic on the same chip. Parallel systems based on this new technology are expected to provide higher scalability, adaptability, robustness, fault tolerance and lower power consumption than current MPPs or commodity clusters. 
In this paper we describe the design of Gilgamesh, a PIM-based massively parallel architecture, and elements of its execution model. Gilgamesh extends existing PIM capabilities by incorporating advanced mechanisms for virtualizing tasks and data and providing adaptive resource management for load balancing and latency tolerance. The Gilgamesh execution model is based on macroservers, a middleware layer which supports object-based runtime management of data and threads allowing explicit and dynamic control of locality and load balancing. The paper concludes with a discussion of related research activities and an outlook to future work.", acknowledgement = ack-nhfb, } @Article{Stoller:2002:MCM, author = "Scott D. Stoller", title = "Model-checking multi-threaded distributed {Java} programs", journal = j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER, volume = "4", number = "1", pages = "71--91", month = oct, year = "2002", CODEN = "????", DOI = "http://dx.doi.org/10.1007/s10009-002-0077-2", ISSN = "1433-2779 (print), 1433-2787 (electronic)", ISSN-L = "1433-2779", bibdate = "Tue Nov 23 15:01:41 MST 2004", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "International Journal on Software Tools for Technology Transfer: STTT", } @Article{Sung:2002:CPE, author = "Minyoung Sung and Soyoung Kim and Sangsoo Park and Naehyuck Chang and Heonshik Shin", title = "Comparative performance evaluation of {Java} threads for embedded applications: {Linux Thread} vs. 
{Green Thread}", journal = j-INFO-PROC-LETT, volume = "84", number = "4", pages = "221--225", day = "30", month = nov, year = "2002", CODEN = "IFPLAT", ISSN = "0020-0190 (print), 1872-6119 (electronic)", ISSN-L = "0020-0190", bibdate = "Mon Jan 26 08:44:30 MST 2004", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/00200190", acknowledgement = ack-nhfb, fjournal = "Information Processing Letters", } @Article{Tennberg:2002:RGO, author = "Patrick Tennberg", title = "Refactoring Global Objects in Multithreaded Applications", journal = j-CCCUJ, volume = "20", number = "5", pages = "20--??", month = may, year = "2002", CODEN = "CCUJEX", ISSN = "1075-2838", bibdate = "Tue May 14 18:09:36 MDT 2002", bibsource = "http://www.cuj.com/articles/2002/0205/0205toc.htm?topic=articles; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Although you may get fired for introducing any new global variables, it's too much work to rewrite old code to remove them. So make them thread-safe and stop worrying.", acknowledgement = ack-nhfb, fjournal = "C/C++ Users Journal", } @Article{Theobald:2002:IEC, author = "Kevin B. Theobald and Rishi Kumar and Gagan Agrawal and Gerd Heber and Ruppa K. Thulasiram and Guang R. 
Gao",
  title =        "Implementation and evaluation of a communication
                 intensive application on the {EARTH} multithreaded
                 system",
  journal =      j-CCPE,
  volume =       "14",
  number =       "3",
  pages =        "183--201",
  month =        mar,
  year =         "2002",
  CODEN =        "CCPEBO",
  DOI =          "http://dx.doi.org/10.1002/cpe.604",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat May 18 14:54:00 MDT 2002",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/93513486/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=93513486{\&}PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
}

@Article{Ungerer:2002:MP,
  author =       "Theo Ungerer and Borut Robi{\v{c}} and Jurij
                 {\v{S}}ilc",
  title =        "Multithreaded Processors",
  journal =      j-COMP-J,
  volume =       "45",
  number =       "3",
  pages =        "320--348",
  month =        "????",
  year =         "2002",
  CODEN =        "CMPJA6",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Fri May 10 10:12:07 MDT 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_03/",
  URL =          "http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_03/450320.sgm.abs.html;
                 http://www3.oup.co.uk/computer_journal/hdb/Volume_45/Issue_03/pdf/450320.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "The Computer Journal",
}

%%% NOTE: volume 35, number 1 of ACM Computing Surveys is the March 2003
%%% issue, so the year field below is 2003.  The citation tag keeps its
%%% original 2002 component so that existing citations continue to resolve.
@Article{Ungerer:2002:SPE,
  author =       "Theo Ungerer and Borut Robi{\v{c}} and Jurij
                 {\v{S}}ilc",
  title =        "A survey of processors with explicit multithreading",
  journal =      j-COMP-SURV,
  volume =       "35",
  number =       "1",
  pages =        "29--63",
  month =        mar,
  year =         "2003",
  CODEN =        "CMSVAN",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Thu Aug 7 06:57:01 MDT 2003",
  bibsource =    "http://portal.acm.org/;
http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM Computing Surveys", } @Article{Vijaykumar:2002:TFR, author = "T. N. Vijaykumar and Irith Pomeranz and Karl Cheng", title = "Transient-fault recovery using simultaneous multithreading", journal = j-COMP-ARCH-NEWS, volume = "30", number = "2", pages = "87--98", month = may, year = "2002", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri May 12 09:40:50 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Yan:2002:RCC, author = "C. Yan", title = "Race condition and concurrency safety of multithreaded object-oriented programming in {Java}", journal = "IEEE International Conference on Systems Man and Cybernetics", volume = "6", pages = "??--??", year = "2002", CODEN = "????", ISSN = "1062-922X", bibdate = "Tue Apr 8 06:53:44 MDT 2003", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database", acknowledgement = ack-nhfb, xxpages = "WA1Q3", } @Article{Zhai:2002:COS, author = "Antonia Zhai and Christopher B. Colohan and J. Gregory Steffan and Todd C. Mowry", title = "Compiler optimization of scalar value communication between speculative threads", journal = j-OPER-SYS-REV, volume = "36", number = "5", pages = "171--183", month = dec, year = "2002", CODEN = "OSRED8", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Sat Aug 26 08:55:56 MDT 2006", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Operating Systems Review", } @Article{Zhai:2002:COSa, author = "Antonia Zhai and Christopher B. Colohan and J. Gregory Steffan and Todd C. 
Mowry",
  title =        "Compiler optimization of scalar value communication
                 between speculative threads",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "30",
  number =       "5",
  pages =        "171--183",
  month =        dec,
  year =         "2002",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:23 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Article{Zhai:2002:COSb,
  author =       "Antonia Zhai and Christopher B. Colohan and J.
                 Gregory Steffan and Todd C. Mowry",
  title =        "Compiler optimization of scalar value communication
                 between speculative threads",
  journal =      j-SIGPLAN,
  volume =       "37",
  number =       "10",
  pages =        "171--183",
  month =        oct,
  year =         "2002",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu May 15 12:23:09 MDT 2003",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Aamodt:2003:FMO,
  author =       "Tor M. Aamodt and Pedro Marcuello and Paul Chow and
                 Antonio Gonz{\'a}lez and Per Hammarlund and Hong Wang
                 and John P. Shen",
  title =        "A framework for modeling and optimization of
                 prescient instruction prefetch",
  journal =      j-SIGMETRICS,
  volume =       "31",
  number =       "1",
  pages =        "13--24",
  month =        jun,
  year =         "2003",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/781027.781030",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Thu Jun 26 11:41:41 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper describes a framework for modeling
                 macroscopic program behavior and applies it to
                 optimizing prescient instruction prefetch --- a novel
                 technique that uses helper threads to improve
                 single-threaded application performance by performing
                 judicious and timely instruction prefetch. A helper
                 thread is initiated when the main thread encounters a
                 spawn point, and prefetches instructions starting at a
                 distant target point. The target identifies a code
                 region tending to incur I-cache misses that the main
                 thread is likely to execute soon, even though
                 intervening control flow may be unpredictable. The
                 optimization of spawn-target pair selections is
                 formulated by modeling program behavior as a Markov
                 chain based on profile statistics. Execution paths are
                 considered stochastic outcomes, and aspects of program
                 behavior are summarized via path expression mappings.
                 Mappings for computing reaching, and posteriori
                 probability; path length mean, and variance; and
                 expected path footprint are presented. These are used
                 with Tarjan's fast path algorithm to efficiently
                 estimate the benefit of spawn-target pair selections.
                 Using this framework we propose a spawn-target pair
                 selection algorithm for prescient instruction
                 prefetch. This algorithm has been implemented, and
                 evaluated for the Itanium Processor Family
                 architecture. A limit study finds 4.8\% to 17\%
                 speedups on an in-order simultaneous multithreading
                 processor with eight contexts, over nextline and
                 streaming I-prefetch for a set of benchmarks with high
                 I-cache miss rates. The framework in this paper is
                 potentially applicable to other thread speculation
                 techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  keywords =     "analytical modeling; helper threads; instruction
                 prefetch; multithreading; optimization; path
                 expressions",
}

@Article{Addison:2003:OIA,
  author =       "C. Addison and Y. Ren and M. van Waveren",
  title =        "{OpenMP} issues arising in the development of
                 parallel {BLAS} and {LAPACK} libraries",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "95--104",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
}

@Article{Almasi:2003:DCD,
  author =       "George Alm{\'a}si and C{\u{a}}lin Ca{\c{s}}caval and
                 Jos{\'e} G. Casta{\~n}os and Monty Denneau and Derek
                 Lieber and Jos{\'e} E. Moreira and Henry S.
{Warren, Jr.}",
  title =        "Dissecting {Cyclops}: a detailed analysis of a
                 multithreaded architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "1",
  pages =        "26--38",
  month =        mar,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:37 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Article{Balis:2003:MSM,
  author =       "Bartosz Bali{\'s} and Marian Bubak and
                 W{\l}odzimierz Funika and Roland Wism{\"u}ller",
  title =        "A monitoring system for multithreaded applications",
  journal =      j-FUT-GEN-COMP-SYS,
  volume =       "19",
  number =       "5",
  pages =        "641--650",
  month =        jul,
  year =         "2003",
  CODEN =        "FGSEVI",
  ISSN =         "0167-739X (print), 1872-7115 (electronic)",
  ISSN-L =       "0167-739X",
  bibdate =      "Sat Jan 10 10:03:34 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Future Generation Computer Systems",
  remark =       "Tools for Program Development and Analysis. Best
                 papers from two Technical Sessions, at ICCS2001, San
                 Francisco, CA, USA, and ICCS2002, Amsterdam, The
                 Netherlands.",
}

@Article{Barekas:2003:MAO,
  author =       "Vasileios K. Barekas and Panagiotis E. Hadjidoukas
                 and Eleftherios D. Polychronopoulos and others",
  title =        "A Multiprogramming Aware {OpenMP} Implementation",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "133--141",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
}

@Article{Brightwell:2003:DIP,
  author =       "Ron Brightwell and Rolf Riesen and Arthur B.
Maccabe",
  title =        "Design, Implementation, and Performance of {MPI} on
                 {Portals 3.0}",
  journal =      j-IJHPCA,
  volume =       "17",
  number =       "1",
  pages =        "7--20",
  month =        "Spring",
  year =         "2003",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Nov 28 06:52:13 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
}

@Article{Briguglio:2003:PPM,
  author =       "Sergio Briguglio and Beniamino Di Martino and
                 Gregorio Vlad",
  title =        "A performance-prediction model for {PIC}
                 applications on clusters of Symmetric MultiProcessors:
                 Validation with hierarchical {HPF $+$ OpenMP}
                 implementation",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "159--176",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
}

@Article{Carr:2003:TPT,
  author =       "Steve Carr and Jean Mayo and Ching-Kuang Shene",
  title =        "{ThreadMentor}: a pedagogical tool for multithreaded
                 programming",
  journal =      j-JERIC,
  volume =       "3",
  number =       "1",
  pages =        "1--30",
  month =        mar,
  year =         "2003",
  CODEN =        "????",
  ISSN =         "1531-4278",
  bibdate =      "Tue Feb 3 18:43:37 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jeric/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Journal on Educational Resources in Computing
                 (JERIC)",
}

@Article{Chen:2003:CSS,
  author =       "Peng-Sheng Chen and Ming-Yu Hung and Yuan-Shin Hwang
                 and Roy Dz-Ching Ju and Jenq Kuen Lee",
  title =        "Compiler support for speculative multithreading
                 architecture with probabilistic points-to analysis",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "10",
  pages =        "25--36",
  month =        oct,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print),
1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 22 16:52:42 MST 2003",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Domani:2003:TLH,
  author =       "Tamar Domani and Gal Goldshtein and Elliot K.
                 Kolodner and Ethan Lewis and Erez Petrank and Dafna
                 Sheinwald",
  title =        "Thread-Local Heaps for {Java}",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "2s",
  pages =        "183--194",
  month =        feb,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu May 15 12:23:14 MDT 2003",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Edelstein:2003:FTM,
  author =       "Orit Edelstein and Eitan Farchi and Evgeny Goldin and
                 Yarden Nir and Gil Ratsaby and Shmuel Ur",
  title =        "Framework for testing multi-threaded {Java}
                 programs",
  journal =      j-CCPE,
  volume =       "15",
  number =       "3--5",
  pages =        "485--499",
  month =        mar # "\slash " # apr,
  year =         "2003",
  CODEN =        "CCPEBO",
  DOI =          "http://dx.doi.org/10.1002/cpe.654",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Jan 13 09:28:08 MST 2004",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  onlinedate =   "12 Feb 2003",
}

@Article{Fang:2003:DGO,
  author =       "Weijian Fang and Cho-Li Wang and Francis C. M.
Lau",
  title =        "On the design of global object space for efficient
                 multi-threading {Java} computing on clusters",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "29",
  number =       "11--12",
  pages =        "1563--1587",
  month =        nov # "\slash " # dec,
  year =         "2003",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191",
  ISSN-L =       "0167-8191",
  bibdate =      "Wed Dec 24 09:07:29 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
}

@Article{Gagnon:2003:EIT,
  author =       "E. Gagnon and L. Hendren",
  title =        "Effective Inline-Threaded Interpretation of {Java}
                 Bytecode Using Preparation Sequences",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2622",
  pages =        "170--184",
  year =         "2003",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 15 07:54:18 MDT 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Gould:2003:GLT,
  author =       "Nicholas I. M. Gould and Dominique Orban and Philippe
                 L. Toint",
  title =        "{GALAHAD}, a library of thread-safe {Fortran 90}
                 packages for large-scale nonlinear optimization",
  journal =      j-TOMS,
  volume =       "29",
  number =       "4",
  pages =        "353--372",
  month =        dec,
  year =         "2003",
  CODEN =        "ACMSCU",
  DOI =          "http://doi.acm.org/10.1145/962437.962438",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Mon Jan 5 17:18:49 MST 2004",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toms/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We describe the design of version 1.0 of GALAHAD, a
                 library of Fortran 90 packages for large-scale
                 nonlinear optimization. The library particularly
                 addresses quadratic programming problems, containing
                 both interior point and active set algorithms, as well
                 as tools for preprocessing problems prior to solution.
It also contains an updated version of the venerable nonlinear
                 programming package, LANCELOT.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Mathematical Software",
}

@Article{Grossman:2003:TSM,
  author =       "Dan Grossman",
  title =        "Type-safe multithreading in {Cyclone}",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "3",
  pages =        "13--25",
  month =        mar,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu May 15 12:23:16 MDT 2003",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Heinlein:2003:ATS,
  author =       "C. Heinlein",
  title =        "Advanced Thread Synchronization in {Java} Using
                 Interaction Expressions",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2591",
  pages =        "345--365",
  year =         "2003",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 1 06:09:06 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Jin:2003:AMP,
  author =       "Haoqiang Jin and Gabriele Jost and Jerry Yan and
                 others",
  title =        "Automatic multilevel parallelization using
                 {OpenMP}",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "177--190",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
}

@InProceedings{Kee:2003:POP,
  author =       "Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha",
  title =        "{ParADE}: An {OpenMP} Programming Environment for
                 {SMP} Cluster Systems",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#0;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap130.pdf",
  abstract =     "Demand for programming environments to exploit
                 clusters of symmetric multiprocessors (SMPs) is
                 increasing. In this paper, we present a new
                 programming environment, called ParADE, to enable
                 easy, portable, and high-performance programming on
                 SMP clusters. It is an OpenMP programming environment
                 on top of a multi-threaded software distributed shared
                 memory (SDSM) system with a variant of home-based lazy
                 release consistency protocol. To boost performance,
                 the runtime system provides explicit message-passing
                 primitives to make it a hybrid-programming
                 environment. Collective communication primitives are
                 used for the synchronization and work-sharing
                 directives associated with small data structures,
                 lessening the synchronization overhead and avoiding
                 the implicit barriers of work-sharing directives. The
                 OpenMP translator bridges the gap between the OpenMP
                 abstraction and the hybrid programming interfaces of
                 the runtime system. The experiments with several NAS
                 benchmarks and applications on a Linux-based cluster
                 show promising results that ParADE overcomes the
                 performance problem of the conventional SDSM-based
                 OpenMP environment.",
  acknowledgement = ack-nhfb,
  keywords =     "hybrid programming; MPI; OpenMP; programming
                 environment; SMP cluster; software distributed shared
                 memory",
}

@Article{Keen:2003:CCP,
  author =       "Aaron W. Keen and Takashi Ishihara and Justin T.
                 Maris and Tiejun Li and Eugene F. Fodor and Ronald A.
Olsson",
  title =        "A comparison of concurrent programming and
                 cooperative multithreading",
  journal =      j-CCPE,
  volume =       "15",
  number =       "1",
  pages =        "27--53",
  month =        jan,
  year =         "2003",
  CODEN =        "CCPEBO",
  DOI =          "http://dx.doi.org/10.1002/cpe.706",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Jan 13 09:28:05 MST 2004",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  onlinedate =   "6 Jan 2003",
}

@Article{Kepner:2003:MTF,
  author =       "Jeremy Kepner",
  title =        "A multi-threaded fast convolver for dynamically
                 parallel image filtering",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "63",
  number =       "3",
  pages =        "360--372",
  month =        mar,
  year =         "2003",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Tue Dec 16 16:10:40 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
}

@InProceedings{Klasky:2003:GBP,
  author =       "Scott Alan Klasky and Stephane Ethier and Zhihong Lin
                 and Kevin Martins and Doug McCune and Ravi Samtaney",
  title =        "Grid-Based Parallel Data Streaming implemented for
                 the {Gyrokinetic Toroidal Code}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#2;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap207.pdf",
  abstract =     "We have developed a threaded parallel data streaming
                 approach using Globus to transfer multi-terabyte
                 simulation data from a remote supercomputer to the
                 scientist's home analysis/visualization cluster, as
                 the simulation
executes, with negligible overhead. Data transfer experiments show
                 that this concurrent data transfer approach is more
                 favorable compared with writing to local disk and then
                 transferring this data to be post-processed. The
                 present approach is conducive to using the grid to
                 pipeline the simulation with post-processing and
                 visualization. We have applied this method to the
                 Gyrokinetic Toroidal Code (GTC), a 3-dimensional
                 particle-in-cell code used to study micro-turbulence
                 in magnetic confinement fusion from first principles
                 plasma theory.",
  acknowledgement = ack-nhfb,
}

@Article{Koster:2003:TTI,
  author =       "Rainer Koster and Andrew P. Black and Jie Huang and
                 Jonathan Walpole and Calton Pu",
  title =        "Thread transparency in information flow middleware",
  journal =      j-SPE,
  volume =       "33",
  number =       "4",
  pages =        "321--349",
  month =        apr,
  year =         "2003",
  CODEN =        "SPEXBL",
  DOI =          "http://dx.doi.org/10.1002/spe.510",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Sat Nov 29 17:39:44 MST 2003",
  bibsource =    "http://www.interscience.wiley.com/jpages/0038-0644;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Software---Practice and Experience",
  onlinedate =   "19 Feb 2003",
}

@Article{Koufaty:2003:HTN,
  author =       "David Koufaty and Deborah T.
Marr",
  title =        "Hyperthreading Technology in the {Netburst}
                 Microarchitecture",
  journal =      j-IEEE-MICRO,
  volume =       "23",
  number =       "2",
  pages =        "56--65",
  month =        mar # "\slash " # apr,
  year =         "2003",
  CODEN =        "IEMIDZ",
  DOI =          "http://dx.doi.org/10.1109/MM.2003.1196115",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 23 18:57:11 MDT 2003",
  bibsource =    "http://www.computer.org/micro/mi2003/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://dlib.computer.org/mi/books/mi2003/pdf/m2056.pdf;
                 http://www.computer.org/micro/mi2003/m2056abs.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
}

@Article{Kranzlmuller:2003:RAP,
  author =       "Dieter Kranzlm{\"u}ller and Peter Kacsuk and Jack
                 Dongarra and Jens Volkert",
  title =        "Recent Advances in Parallel Virtual Machine and
                 Message Passing Interface (Select papers from the
                 {EuroPVMMPI 2002 Conference})",
  journal =      j-IJHPCA,
  volume =       "17",
  number =       "1",
  pages =        "3--5",
  month =        "Spring",
  year =         "2003",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Nov 28 06:52:13 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "The International Journal of High Performance
                 Computing Applications",
}

@Article{Kreuzinger:2003:RTE,
  author =       "J. Kreuzinger and U. Brinkschulte and M. Pfeffer and
                 S. Uhrig and T.
Ungerer",
  title =        "Real-time event-handling and scheduling on a
                 multithreaded {Java} microcontroller",
  journal =      j-MICROPROC-MICROSYS,
  volume =       "27",
  number =       "1",
  pages =        "19--31",
  year =         "2003",
  CODEN =        "MIMID5",
  ISSN =         "0141-9331",
  ISSN-L =       "0141-9331",
  bibdate =      "Tue Feb 18 07:16:21 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Microprocessors and Microsystems",
}

@Article{Kwok:2003:EHC,
  author =       "Yu-Kwong Kwok",
  title =        "On Exploiting Heterogeneity for Cluster Based
                 Parallel Multithreading Using Task Duplication",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "25",
  number =       "1",
  pages =        "63--72",
  month =        may,
  year =         "2003",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Tue Dec 16 08:27:09 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.wkap.nl/journalhome.htm/0920-8542",
  URL =          "http://ipsapp009.kluweronline.com/content/getfile/5189/43/4/abstract.htm;
                 http://ipsapp009.kluweronline.com/content/getfile/5189/43/4/fulltext.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
}

@Article{Marowka:2003:EOT,
  author =       "Ami Marowka",
  title =        "Extending {OpenMP} for Task Parallelism",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "13",
  number =       "3",
  pages =        "341--??",
  month =        sep,
  year =         "2003",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264",
  bibdate =      "Sat Nov 6 18:06:31 MST 2004",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
}

@Article{Mattson:2003:HGO,
  author =       "Timothy G.
Mattson",
  title =        "How good is {OpenMP}",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "81--93",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
}

@Article{McDowell:2003:ISS,
  author =       "Luke K. McDowell and Susan J. Eggers and Steven D.
                 Gribble",
  title =        "Improving server software support for simultaneous
                 multithreaded processors",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "10",
  pages =        "37--48",
  month =        oct,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 22 16:52:42 MST 2003",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Muller:2003:OCB,
  author =       "Matthias S.
M{\"u}ller",
  title =        "An {OpenMP} compiler benchmark",
  journal =      j-SCI-PROG,
  volume =       "11",
  number =       "2",
  pages =        "125--131",
  year =         "2003",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  ISSN-L =       "1058-9244",
  bibdate =      "Mon Jan 12 06:28:15 MST 2004",
  bibsource =    "http://www.iospress.nl/site/html/10589244.html;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Scientific Programming",
}

@InProceedings{Nakajima:2003:PIS,
  author =       "Kengo Nakajima",
  title =        "Parallel Iterative Solvers of {GeoFEM} with Selective
                 Blocking Preconditioning for Nonlinear Contact
                 Problems on the {Earth Simulator}",
  crossref =     "ACM:2003:SII",
  pages =        "??--??",
  year =         "2003",
  bibdate =      "Wed Nov 26 07:34:20 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#1;
                 http://www.sc-conference.org/sc2003/paperpdfs/pap155.pdf",
  abstract =     "An efficient parallel iterative method with selective
                 blocking preconditioning has been developed for
                 symmetric multiprocessor (SMP) cluster architectures
                 with vector processors such as the Earth Simulator.
                 This method is based on a three-level hybrid parallel
                 programming model, which includes message passing for
                 inter-SMP node communication, loop directives by
                 OpenMP for intra-SMP node parallelization and
                 vectorization for each processing element (PE). This
                 method provides robust and smooth convergence and
                 excellent vector and parallel performance in 3D
                 geophysical simulations with contact conditions
                 performed on the Earth Simulator. The selective
                 blocking preconditioning is much more efficient than
                 ILU(1) and ILU(2).
Performance for the complicated Southwest Japan model with more than
                 23 M DOF on 10 SMP nodes (80 PEs) of the Earth
                 Simulator was 161.7 GFLOPS, corresponding to 25.3\% of
                 the peak performance for hybrid programming model, and
                 190.4 GFLOPS (29.8\% of the peak performance) for flat
                 MPI, respectively.",
  acknowledgement = ack-nhfb,
}

@Article{Pang:2003:PSR,
  author =       "James C. Pang and Gholamali C. Shoja and Eric G.
                 Manning",
  title =        "Providing soft real-time quality of service
                 guarantees for {Java} threads",
  journal =      j-CCPE,
  volume =       "15",
  number =       "3--5",
  pages =        "521--538",
  month =        mar # "\slash " # apr,
  year =         "2003",
  CODEN =        "CCPEBO",
  DOI =          "http://dx.doi.org/10.1002/cpe.663",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Jan 13 09:28:08 MST 2004",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and
                 Experience",
  onlinedate =   "12 Feb 2003",
}

@Article{Park:2003:IMP,
  author =       "Il Park and Babak Falsafi and T. N. Vijaykumar",
  title =        "Implicitly-multithreaded processors",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "39--51",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Article{Petitpierre:2003:JTC,
  author =       "C.
Petitpierre",
  title =        "{Java} Threads Can Be Very Useful Building Blocks",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "2604",
  pages =        "204",
  year =         "2003",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  ISSN-L =       "0302-9743",
  bibdate =      "Tue Apr 1 06:09:06 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Lecture Notes in Computer Science",
}

@Article{Pinilla:2003:UJT,
  author =       "Ruben Pinilla and Marisa Gil",
  title =        "{ULT}: a {Java} threads model for platform
                 independent execution",
  journal =      j-OPER-SYS-REV,
  volume =       "37",
  number =       "4",
  pages =        "48--62",
  month =        oct,
  year =         "2003",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:53 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Pozniansky:2003:EFD,
  author =       "Eli Pozniansky and Assaf Schuster",
  title =        "Efficient on-the-fly data race detection in
                 multithreaded {C++} programs",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "10",
  pages =        "179--190",
  month =        oct,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 22 16:52:42 MST 2003",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Prabhu:2003:UTL,
  author =       "Manohar K.
Prabhu and Kunle Olukotun",
  title =        "Using thread-level speculation to simplify manual
                 parallelization",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "10",
  pages =        "1--12",
  month =        oct,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160
                 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Dec 22 16:52:42 MST 2003",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Price:2003:CAF,
  author =       "Gregory W. Price and David K. Lowenthal",
  title =        "A comparative analysis of fine-grain threads
                 packages",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "63",
  number =       "11",
  pages =        "1050--1063",
  month =        nov,
  year =         "2003",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Tue Dec 16 16:10:44 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
}

@Article{Prvulovic:2003:RUT,
  author =       "Milos Prvulovic and Josep Torrellas",
  title =        "{ReEnact}: using thread-level speculation mechanisms
                 to debug data races in multithreaded codes",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "31",
  number =       "2",
  pages =        "110--121",
  month =        may,
  year =         "2003",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Book{Robbins:2003:USP,
  author =       "Kay A.
Robbins and Steven Robbins",
  title =        "{UNIX} Systems Programming: Communication,
                 Concurrency, and Threads",
  publisher =    pub-PHPTR,
  address =      pub-PHPTR:adr,
  edition =      "Second",
  pages =        "xvii + 893",
  year =         "2003",
  ISBN =         "0-13-042411-0",
  ISBN-13 =      "978-0-13-042411-2",
  LCCN =         "QA76.76.O63 R6215 2003",
  bibdate =      "Wed Aug 20 21:08:15 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  annote =       "See \cite{Robbins:1996:PUP} for first edition.",
  keywords =     "operating systems (computers); UNIX (computer file)",
}

@Article{Robison:2003:MCN,
  author =       "Arch D. Robison",
  title =        "Memory Consistency and {.NET}",
  journal =      j-DDJ,
  volume =       "28",
  number =       "4",
  pages =        "46, 48--50",
  month =        apr,
  year =         "2003",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jun 12 05:46:22 MDT 2003",
  bibsource =    "http://www.ddj.com/articles/2003/0304/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/documents/s=7827/ddj0304e/",
  abstract =     "Understanding the basics of memory consistency is
                 essential to writing multithreaded code that works on
                 both uniprocessors and multiprocessors.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr.
Dobb's Journal of Software Tools",
}

@Article{Solihin:2003:CPU,
  author =       "Yan Solihin and Jaejin Lee and Josep Torrellas",
  title =        "Correlation Prefetching with a User-Level Memory
                 Thread",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "14",
  number =       "6",
  pages =        "563--580",
  month =        jun,
  year =         "2003",
  CODEN =        "ITDSEO",
  DOI =          "http://dx.doi.org/10.1109/TPDS.2003.1206504",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Wed Dec 24 10:02:07 MST 2003",
  bibsource =    "http://www.computer.org/tpds/td2003/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/trans/td/2003/06/l0563abs.htm;
                 http://csdl.computer.org/dl/trans/td/2003/06/l0563.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed
                 Systems",
}

@Article{Swanson:2003:ESI,
  author =       "Steven Swanson and Luke K. McDowell and Michael M.
                 Swift and Susan J. Eggers and Henry M. Levy",
  title =        "An evaluation of speculative instruction execution on
                 simultaneous multithreaded processors",
  journal =      j-TOCS,
  volume =       "21",
  number =       "3",
  pages =        "314--340",
  month =        aug,
  year =         "2003",
  CODEN =        "ACSYEC",
  ISSN =         "0734-2071",
  ISSN-L =       "0734-2071",
  bibdate =      "Thu Aug 7 10:13:26 MDT 2003",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tocs/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Computer Systems",
}

@Article{Thulasiram:2003:PEM,
  author =       "Ruppa K.
Thulasiram and Parimala Thulasiraman",
  title =        "Performance Evaluation of a Multithreaded {Fast
                 Fourier Transform} Algorithm for Derivative Pricing",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "26",
  number =       "1",
  pages =        "43--58",
  month =        aug,
  year =         "2003",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Tue Dec 16 08:27:10 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.wkap.nl/journalhome.htm/0920-8542",
  URL =          "http://ipsapp009.kluweronline.com/content/getfile/5189/46/4/abstract.htm;
                 http://ipsapp009.kluweronline.com/content/getfile/5189/46/4/fulltext.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
}

@Article{Timmerman:2003:EWC,
  author =       "Martin Timmerman",
  title =        "Examining {Windows CE .NET}",
  journal =      j-DDJ,
  volume =       "28",
  number =       "2",
  pages =        "62, 64",
  month =        feb,
  year =         "2003",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jun 12 05:46:21 MDT 2003",
  bibsource =    "http://www.ddj.com/articles/2003/0302/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/documents/s=7790/ddj0302h/",
  abstract =     "Martin examines Windows CE .NET's thread handling and
                 advanced interrupt handling capabilities, as well as
                 its synchronization mechanisms and network stack
                 performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr. Dobb's Journal of Software Tools",
}

@Article{Tremblay:2003:IEP,
  author =       "G. Tremblay and C. J. Morrone and J. N. Amaral and G.
                 R.
Gao",
  title =        "Implementation of the {EARTH} programming model on {SMP} clusters: a multi-threaded language and runtime system",
  journal =      j-CCPE,
  volume =       "15",
  number =       "9",
  pages =        "821--844",
  day =          "10",
  month =        aug,
  year =         "2003",
  CODEN =        "CCPEBO",
  DOI =          "http://dx.doi.org/10.1002/cpe.729",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Tue Jan 13 09:28:12 MST 2004",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and Experience",
  onlinedate =   "14 Jul 2003",
}

@Article{Tseng:2003:DST,
  author =       "Y. Tseng and R. F. DeMara and P. J. Wilder",
  title =        "Distributed-sum termination detection supporting multithreaded execution",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "29",
  number =       "7",
  pages =        "953--968",
  month =        jul,
  year =         "2003",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191",
  ISSN-L =       "0167-8191",
  bibdate =      "Wed Dec 24 09:07:26 MST 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
}

@Article{Ungerer:2003:SPE,
  author =       "Theo Ungerer and Borut Robi{\v{c}} and Jurij {\v{S}}ilc",
  title =        "A survey of processors with explicit multithreading",
  journal =      j-COMP-SURV,
  volume =       "35",
  number =       "1",
  pages =        "29--63",
  month =        mar,
  year =         "2003",
  CODEN =        "CMSVAN",
  DOI =          "http://doi.acm.org/10.1145/641865.641867",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Thu Jun 19 10:18:52 MDT 2008",
  bibsource =    "http://portal.acm.org/; http://www.acm.org/pubs/contents/journals/surveys/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Hardware multithreading is becoming a generally applied technique in the next generation of microprocessors.
Several multithreaded processors are announced by industry or already into production in the areas of high-performance microprocessors, media, and network processors. A multithreaded processor is able to pursue two or more threads of control in parallel within the processor pipeline. The contexts of two or more threads of control are often stored in separate on-chip register sets. Unused instruction slots, which arise from latencies during the pipelined execution of single-threaded programs by a contemporary microprocessor, are filled by instructions of other threads within a multithreaded processor. The execution units are multiplexed between the thread contexts that are loaded in the register sets. Underutilization of a superscalar processor due to missing instruction-level parallelism can be overcome by simultaneous multithreading, where a processor can issue multiple instructions from multiple threads each cycle. Simultaneous multithreaded processors combine the multithreading technique with a wide-issue superscalar processor to utilize a larger part of the issue bandwidth by issuing instructions from different threads simultaneously. Explicit multithreaded processors are multithreaded processors that apply processes or operating system threads in their hardware thread slots. These processors optimize the throughput of multiprogramming workloads rather than single-thread performance. We distinguish these processors from implicit multithreaded processors that utilize thread-level speculation by speculatively executing compiler- or machine-generated threads of control that are part of a single sequential program.
This survey paper explains and classifies the explicit multithreading techniques in research and in commercial microprocessors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Computing Surveys",
  keywords =     "Blocked multithreading; interleaved multithreading; simultaneous multithreading",
}

@Article{vonPraun:2003:SCA,
  author =       "Christoph von Praun and Thomas R. Gross",
  title =        "Static conflict analysis for multi-threaded object-oriented programs",
  journal =      j-SIGPLAN,
  volume =       "38",
  number =       "5",
  pages =        "115--128",
  month =        may,
  year =         "2003",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Sat Oct 11 12:45:00 MDT 2003",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{White:2003:UTL,
  author =       "Tom White",
  title =        "Using Thread-Local Variables In {Java}",
  journal =      j-DDJ,
  volume =       "28",
  number =       "7",
  pages =        "42, 44--46",
  month =        jul,
  year =         "2003",
  CODEN =        "DDJOEB",
  ISSN =         "1044-789X",
  bibdate =      "Thu Jun 12 05:46:24 MDT 2003",
  bibsource =    "http://www.ddj.com/articles/2003/0307/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.ddj.com/ftp/2003/2003_07/thread.txt; http://www.ddj.com/ftp/2003/2003_07/thread.zip",
  abstract =     "Java's ThreadLocal class provides a powerful, easy-to-use way to write efficient code that is safe for multithreaded access. Additional resources include thread.txt (listings) and thread.zip (source code).",
  acknowledgement = ack-nhfb,
  fjournal =     "Dr.
Dobb's Journal of Software Tools",
}

@Article{Yong:2003:AMC,
  author =       "Xie Yong and Hsu Wen-Jing",
  title =        "Aligned Multithreaded Computations and Their Scheduling with {FAB} Performance Guarantees",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "13",
  number =       "3",
  pages =        "353--??",
  month =        sep,
  year =         "2003",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264",
  bibdate =      "Thu Jan 06 09:41:03 2005",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
}

@Article{Bhowmik:2004:GCF,
  author =       "Anasua Bhowmik and Manoj Franklin",
  title =        "A General Compiler Framework for Speculative Multithreaded Processors",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "15",
  number =       "8",
  pages =        "713--724",
  month =        aug,
  year =         "2004",
  CODEN =        "ITDSEO",
  DOI =          "http://dx.doi.org/10.1109/TPDS.2004.26",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Sat Dec 11 16:24:15 MST 2004",
  bibsource =    "http://www.computer.org/tpds/td2004/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/dl/trans/td/2004/08/l0713.htm; http://csdl.computer.org/dl/trans/td/2004/08/l0713.pdf; http://doi.ieeecomputersociety.org/10.1109/TPDS.2004.26",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed Systems",
}

@Article{Bouchenak:2004:EIE,
  author =       "S. Bouchenak and D. Hagimont and S. Krakowiak and N. De Palma and F.
Boyer",
  title =        "Experiences implementing efficient {Java} thread serialization, mobility and persistence",
  journal =      j-SPE,
  volume =       "34",
  number =       "4",
  pages =        "355--393",
  day =          "10",
  month =        apr,
  year =         "2004",
  CODEN =        "SPEXBL",
  DOI =          "http://dx.doi.org/10.1002/spe.569",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Sat Apr 16 07:26:28 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/0038-0644; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Software---Practice and Experience",
  onlinedate =   "5 Jan 2004",
}

@Article{Chaudhuri:2004:SAN,
  author =       "Mainak Chaudhuri and Mark Heinrich",
  title =        "{SMTp}: {An Architecture} for {Next-generation Scalable Multi-threading}",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "124--124",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Article{Flanagan:2004:ADA,
  author =       "Cormac Flanagan and Stephen N. Freund",
  title =        "Atomizer: a dynamic atomicity checker for multithreaded programs",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "1",
  pages =        "256--267",
  month =        jan,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Apr 12 09:38:12 MDT 2005",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Georges:2004:JPR,
  author =       "A. Georges and M. Christiaens and M. Ronsse and K.
De Bosschere",
  title =        "{JaRec}: a portable record\slash replay environment for multi-threaded {Java} applications",
  journal =      j-SPE,
  volume =       "34",
  number =       "6",
  pages =        "523--547",
  month =        may,
  year =         "2004",
  CODEN =        "SPEXBL",
  DOI =          "http://dx.doi.org/10.1002/spe.579",
  ISSN =         "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L =       "0038-0644",
  bibdate =      "Sat Apr 16 07:26:29 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/0038-0644; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Software---Practice and Experience",
  onlinedate =   "24 Feb 2004",
}

@Article{Johnson:2004:MCP,
  author =       "Troy A. Johnson and Rudolf Eigenmann and T. N. Vijaykumar",
  title =        "Min-cut program decomposition for thread-level speculation",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "6",
  pages =        "59--70",
  month =        may,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 2 05:49:55 MST 2004",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Johnston:2004:ADP,
  author =       "Wesley M. Johnston and J. R. Paul Hanna and Richard J. Millar",
  title =        "Advances in dataflow programming languages",
  journal =      j-COMP-SURV,
  volume =       "36",
  number =       "1",
  pages =        "1--34",
  month =        mar,
  year =         "2004",
  CODEN =        "CMSVAN",
  DOI =          "http://doi.acm.org/10.1145/1013208.1013209",
  ISSN =         "0360-0300 (print), 1557-7341 (electronic)",
  ISSN-L =       "0360-0300",
  bibdate =      "Thu Jun 19 10:19:47 MDT 2008",
  bibsource =    "http://www.acm.org/pubs/contents/journals/surveys/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Many developments have taken place within dataflow programming languages in the past decade.
In particular, there has been a great deal of activity and advancement in the field of dataflow visual programming languages. The motivation for this article is to review the content of these recent developments and how they came about. It is supported by an initial review of dataflow programming in the 1970s and 1980s that led to current topics of research. It then discusses how dataflow programming evolved toward a hybrid von Neumann dataflow formulation, and adopted a more coarse-grained approach. Recent trends toward dataflow visual programming languages are then discussed with reference to key graphical dataflow languages and their development environments. Finally, the article details four key open topics in dataflow programming languages.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Computing Surveys",
  keywords =     "co-ordination languages; component software; data flow visual programming; Dataflow; graphical programming; multithreading; software engineering",
}

@Article{Kalla:2004:IPC,
  author =       "Ron Kalla and Balaram Sinharoy and Joel M.
Tendler",
  title =        "{IBM Power5} Chip: {A} Dual-Core Multithreaded Processor",
  journal =      j-IEEE-MICRO,
  volume =       "24",
  number =       "2",
  pages =        "40--47",
  month =        mar # "\slash " # apr,
  year =         "2004",
  CODEN =        "IEMIDZ",
  DOI =          "http://dx.doi.org/10.1109/MM.2004.1289290",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Sat Dec 11 17:59:16 MST 2004",
  bibsource =    "http://www.computer.org/micro/mi2004/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/mags/mi/2004/02/m2040abs.htm; http://csdl.computer.org/dl/mags/mi/2004/02/m2040.htm; http://csdl.computer.org/dl/mags/mi/2004/02/m2040.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
}

@Article{Kapil:2004:CMP,
  author =       "Sanjiv Kapil and Harlan McGhan and Jesse Lawrendra",
  title =        "A Chip Multithreaded Processor for Network-Facing Workloads",
  journal =      j-IEEE-MICRO,
  volume =       "24",
  number =       "2",
  pages =        "20--30",
  month =        mar # "\slash " # apr,
  year =         "2004",
  CODEN =        "IEMIDZ",
  DOI =          "http://dx.doi.org/10.1109/MM.2004.1289288",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Sat Dec 11 17:59:16 MST 2004",
  bibsource =    "http://www.computer.org/micro/mi2004/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/mags/mi/2004/02/m2020abs.htm; http://csdl.computer.org/dl/mags/mi/2004/02/m2020.htm; http://csdl.computer.org/dl/mags/mi/2004/02/m2020.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
}

@Article{Kee:2004:MMM,
  author =       "Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha",
  title =        "Memory management for multi-threaded software {DSM} systems",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "30",
  number =       "1",
  pages =        "121--138",
  month =        jan,
  year =         "2004",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191",
  ISSN-L =       "0167-8191",
  bibdate =      "Sun Nov 7 05:53:52 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
http://www.sciencedirect.com/science/journal/01678191",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Computing",
}

%%% The two Krashinsky:2004:VTA entries share authors and title.  Suffix a
%%% is the ACM SIGARCH Computer Architecture News record and suffix b is
%%% the IEEE Micro record.  The last author's family name carries an
%%% acute accent, written here in BibTeX brace-accent form.

@Article{Krashinsky:2004:VTAa,
  author =       "Ronny Krashinsky and Christopher Batten and Mark Hampton and Steve Gerding and Brian Pharris and Jared Casper and Krste Asanovi{\'c}",
  title =        "The Vector-Thread Architecture",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "52--52",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Article{Krashinsky:2004:VTAb,
  author =       "Ronny Krashinsky and Christopher Batten and Mark Hampton and Steve Gerding and Brian Pharris and Jared Casper and Krste Asanovi{\'c}",
  title =        "The Vector-Thread Architecture",
  journal =      j-IEEE-MICRO,
  volume =       "24",
  number =       "6",
  pages =        "84--90",
  month =        nov # "\slash " # dec,
  year =         "2004",
  CODEN =        "IEMIDZ",
  DOI =          "http://dx.doi.org/10.1109/MM.2004.90",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 20 08:11:28 MDT 2005",
  bibsource =    "http://csdl.computer.org/comp/mags/mi/2004/06/m6toc.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/dl/mags/mi/2004/06/m6084.htm; http://csdl.computer.org/dl/mags/mi/2004/06/m6084.pdf; http://doi.ieeecomputersociety.org/10.1109/MM.2004.90",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
}

@Article{Kumar:2004:AST,
  author =       "Nagendra J. Kumar and Siddhartha Shivshankar and Alexander G.
Dean",
  title =        "Asynchronous software thread integration for efficient software",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "7",
  pages =        "37--46",
  month =        jul,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 2 05:49:55 MST 2004",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Kumar:2004:SIH,
  author =       "Rakesh Kumar and Dean M. Tullsen and Parthasarathy Ranganathan and Norman P. Jouppi and Keith I. Farkas",
  title =        "Single-{ISA} Heterogeneous Multi-Core Architectures for Multithreaded Workload Performance",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "2",
  pages =        "64--64",
  month =        mar,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:45 MDT 2006",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Article{Maris:2004:CCP,
  author =       "Justin T. Maris and Aaron W. Keen and Takashi Ishihara and Ronald A.
Olsson",
  title =        "A comparison of concurrent programming and cooperative multithreading under load balancing applications",
  journal =      j-CCPE,
  volume =       "16",
  number =       "4",
  pages =        "345--369",
  day =          "10",
  month =        apr,
  year =         "2004",
  CODEN =        "CCPEBO",
  DOI =          "http://dx.doi.org/10.1002/cpe.751",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat May 14 11:30:53 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and Experience",
  onlinedate =   "19 Jan 2004",
}

@Article{Marowka:2004:OOA,
  author =       "Ami Marowka and Zhenying Liu and Barbara Chapman",
  title =        "{OpenMP}-oriented applications for distributed shared memory architectures",
  journal =      j-CCPE,
  volume =       "16",
  number =       "4",
  pages =        "371--384",
  day =          "10",
  month =        apr,
  year =         "2004",
  CODEN =        "CCPEBO",
  DOI =          "http://dx.doi.org/10.1002/cpe.752",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Sat May 14 11:30:53 MDT 2005",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and Experience",
  onlinedate =   "19 Jan 2004",
}

@Article{Martin:2004:HPA,
  author =       "Mar{\'\i}a J.
Mart{\'\i}n and Marta Parada and Ram{\'o}n Doallo",
  title =        "High Performance Air Pollution Simulation Using {OpenMP}",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "28",
  number =       "3",
  pages =        "311--321",
  month =        jun,
  year =         "2004",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Sat Dec 4 12:39:13 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.wkap.nl/journalhome.htm/0920-8542",
  URL =          "http://ipsapp008.kluweronline.com/IPS/content/ext/x/J/5189/I/54/A/5/abstract.htm",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
}

@Article{Michael:2004:SLF,
  author =       "Maged M. Michael",
  title =        "Scalable lock-free dynamic memory allocation",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "6",
  pages =        "35--46",
  month =        may,
  year =         "2004",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/996841.996848",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 2 05:49:55 MST 2004",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Dynamic memory allocators (malloc/free) rely on mutual exclusion locks for protecting the consistency of their shared data structures under multithreading. The use of locking has many disadvantages with respect to performance, availability, robustness, and programming flexibility. A lock-free memory allocator guarantees progress regardless of whether some threads are delayed or even killed and regardless of scheduling policies. This paper presents a completely lock-free memory allocator. It uses only widely-available operating system support and hardware atomic instructions.
It offers guaranteed availability even under arbitrary thread termination and crash-failure, and it is immune to deadlock regardless of scheduling policies, and hence it can be used even in interrupt handlers and real-time applications without requiring special scheduler support. Also, by leveraging some high-level structures from Hoard, our allocator is highly scalable, limits space blowup to a constant factor, and is capable of avoiding false sharing. In addition, our allocator allows finer concurrency and much lower latency than Hoard. We use PowerPC shared memory multiprocessor systems to compare the performance of our allocator with the default AIX 5.1 libc malloc, and two widely-used multithread allocators, Hoard and Ptmalloc. Our allocator outperforms the other allocators in virtually all cases and often by substantial margins, under various levels of parallelism and allocation patterns. Furthermore, our allocator also offers the lowest contention-free latency among the allocators by significant margins.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Omma:2004:BMA,
  author =       "M.
Omma",
  title =        "On building multithreaded applications",
  journal =      j-IEEE-DISTRIB-SYST-ONLINE,
  volume =       "5",
  number =       "4",
  pages =        "1--3",
  month =        apr,
  year =         "2004",
  CODEN =        "????",
  DOI =          "http://dx.doi.org/10.1109/MDSO.2004.1301256",
  ISSN =         "1541-4922 (print), 1558-1683 (electronic)",
  ISSN-L =       "1541-4922",
  bibdate =      "Fri Jul 15 17:50:15 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/iel5/8968/28913/01301256.pdf?isnumber=28913&prod=JNL&arnumber=1301256&arSt=+1&ared=+3&arAuthor=Omma%2C+M.; http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=28913&arnumber=1301256&count=5&index=3",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Distributed Systems Online",
}

@Article{Roth:2004:MTC,
  author =       "Marcus Roth and Gerrit Voss and Dirk Reiners",
  title =        "Multi-threading and clustering for scene graph systems",
  journal =      j-COMPUTERS-AND-GRAPHICS,
  volume =       "28",
  number =       "1",
  pages =        "63--66",
  month =        feb,
  year =         "2004",
  CODEN =        "COGRD2",
  ISSN =         "0097-8493",
  ISSN-L =       "0097-8493",
  bibdate =      "Tue Jan 27 12:04:28 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/00978493",
  acknowledgement = ack-nhfb,
  fjournal =     "Computers and Graphics",
}

@Article{Sanden:2004:CJT,
  author =       "B.
Sanden",
  title =        "Coping with {Java} Threads: {Java} works for many kinds of concurrent software, but it was not designed for safety-critical real-time applications and does not protect the programmer from the pitfalls associated with multithreading",
  journal =      j-COMPUTER,
  volume =       "37",
  number =       "4",
  pages =        "20--27",
  month =        apr,
  year =         "2004",
  CODEN =        "CPTRB4",
  ISSN =         "0018-9162 (print), 1558-0814 (electronic)",
  ISSN-L =       "0018-9162",
  bibdate =      "Mon May 17 14:50:36 MDT 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; Ingenta database",
  acknowledgement = ack-nhfb,
  fjournal =     "Computer",
}

%%% Sanden:2004:CJT (the entry closed just above) originally had no month
%%% field.  The month was supplied from its issue number, volume 37
%%% number 4, on the basis that Computer is issued monthly.

@Article{Shin:2004:NAD,
  author =       "Chulho Shin and Seong-Won Lee and Jean-Luc Gaudiot",
  title =        "The Need for Adaptive Dynamic Thread Scheduling in Simultaneous Multithreading",
  journal =      j-PARALLEL-PROCESS-LETT,
  volume =       "14",
  number =       "3/4",
  pages =        "327--??",
  month =        sep # "\slash " # dec,
  year =         "2004",
  CODEN =        "PPLTEE",
  ISSN =         "0129-6264",
  bibdate =      "Thu Jul 7 07:41:25 MDT 2005",
  bibsource =    "http://ejournals.wspc.com.sg/ppl/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Parallel Processing Letters",
}

@Article{Thulasiraman:2004:FGL,
  author =       "Parimala Thulasiraman and Ashfaq A. Khokhar and Gerd Heber and Guang R.
Gao",
  title =        "A fine-grain load-adaptive algorithm of the {$2$D} discrete wavelet transform for multithreaded architectures",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "64",
  number =       "1",
  pages =        "68--78",
  month =        jan,
  year =         "2004",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Sat Dec 4 15:15:08 MST 2004",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
}

@Article{Tolmach:2004:IFL,
  author =       "Andrew Tolmach and Sergio Antoy and Marius Nita",
  title =        "Implementing functional logic languages using multiple threads and stores",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "9",
  pages =        "90--102",
  month =        sep,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 2 05:49:56 MST 2004",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Vrenios:2004:PPC,
  author =       "A. Vrenios",
  title =        "{Parallel Programming in C with MPI and OpenMP} [Book Review]",
  journal =      j-IEEE-DISTRIB-SYST-ONLINE,
  volume =       "5",
  number =       "1",
  pages =        "7.1--7.3",
  month =        "????",
  year =         "2004",
  CODEN =        "????",
  ISSN =         "1541-4922 (print), 1558-1683 (electronic)",
  ISSN-L =       "1541-4922",
  bibdate =      "Fri Jul 15 17:50:13 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://ieeexplore.ieee.org/iel5/8968/28452/01270716.pdf?isnumber=28452&prod=JNL&arnumber=1270716&arSt=+7.1&ared=+7.3&arAuthor=Vrenios%2C+A.; http://ieeexplore.ieee.org/xpls/abs_all.jsp?isnumber=28452&arnumber=1270716&count=8&index=5",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Distributed Systems Online",
}

@Article{Wang:2004:HTVa,
  author =       "Perry H. Wang and Jamison D.
Collins and Hong Wang and Dongkeun Kim and Bill Greene and Kai-Ming Chan and Aamir B. Yunus and Terry Sych and Stephen F. Moore and John P. Shen",
  title =        "Helper threads via virtual multithreading on an experimental {Itanium-2} processor-based platform",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "32",
  number =       "5",
  pages =        "144--155",
  month =        dec,
  year =         "2004",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:24 MDT 2006",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Article{Wang:2004:HTVb,
  author =       "Perry H. Wang and Jamison D. Collins and Hong Wang and Dongkeun Kim and Bill Greene and Kai-Ming Chan and Aamir B. Yunus and Terry Sych and Stephen F. Moore and John P. Shen",
  title =        "Helper threads via virtual multithreading on an experimental {Itanium-2} processor-based platform",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "11",
  pages =        "144--155",
  month =        nov,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Apr 12 09:38:13 MDT 2005",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}

@Article{Wang:2004:HTVc,
  author =       "Perry H. Wang and Jamison D. Collins and Hong Wang and Dongkeun Kim and Bill Greene and Kai-Ming Chan and Aamir B. Yunus and Terry Sych and Stephen F. Moore and John P.
Shen",
  title =        "Helper threads via virtual multithreading on an experimental {Itanium-2} processor-based platform",
  journal =      j-OPER-SYS-REV,
  volume =       "38",
  number =       "5",
  pages =        "144--155",
  month =        dec,
  year =         "2004",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:56 MDT 2006",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
}

@Article{Wang:2004:HTVd,
  author =       "Perry H. Wang and Jamison D. Collins and Hong Wang and Dongkeun Kim and Bill Greene and Kai-Ming Chan and Aamir B. Yunus and Terry Sych and Stephen F. Moore and John P. Shen",
  title =        "Helper Threads via Virtual Multithreading",
  journal =      j-IEEE-MICRO,
  volume =       "24",
  number =       "6",
  pages =        "74--82",
  month =        nov # "\slash " # dec,
  year =         "2004",
  CODEN =        "IEMIDZ",
  DOI =          "http://dx.doi.org/10.1109/MM.2004.75",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 20 08:11:28 MDT 2005",
  bibsource =    "http://csdl.computer.org/comp/mags/mi/2004/06/m6toc.htm; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/dl/mags/mi/2004/06/m6074.htm; http://csdl.computer.org/dl/mags/mi/2004/06/m6074.pdf; http://doi.ieeecomputersociety.org/10.1109/MM.2004.75",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
}

@Article{Zhuang:2004:BRA,
  author =       "Xiaotong Zhuang and Santosh Pande",
  title =        "Balancing register allocation across threads for a multithreaded network processor",
  journal =      j-SIGPLAN,
  volume =       "39",
  number =       "6",
  pages =        "289--300",
  month =        may,
  year =         "2004",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Thu Dec 2 05:49:55 MST 2004",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
}
%%% Entries with year = "2005" follow, ordered by first author.

@Article{Abraham:2005:ABP,
  author =       "Erika {\'A}brah{\'a}m and Frank S. de Boer and
                 Willem-Paul de Roever and Martin Steffen",
  title =        "An assertion-based proof system for multithreaded {Java}",
  journal =      j-THEOR-COMP-SCI,
  volume =       "331",
  number =       "2--3",
  pages =        "251--290",
  day =          "25",
  month =        feb,
  year =         "2005",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Fri Jul 8 14:05:15 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/03043975",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
}

%%% Erratum notice; its note field cites the corrected article.

@Article{Anonymous:2005:ECS,
  author =       "Anonymous",
  title =        "Errata: {{\em Characterization of Simultaneous
                 Multithreading (SMT) Efficiency in POWER5}}",
  journal =      j-IBM-JRD,
  volume =       "49",
  number =       "6",
  pages =        "1003--??",
  month =        nov,
  year =         "2005",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Fri Feb 9 21:39:23 MST 2007",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  note =         "See \cite{Mathis:2005:CSM}.",
  URL =          "http://www.research.ibm.com/journal/rd/496/errata.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  ordernumber =  "G322-0245-00",
}

@Article{Barabash:2005:PIM,
  author =       "Katherine Barabash and Ori Ben-Yitzhak and Irit Goft and
                 Elliot K. Kolodner and Victor Leikehman and Yoav Ossia and
                 Avi Owshanko and Erez Petrank",
  title =        "A parallel, incremental, mostly concurrent garbage
                 collector for servers",
  journal =      j-TOPLAS,
  volume =       "27",
  number =       "6",
  pages =        "1097--1146",
  month =        nov,
  year =         "2005",
  CODEN =        "ATPSDT",
  DOI =          "http://doi.acm.org/10.1145/1108970.1108972",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Wed Jan 11 05:23:15 MST 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multithreaded applications with multigigabyte heaps
                 running on modern servers provide new challenges for
                 garbage collection (GC). The challenges for
                 ``server-oriented'' GC include: ensuring short pause times
                 on a multigigabyte heap while minimizing throughput
                 penalty, good scaling on multiprocessor hardware, and
                 keeping the number of expensive multicycle fence
                 instructions required by weak ordering to a minimum. We
                 designed and implemented a collector facing these demands
                 building on the mostly concurrent garbage collector
                 proposed by Boehm et al. [1991]. Our collector
                 incorporates new ideas into the original collector. We
                 make it parallel and incremental; we employ concurrent
                 low-priority background GC threads to take advantage of
                 processor idle time; we propose novel algorithmic
                 improvements to the basic mostly concurrent algorithm
                 improving its efficiency and shortening its pause times;
                 and finally, we use advanced techniques, such as a
                 low-overhead work packet mechanism to enable full
                 parallelism among the incremental and concurrent
                 collecting threads and ensure load balancing. We compared
                 the new collector to the mature, well-optimized, parallel,
                 stop-the-world mark-sweep collector already in the IBM
                 JVM. When allowed to run aggressively, using 72\% of the
                 CPU utilization during a short concurrent phase, our
                 collector prototype reduces the maximum pause time from
                 161 ms to 46 ms while only losing 11.5\% throughput when
                 running the SPECjbb2000 benchmark on a 600-MB heap on an
                 8-way PowerPC 1.1-GHz processors. When the collector is
                 limited to a nonintrusive operation using only 29\% of the
                 CPU utilization, the maximum pause time obtained is 79 ms
                 and the loss in throughput is 15.4\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and Systems",
}

@Article{Boehm:2005:TCI,
  author =       "Hans-J. Boehm",
  title =        "Threads cannot be implemented as a library",
  journal =      j-SIGPLAN,
  volume =       "40",
  number =       "6",
  pages =        "261--268",
  month =        jun,
  year =         "2005",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1065010.1065042",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 21 17:04:05 MDT 2005",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In many environments, multi-threaded code is written in a
                 language that was originally designed without thread
                 support (e.g. C), to which a library of threading
                 primitives was subsequently added. There appears to be a
                 general understanding that this is not the right approach.
                 We provide specific arguments that a pure library
                 approach, in which the compiler is designed independently
                 of threading issues, cannot guarantee correctness of the
                 resulting code. We first review why the approach almost
                 works, and then examine some of the surprising behavior it
                 may entail. We further illustrate that there are very
                 simple cases in which a pure library-based approach seems
                 incapable of expressing an efficient parallel algorithm.
                 Our discussion takes place in the context of C with
                 Pthreads, since it is commonly used, reasonably well
                 specified, and does not attempt to ensure type-safety,
                 which would entail even stronger constraints. The issues
                 we raise are not specific to that context.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  remark =       "This is an important paper: it shows that current
                 languages cannot be reliable for threaded programming
                 without language changes that prevent compiler
                 optimizations from foiling synchronization methods and
                 memory barriers. The article's author and others are
                 collaborating on a proposal for changes to the C++
                 language to remedy this, but that still leaves threads
                 unreliable in C code, even with POSIX threads.",
}

@Article{Constantinou:2005:PIS,
  author =       "Theofanis Constantinou and Yiannakis Sazeides and
                 Pierre Michaud and Damien Fetis and Andre Seznec",
  title =        "Performance implications of single thread migration on a
                 chip multi-core",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "4",
  pages =        "80--91",
  month =        nov,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:08 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Article{Flanagan:2005:MVM,
  author =       "Cormac Flanagan and Stephen N. Freund and Shaz Qadeer and Sanjit A.
Seshia",
  title =        "Modular verification of multithreaded programs",
  journal =      j-THEOR-COMP-SCI,
  volume =       "338",
  number =       "1--3",
  pages =        "153--183",
  day =          "10",
  month =        jun,
  year =         "2005",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Fri Jul 8 14:05:16 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/03043975",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
}

%%% IBM JRD double issue: number and month use the same "2/3" and
%%% mar/may form as the other IBM JRD double-issue entries in this file;
%%% the URL path rd/492 (volume 49, issue 2) points at the same issue.

@Article{Giampapa:2005:BGA,
  author =       "M. E. Giampapa and R. Bellofatto and M. A. Blumrich and
                 D. Chen and M. B. Dombrowa and A. Gara and R. A. Haring and
                 P. Heidelberger and D. Hoenicke and G. V. Kopcsay and
                 B. J. Nathanson and B. D. Steinmacher-Burow and
                 M. Ohmacht and V. Salapura and P. Vranas",
  title =        "{Blue Gene/L} advanced diagnostics environment",
  journal =      j-IBM-JRD,
  volume =       "49",
  number =       "2/3",
  pages =        "319--331",
  month =        mar # " \slash " # may,
  year =         "2005",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Wed Jun 1 08:14:41 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/492/giampapa.pdf",
  abstract =     "This paper describes the Blue Gene/L advanced diagnostics
                 environment (ADE) used throughout all aspects of the Blue
                 Gene/L project, including design, logic verification,
                 bringup, diagnostics, and manufacturing test. The Blue
                 Gene/L ADE consists of a lightweight multithreaded
                 coherence-managed kernel, runtime libraries, device
                 drivers, system programming interfaces, compilers, and
                 host-based development tools. It provides complete and
                 flexible access to all features of the Blue Gene/L
                 hardware. Prior to the existence of hardware, ADE was used
                 on Very high-speed integrated circuit Hardware Description
                 Language (VHDL) models, not only for logic verification,
                 but also for performance measurements, code-path analysis,
                 and evaluation of architectural tradeoffs. During early
                 hardware bring-up, the ability to run in a
                 cycle-reproducible manner on both hardware and VHDL proved
                 invaluable in fault isolation and analysis. However, ADE
                 is also capable of supporting high-performance
                 applications and parallel test cases, thereby permitting
                 us to stress the hardware to the limits of its
                 capabilities. This paper also provides insights into
                 system-level and device-level programming of Blue Gene/L
                 to assist developers of high-performance applications to
                 more fully exploit the performance of the machine.",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  ordernumber =  "G322-0240",
}

@Article{Gil:2005:TCS,
  author =       "Marisa Gil and Ruben Pinilla",
  title =        "Thread coloring: a scheduler proposal from user to
                 hardware threads",
  journal =      j-OPER-SYS-REV,
  volume =       "39",
  number =       "2",
  pages =        "54--70",
  month =        apr,
  year =         "2005",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:43 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Gustafsson:2005:TP,
  author =       "Andreas Gustafsson",
  title =        "Threads without the pain",
  journal =      j-QUEUE,
  volume =       "3",
  number =       "9",
  pages =        "42--47",
  month =        nov,
  year =         "2005",
  CODEN =        "AQCUAE",
  ISSN =         "1542-7730 (print), 1542-7749 (electronic)",
  ISSN-L =       "1542-7730",
  bibdate =      "Sat Dec 17 07:37:28 MST 2005",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Queue: Tomorrow's Computing Today",
}

@Article{Keller:2005:TBV,
  author =       "J{\"o}rg Keller and Andreas
Gr{\"a}vinghoff",
  title =        "Thread-Based Virtual Duplex Systems in Embedded
                 Environments",
  journal =      j-IEEE-MICRO,
  volume =       "25",
  number =       "2",
  pages =        "60--69",
  month =        mar # "\slash " # apr,
  year =         "2005",
  CODEN =        "IEMIDZ",
  DOI =          "http://dx.doi.org/10.1109/MM.2005.39",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 20 08:11:29 MDT 2005",
  bibsource =    "http://csdl.computer.org/comp/mags/mi/2005/02/m2toc.htm;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/mags/mi/2005/02/m2060abs.htm;
                 http://csdl.computer.org/dl/mags/mi/2005/02/m2060.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
}

@Article{Kongetira:2005:NWM,
  author =       "Poonacha Kongetira and Kathirgamar Aingaran and
                 Kunle Olukotun",
  title =        "{Niagara}: {A} 32-Way Multithreaded {Sparc} Processor",
  journal =      j-IEEE-MICRO,
  volume =       "25",
  number =       "2",
  pages =        "21--29",
  month =        mar # "\slash " # apr,
  year =         "2005",
  CODEN =        "IEMIDZ",
  DOI =          "http://dx.doi.org/10.1109/MM.2005.35",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 20 08:11:29 MDT 2005",
  bibsource =    "http://csdl.computer.org/comp/mags/mi/2005/02/m2toc.htm;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/mags/mi/2005/02/m2021abs.htm;
                 http://csdl.computer.org/dl/mags/mi/2005/02/m2021.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
}

@Article{Li:2005:OSA,
  author =       "Xiaoye S.
Li",
  title =        "An overview of {SuperLU}: {Algorithms}, implementation,
                 and user interface",
  journal =      j-TOMS,
  volume =       "31",
  number =       "3",
  pages =        "302--325",
  month =        sep,
  year =         "2005",
  CODEN =        "ACMSCU",
  DOI =          "http://doi.acm.org/10.1145/1089014.1089017",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Wed Oct 5 07:43:35 MDT 2005",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toms/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We give an overview of the algorithms, design philosophy,
                 and implementation techniques in the software SuperLU, for
                 solving sparse unsymmetric linear systems. In particular,
                 we highlight the differences between the sequential
                 SuperLU (including its multithreaded extension) and
                 parallel SuperLU\_DIST. These include the numerical
                 pivoting strategy, the ordering strategy for preserving
                 sparsity, the ordering in which the updating tasks are
                 performed, the numerical kernel, and the parallelization
                 strategy. Because of the scalability concern, the parallel
                 code is drastically different from the sequential one. We
                 describe the user interfaces of the libraries, and
                 illustrate how to use the libraries most efficiently
                 depending on some matrix characteristics. Finally, we give
                 some examples of how the solver has been used in
                 large-scale scientific applications, and the
                 performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Mathematical Software",
}

@Article{Loepere:2005:STM,
  author =       "Keith Loepere",
  title =        "Stackable thread mechanisms",
  journal =      j-OPER-SYS-REV,
  volume =       "39",
  number =       "4",
  pages =        "4--17",
  month =        oct,
  year =         "2005",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:53 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Mathis:2005:CSM,
  author =       "H. M. Mathis and A. E.
Mericas and J. D. McCalpin and R. J. Eickemeyer and S. R. Kunkel",
  title =        "Characterization of simultaneous multithreading ({SMT})
                 efficiency in {POWER5}",
  journal =      j-IBM-JRD,
  volume =       "49",
  number =       "4/5",
  pages =        "555--564",
  month =        jul # " \slash " # sep,
  year =         "2005",
  CODEN =        "IBMJAE",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Wed Oct 5 07:12:31 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/494/mathis.html",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
}

@Article{McNairy:2005:MDC,
  author =       "Cameron McNairy and Rohit Bhatia",
  title =        "{Montecito}: {A} Dual-Core, Dual-Thread {Itanium}
                 Processor",
  journal =      j-IEEE-MICRO,
  volume =       "25",
  number =       "2",
  pages =        "10--20",
  month =        mar # "\slash " # apr,
  year =         "2005",
  CODEN =        "IEMIDZ",
  DOI =          "http://dx.doi.org/10.1109/MM.2005.34",
  ISSN =         "0272-1732 (print), 1937-4143 (electronic)",
  ISSN-L =       "0272-1732",
  bibdate =      "Wed Apr 20 08:11:29 MDT 2005",
  bibsource =    "http://csdl.computer.org/comp/mags/mi/2005/02/m2toc.htm;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://csdl.computer.org/comp/mags/mi/2005/02/m2010abs.htm;
                 http://csdl.computer.org/dl/mags/mi/2005/02/m2010.pdf",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Micro",
}

@Article{Mudigonda:2005:MMA,
  author =       "Jayaram Mudigonda and Harrick M.
Vin and Raj Yavatkar",
  title =        "Managing memory access latency in packet processing",
  journal =      j-SIGMETRICS,
  volume =       "33",
  number =       "1",
  pages =        "396--397",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1064212.1064272",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Jun 27 09:21:27 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this study, we refute the popular belief [1,2] that
                 packet processing does not benefit from data-caching. We
                 show that a small data-cache of 8KB can bring down the
                 packet processing time by much as 50-90\%, while reducing
                 the off-chip memory bandwidth usage by about 60-95\%. We
                 also show that, unlike general-purpose computing, packet
                 processing, due to its memory-intensive nature, cannot
                 rely exclusively on data-caching to eliminate the memory
                 bottleneck completely.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  keywords =     "data-caches; multithreading; network processors",
}

@Article{Petric:2005:EEP,
  author =       "Vlad Petric and Amir Roth",
  title =        "Energy-Effectiveness of Pre-Execution and Energy-Aware
                 {P}-Thread Selection",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "2",
  pages =        "322--333",
  month =        may,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:40:51 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Article{Ruan:2005:EIS,
  author =       "Yaoping Ruan and Vivek S. Pai and Erich Nahum and John M.
Tracey",
  title =        "Evaluating the impact of simultaneous multithreading on
                 network servers using real hardware",
  journal =      j-SIGMETRICS,
  volume =       "33",
  number =       "1",
  pages =        "315--326",
  month =        jun,
  year =         "2005",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1071690.1064254",
  ISSN =         "0163-5999 (print), 1557-9484 (electronic)",
  ISSN-L =       "0163-5999",
  bibdate =      "Fri Jun 27 09:21:27 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper examines the performance of simultaneous
                 multithreading (SMT) for network servers using actual
                 hardware, multiple network server applications, and
                 several workloads. Using three versions of the Intel Xeon
                 processor with Hyper-Threading, we perform macroscopic
                 analysis as well as microarchitectural measurements to
                 understand the origins of the performance bottlenecks for
                 SMT processors in these environments. The results of our
                 evaluation suggest that the current SMT support in the
                 Xeon is application and workload sensitive, and may not
                 yield significant benefits for network servers. In
                 general, we find that enabling SMT on real hardware
                 usually produces only slight performance gains, and can
                 sometimes lead to performance loss. In the uniprocessor
                 case, previous studies appear to have neglected the OS
                 overhead in switching from a uniprocessor kernel to an
                 SMT-enabled kernel. The performance loss associated with
                 such support is comparable to the gains provided by SMT.
                 In the 2-way multiprocessor case, the higher number of
                 memory references from SMT often causes the memory system
                 to become the bottleneck, offsetting any processor
                 utilization gains. This effect is compounded by the
                 growing gap between processor speeds and memory latency.
                 In trying to understand the large gains shown by
                 simulation studies, we find that while the general trends
                 for microarchitectural behavior agree with real hardware,
                 differences in sizing assumptions and performance models
                 yield much more optimistic benefits for SMT than we
                 observe.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGMETRICS Performance Evaluation Review",
  keywords =     "network server; simultaneous multithreading (SMT)",
}

@Article{Sendag:2005:IIS,
  author =       "Resit Sendag and Ying Chen and David J. Lilja",
  title =        "The Impact of Incorrectly Speculated Memory Operations in
                 a Multithreaded Architecture",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "16",
  number =       "3",
  pages =        "271--285",
  month =        mar,
  year =         "2005",
  CODEN =        "ITDSEO",
  DOI =          "http://dx.doi.org/10.1109/TPDS.2005.36",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  ISSN-L =       "1045-9219",
  bibdate =      "Thu Nov 10 08:30:29 MST 2005",
  bibsource =    "http://www.computer.org/tpds/td2005/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Transactions on Parallel and Distributed Systems",
}

@Article{Stark:2005:FSV,
  author =       "Robert F. St{\"a}rk",
  title =        "Formal specification and verification of the {C\#} thread
                 model",
  journal =      j-THEOR-COMP-SCI,
  volume =       "343",
  number =       "3",
  pages =        "482--508",
  day =          "17",
  month =        oct,
  year =         "2005",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Tue Mar 29 06:48:50 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/03043975",
  abstract =     "We present a high-level Abstract State Machine (ASM) model
                 of C\# threads and the .NET memory model. We focus on
                 purely managed, fully portable threading features of C\#.
                 The sequential model interleaves the computation steps of
                 the currently running threads and is suitable for
                 uniprocessors.
The parallel model addresses problems of true concurrency on
                 multi-processor systems. The models provide a sound basis
                 for the development of multi-threaded applications in
                 C\#. The thread and memory models complete the abstract
                 operational semantics of C\# in [B{\"o}rger et al.
                 Theoret. Comput. Sci., to appear]. The main invariants of
                 the thread model concerning locks, monitors and mutual
                 exclusion are formally verified in the AsmTP system, an
                 interactive proof assistant based on ASM logic.",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
}

@Article{Steinke:2005:NPF,
  author =       "Robert Steinke and Micah Clark and Elihu McMahon",
  title =        "A new pattern for flexible worker threads with in-place
                 consumption message queues",
  journal =      j-OPER-SYS-REV,
  volume =       "39",
  number =       "2",
  pages =        "71--73",
  month =        apr,
  year =         "2005",
  CODEN =        "OSRED8",
  ISSN =         "0163-5980",
  ISSN-L =       "0163-5980",
  bibdate =      "Sat Aug 26 08:55:43 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Operating Systems Review",
}

@Article{Sundell:2005:FLF,
  author =       "H{\aa}kan Sundell and Philippas Tsigas",
  title =        "Fast and lock-free concurrent priority queues for
                 multi-thread systems",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "65",
  number =       "5",
  pages =        "609--627",
  month =        may,
  year =         "2005",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Fri Jul 11 20:32:33 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
}

@Article{Tian:2005:PCT,
  author =       "Xinmin Tian and Milind Girkar and Aart Bik and
                 Hideki Saito",
  title =        "Practical Compiler Techniques on Efficient Multithreaded
                 Code Generation for {OpenMP} Programs",
  journal =      j-COMP-J,
  volume =       "48",
  number =       "5",
  pages =        "588--601",
  month =        sep,
  year =         "2005",
  CODEN =        "CMPJA6",
  DOI =          "http://dx.doi.org/10.1093/comjnl/bxh109",
  ISSN =         "0010-4620 (print), 1460-2067 (electronic)",
  ISSN-L =       "0010-4620",
  bibdate =      "Tue Nov 8 05:58:50 MST 2005",
  bibsource =    "http://comjnl.oxfordjournals.org/content/vol48/issue5/index.dtl;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://comjnl.oxfordjournals.org/cgi/content/abstract/48/5/588;
                 http://comjnl.oxfordjournals.org/cgi/reprint/48/5/588",
  acknowledgement = ack-nhfb,
  fjournal =     "The Computer Journal",
}

@Article{Vachharajani:2005:CMP,
  author =       "Neil Vachharajani and Matthew Iyer and Chinmay Ashok and
                 Manish Vachharajani and David I. August and
                 Daniel Connors",
  title =        "Chip multi-processor scalability for single-threaded
                 applications",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "33",
  number =       "4",
  pages =        "44--53",
  month =        nov,
  year =         "2005",
  CODEN =        "CANED2",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Fri May 12 09:41:08 MDT 2006",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

%%% Entries with year = "2006" follow, ordered by first author.

@Article{Abadi:2006:TSL,
  author =       "Martin Abadi and Cormac Flanagan and Stephen N. Freund",
  title =        "Types for safe locking: {Static} race detection for
                 {Java}",
  journal =      j-TOPLAS,
  volume =       "28",
  number =       "2",
  pages =        "207--255",
  month =        mar,
  year =         "2006",
  CODEN =        "ATPSDT",
  DOI =          "http://doi.acm.org/10.1145/1119479.1119480",
  ISSN =         "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L =       "0164-0925",
  bibdate =      "Fri Mar 10 18:46:58 MST 2006",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toplas/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This article presents a static race-detection analysis for
                 multithreaded shared-memory programs, focusing on the Java
                 programming language.
The analysis is based on a type system that captures many common
                 synchronization patterns. It supports classes with
                 internal synchronization, classes that require client-side
                 synchronization, and thread-local classes. In order to
                 demonstrate the effectiveness of the type system, we have
                 implemented it in a checker and applied it to over 40,000
                 lines of hand-annotated Java code. We found a number of
                 race conditions in the standard Java libraries and other
                 test programs. The checker required fewer than 20
                 additional type annotations per 1,000 lines of code. This
                 article also describes two improvements that facilitate
                 checking much larger programs: an algorithm for annotation
                 inference and a user interface that clarifies warnings
                 generated by the checker. These extensions have enabled us
                 to use the checker for identifying race conditions in
                 large-scale software systems with up to 500,000 lines of
                 code.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Programming Languages and Systems",
}

@Article{Adl-Tabatabai:2006:CRS,
  author =       "Ali-Reza Adl-Tabatabai and Brian T. Lewis and
                 Vijay Menon and Brian R. Murphy and Bratin Saha and
                 Tatiana Shpeisman",
  title =        "Compiler and runtime support for efficient software
                 transactional memory",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "6",
  pages =        "26--37",
  month =        jun,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1133981.1133985",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:42:48 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Programmers have traditionally used locks to synchronize
                 concurrent access to shared data. Lock-based
                 synchronization, however, has well-known pitfalls: using
                 locks for fine-grain synchronization and composing code
                 that already uses locks are both difficult and prone to
                 deadlock. Transactional memory provides an alternate
                 concurrency control mechanism that avoids these pitfalls
                 and significantly eases concurrent programming.
                 Transactional memory language constructs have recently
                 been proposed as extensions to existing languages or
                 included in new concurrent language specifications,
                 opening the door for new compiler optimizations that
                 target the overheads of transactional memory. This paper
                 presents compiler and runtime optimizations for
                 transactional memory language constructs. We present a
                 high-performance software transactional memory system
                 (STM) integrated into a managed runtime environment. Our
                 system efficiently implements nested transactions that
                 support both composition of transactions and partial roll
                 back. Our JIT compiler is the first to optimize the
                 overheads of STM, and we show novel techniques for
                 enabling JIT optimizations on STM operations. We measure
                 the performance of our optimizations on a 16-way SMP
                 running multi-threaded transactional workloads. Our
                 results show that these techniques enable transactional
                 memory's performance to compete with that of well-tuned
                 synchronization.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "code generation; compiler optimizations; locking;
                 synchronization; transactional memory; virtual machines",
}

@Article{Agerwala:2006:SRC,
  author =       "T. Agerwala and M.
Gupta",
  title =        "Systems research challenges: {A} scale-out perspective",
  journal =      j-IBM-JRD,
  volume =       "50",
  number =       "2/3",
  pages =        "173--??",
  month =        mar # " \slash " # may,
  year =         "2006",
  CODEN =        "IBMJAE",
  DOI =          "http://dx.doi.org/10.1147/rd.502.0173",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Fri Feb 9 20:16:31 MST 2007",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/502/agerwala.html",
  abstract =     "A scale-out system is a collection of interconnected,
                 modular, low-cost computers that work as a single entity
                 to cooperatively provide applications, systems resources,
                 and data to users. The dominant programming model for such
                 systems consists of message passing at the systems level
                 and multithreading at the element level. Scale-out
                 computers have traditionally been developed and deployed
                 to provide levels of performance (throughput and parallel
                 processing) beyond what was achievable by large
                 shared-memory computers that utilized the fastest
                 processors and the most expensive memory systems. Today,
                 exploiting scale-out at all levels in systems is becoming
                 imperative in order to overcome a fundamental
                 discontinuity in the development of microprocessor
                 technology caused by power dissipation. The pervasive use
                 of greater levels of scale-out, on the other hand, creates
                 its own challenges in architecture, programming, systems
                 management, and reliability. This position paper
                 identifies some of the important research problems that
                 must be addressed in order to deal with the technology
                 disruption and fully realize the opportunity offered by
                 scale-out. Our examples are based on parallelism, but the
                 challenges we identify apply to scale-out more
                 generally.",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  ordernumber =  "G322-0247-00",
}

@Article{Bacon:2006:BFL,
  author =       "D. F. Bacon and X.
Shen",
  title =        "Braids and fibers: {Language} constructs with
                 architectural support for adaptive responses to memory
                 latencies",
  journal =      j-IBM-JRD,
  volume =       "50",
  number =       "2/3",
  pages =        "209--??",
  month =        mar # " \slash " # may,
  year =         "2006",
  CODEN =        "IBMJAE",
  DOI =          "http://dx.doi.org/10.1147/rd.502.0209",
  ISSN =         "0018-8646 (print), 2151-8556 (electronic)",
  ISSN-L =       "0018-8646",
  bibdate =      "Fri Feb 9 20:16:31 MST 2007",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 http://www.research.ibm.com/journal/",
  URL =          "http://www.research.ibm.com/journal/rd/502/bacon.html",
  abstract =     "As processor speeds continue to increase at a much higher
                 rate than memory speeds, memory latencies may soon
                 approach a thousand processor cycles. As a result, the
                 flat memory model that was made practical by deeply
                 pipelined superscalar processors with multilevel caches
                 will no longer be tenable. The most common approach to
                 this problem is multithreading; however, multithreading
                 requires either abundant independent applications or
                 well-parallelized monolithic applications, and neither is
                 easy to come by. We present high-level programming
                 constructs called braids and fibers. The programming
                 constructs facilitate the creation of programs that are
                 partially ordered, in which the partial orders can be used
                 to support adaptive responses to memory access latencies.
                 Braiding is simpler than parallelizing, while yielding
                 many of the same benefits. We show how the programming
                 constructs can be effectively supported with simple
                 instruction set architecture extensions and
                 microarchitectural enhancements. We have developed braided
                 versions of a number of important algorithms. The braided
                 code is easy to understand at the source level and can be
                 translated into highly efficient instructions using our
                 architecture extensions.",
  acknowledgement = ack-nhfb,
  fjournal =     "IBM Journal of Research and Development",
  ordernumber =  "G322-0247-00",
}

@Article{Chakraborty:2006:CSE,
  author =       "Koushik Chakraborty and Philip M. Wells and
                 Gurindar S. Sohi",
  title =        "Computation spreading: employing hardware migration to
                 specialize {CMP} cores on-the-fly",
  journal =      j-SIGPLAN,
  volume =       "41",
  number =       "11",
  pages =        "283--292",
  month =        nov,
  year =         "2006",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1168919.1168893",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 10:49:40 MDT 2008",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In canonical parallel processing, the operating system
                 (OS) assigns a processing core to a single thread from a
                 multithreaded server application. Since different threads
                 from the same application often carry out similar
                 computation, albeit at different times, we observe
                 extensive code reuse among different processors, causing
                 redundancy (e.g., in our server workloads, 45-65\% of all
                 instruction blocks are accessed by all processors).
                 Moreover, largely independent fragments of computation
                 compete for the same private resources causing
                 destructive interference. Together, this redundancy and
                 interference lead to poor utilization of private
                 microarchitecture resources such as caches and branch
                 predictors. We present Computation Spreading (CSP), which
                 employs hardware migration to distribute a thread's
                 dissimilar fragments of computation across the multiple
                 processing cores of a chip multiprocessor (CMP), while
                 grouping similar computation fragments from different
                 threads together.
This paper focuses on a specific example of CSP for OS intensive server applications: separating application level (user) computation from the OS calls it makes. When performing CSP, each core becomes temporally specialized to execute certain computation fragments, and the same core is repeatedly used for such fragments. We examine two specific thread assignment policies for CSP, and show that these policies, across four server workloads, are able to reduce instruction misses in private L2 caches by 27-58\%, private L2 load misses by 0-19\%, and branch mispredictions by 9-25\%.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "cache locality; dynamic specialization",
}

@Article{Chuang:2006:UPB,
  author = "Weihaw Chuang and Satish Narayanasamy and Ganesh Venkatesh and Jack Sampson and Michael Van Biesbrouck and Gilles Pokam and Brad Calder and Osvaldo Colavin",
  title = "Unbounded page-based transactional memory",
  journal = j-SIGPLAN,
  volume = "41",
  number = "11",
  pages = "347--358",
  month = nov,
  year = "2006",
  CODEN = "SINODQ",
  DOI = "http://doi.acm.org/10.1145/1168918.1168901",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Wed Jun 18 10:49:40 MDT 2008",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "Exploiting thread level parallelism is paramount in the multicore era. Transactions enable programmers to expose such parallelism by greatly simplifying the multi-threaded programming model. Virtualized transactions (unbounded in space and time) are desirable, as they can increase the scope of transactions' use, and thereby further simplify a programmer's job. However, hardware support is essential to support efficient execution of unbounded transactions. In this paper, we introduce Page-based Transactional Memory to support unbounded transactions.
We combine transaction bookkeeping with the virtual memory system to support fast transaction conflict detection, commit, abort, and to maintain transactions' speculative data.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "concurrency; parallel programming; transactional memory; transactions; virtual memory",
}

@Article{Gomez:2006:SCM,
  author = "Juan Carlos Gomez and Vernon Rego and V. S. Sunderam",
  title = "Scheduling communication in multithreaded programs: experimental results",
  journal = j-CCPE,
  volume = "18",
  number = "1",
  pages = "1--28",
  month = jan,
  year = "2006",
  CODEN = "CCPEBO",
  DOI = "http://dx.doi.org/10.1002/cpe.904",
  ISSN = "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L = "1532-0626",
  bibdate = "Mon Dec 5 10:08:00 MST 2011",
  bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/ccpe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "Concurrency and Computation: Prac\-tice and Experience",
  onlinedate = "13 Sep 2005",
}

@Article{Gomez:2006:STC,
  author = "Juan Carlos Gomez and Jorge R.
Ramos and Vernon Rego",
  title = "Signals, timers, and continuations for multithreaded user-level protocols",
  journal = j-SPE,
  volume = "36",
  number = "5",
  pages = "449--471",
  day = "25",
  month = apr,
  year = "2006",
  CODEN = "SPEXBL",
  DOI = "http://dx.doi.org/10.1002/spe.700",
  ISSN = "0038-0644 (print), 1097-024X (electronic)",
  ISSN-L = "0038-0644",
  bibdate = "Wed Oct 17 18:33:12 MDT 2007",
  bibsource = "http://www.interscience.wiley.com/jpages/0038-0644; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html",
  acknowledgement = ack-nhfb,
  fjournal = "Software---Practice and Experience",
  onlinedate = "19 Jan 2006",
}

@Article{Grelck:2006:SFA,
  author = "Clemens Grelck and Sven-Bodo Scholz",
  title = "{SAC} --- {A} Functional Array Language for Efficient Multi-threaded Execution",
  journal = j-INT-J-PARALLEL-PROG,
  volume = "34",
  number = "4",
  pages = "383--427",
  month = aug,
  year = "2006",
  CODEN = "IJPPE5",
  DOI = "http://dx.doi.org/10.1007/s10766-006-0018-x",
  ISSN = "0885-7458 (print), 1573-7640 (electronic)",
  ISSN-L = "0885-7458",
  bibdate = "Wed Jul 9 16:06:07 MDT 2008",
  bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=34&issue=4; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=34&issue=4&spage=383",
  acknowledgement = ack-nhfb,
  fjournal = "International Journal of Parallel Programming",
  keywords = "Compiler optimisation; data parallel programming; multi-threading; Single Assignment C",
}

@Article{Kaiser:2006:CJC,
  author = "Claude Kaiser and Jean-Fran{\c{c}}ois Pradat-Peyre and Sami {\'E}vangelista and Pierre Rousseau",
  title = "Comparing {Java}, {C\#} and {Ada} monitors queuing policies: a case study and its {Ada} refinement",
  journal = j-SIGADA-LETTERS,
  volume = "26",
  number = "2",
  pages = "23--37",
  month = aug,
  year = "2006",
  CODEN = "AALEE5",
  DOI = "http://doi.acm.org/10.1145/1165678.1165681",
  ISSN = "0736-721X",
  ISSN-L = "0736-721X",
  bibdate = "Tue Jun 17 09:16:14 MDT 2008",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "Learning concurrency paradigms is necessary but it is not sufficient since the choice of run-time semantics may introduce subtle programming errors. It is the aim of this paper to exemplify the importance of process queuing and awaking policies resulting from possible choices of the monitor concept implementation. The first part of the paper compares the behaviour of concurrent processes sharing a unique waiting queue for condition synchronization when implemented in Java or in Ada. A particular solution of the dining philosophers paradigm will be used to show how the difference in the monitor semantics may lead or not to deadlock. This comparison provides insight for deriving a correct Java implementation. The second part of the paper shows how the implementation can be refined when using Ada entry families and requeue with requeue once restriction. The result is elegant, safe and fair, and deterministic. This paper ends with quantitative comparisons of concurrency complexity and of concurrency effectiveness. We conclude that Java and C\# multithreading need defensive concurrent programming while Ada allows more latitude for developing correct concurrent programs.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGAda Ada Letters",
}

@Article{Kim:2006:ERI,
  author = "Seon Wook Kim and Chong-Liang Ooi and Rudolf Eigenmann and Babak Falsafi and T. N.
Vijaykumar",
  title = "Exploiting reference idempotency to reduce speculative storage overflow",
  journal = j-TOPLAS,
  volume = "28",
  number = "5",
  pages = "942--965",
  month = sep,
  year = "2006",
  CODEN = "ATPSDT",
  DOI = "http://doi.acm.org/10.1145/1152649.1152653",
  ISSN = "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L = "0164-0925",
  bibdate = "Wed Sep 6 07:13:55 MDT 2006",
  bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "Recent proposals for multithreaded architectures employ speculative execution to allow threads with unknown dependences to execute speculatively in parallel. The architectures use hardware speculative storage to buffer speculative data, track data dependences and correct incorrect executions through roll-backs. Because all memory references access the speculative storage, current proposals implement speculative storage using small memory structures to achieve fast access. The limited capacity of the speculative storage causes considerable performance loss due to speculative storage overflow whenever a thread's speculative state exceeds the speculative storage capacity. Larger threads exacerbate the overflow problem but are preferable to smaller threads, as larger threads uncover more parallelism. In this article, we discover a new program property called memory reference idempotency. Idempotent references are guaranteed to be eventually corrected, though the references may be temporarily incorrect in the process of speculation. Therefore, idempotent references, even from nonparallelizable program sections, need not be tracked in the speculative storage, and instead can directly access nonspeculative storage (i.e., conventional memory hierarchy). Thus, we reduce the demand for speculative storage space in large threads. We define a formal framework for reference idempotency and present a novel compiler-assisted speculative execution model.
We prove the necessary and sufficient conditions for reference idempotency using our model. We present a compiler algorithm to label idempotent memory references for the hardware. Experimental results show that for our benchmarks, over 60\% of the references in nonparallelizable program sections are idempotent.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM Transactions on Programming Languages and Systems",
}

@TechReport{Lee:2006:PT,
  author = "Edward A. Lee",
  title = "The Problem with Threads",
  type = "Technical Report",
  number = "UCB/EECS-2006-1",
  institution = "Electrical Engineering and Computer Sciences. University of California at Berkeley",
  address = "Berkeley, CA, USA",
  day = "10",
  month = jan,
  year = "2006",
  bibdate = "Thu Oct 23 15:07:59 2008",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.eecs.berkeley.edu/Pubs/TechRpts/2006/EECS-2006-1.html",
  abstract = "Threads are a seemingly straightforward adaptation of the dominant sequential model of computation to concurrent systems. Languages require little or no syntactic changes to support threads, and operating systems and architectures have evolved to efficiently support them. Many technologists are pushing for increased use of multithreading in software in order to take advantage of the predicted increases in parallelism in computer architectures. In this paper, I argue that this is not a good idea. Although threads seem to be a small step from sequential computation, in fact, they represent a huge step. They discard the most essential and appealing properties of sequential computation: understandability, predictability, and determinism. Threads, as a model of computation, are wildly nondeterministic, and the job of the programmer becomes one of pruning that nondeterminism. Although many research techniques improve the model by offering more effective pruning, I argue that this is approaching the problem backwards.
Rather than pruning nondeterminism, we should build from essentially deterministic, composable components. Nondeterminism should be explicitly and judiciously introduced where needed, rather than removed where not needed. The consequences of this principle are profound. I argue for the development of concurrent coordination languages based on sound, composable formalisms. I believe that such languages will yield much more reliable, and more concurrent programs.",
  acknowledgement = ack-nhfb,
}

@Article{Li:2006:MEMa,
  author = "Xin Li and Marian Boldt and Reinhard von Hanxleden",
  title = "Mapping {Esterel} onto a multi-threaded embedded processor",
  journal = j-COMP-ARCH-NEWS,
  volume = "34",
  number = "5",
  pages = "303--314",
  month = dec,
  year = "2006",
  CODEN = "CANED2",
  ISSN = "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri Oct 27 06:18:30 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

@Article{Li:2006:MEMb,
  author = "Xin Li and Marian Boldt and Reinhard von Hanxleden",
  title = "Mapping {Esterel} onto a multi-threaded embedded processor",
  journal = j-OPER-SYS-REV,
  volume = "40",
  number = "5",
  pages = "303--314",
  month = dec,
  year = "2006",
  CODEN = "OSRED8",
  ISSN = "0163-5980",
  ISSN-L = "0163-5980",
  bibdate = "Fri Oct 27 06:18:30 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGOPS Operating Systems Review",
}

@Article{Li:2006:MEMc,
  author = "Xin Li and Marian Boldt and Reinhard von Hanxleden",
  title = "Mapping {Esterel} onto a multi-threaded embedded processor",
  journal = j-SIGPLAN,
  volume = "41",
  number = "11",
  pages = "303--314",
  month = nov,
  year = "2006",
  CODEN = "SINODQ",
  DOI = "http://doi.acm.org/10.1145/1168857.1168896",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160
(electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Wed Jun 18 10:49:40 MDT 2008",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "The synchronous language Esterel is well-suited for programming control-dominated reactive systems at the system level. It provides non-traditional control structures, in particular concurrency and various forms of preemption, which allow to concisely express reactive behavior. As these control structures cannot be mapped easily onto traditional, sequential processors, an alternative approach that has emerged recently makes use of special-purpose reactive processors. However, the designs proposed so far have limitations regarding completeness of the language support, and did not really take advantage of compile-time knowledge to optimize resource usage. This paper presents a reactive processor, the Kiel Esterel Processor 3a (KEP3a), and its compiler. The KEP3a improves on earlier designs in several areas; most notable are the support for exception handling and the provision of context-dependent preemption handling instructions. The KEP3a compiler presented here is to our knowledge the first for multi-threaded reactive processors. The translation of Esterel's preemption constructs onto KEP3a assembler is straightforward; however, a challenge is the correct and efficient representation of Esterel's concurrency. The compiler generates code that respects data and control dependencies using the KEP3a priority-based scheduling mechanism. We present a priority assignment approach that makes use of a novel concurrent control flow graph and has a complexity that in practice tends to be linear in the size of the program. Unlike earlier Esterel compilation schemes, this approach avoids unnecessary context switches by considering each thread's actual execution state at run time.
Furthermore, it avoids code replication present in other approaches.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "concurrency; Esterel; low-power processing; multi-threading; reactive systems",
}

@Article{Moon:2006:TMS,
  author = "Sewon Moon and Byeong-Mo Chang",
  title = "A thread monitoring system for multithreaded {Java} programs",
  journal = j-SIGPLAN,
  volume = "41",
  number = "5",
  pages = "21--29",
  month = may,
  year = "2006",
  CODEN = "SINODQ",
  DOI = "http://doi.acm.org/10.1145/1149982.1149985",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Wed Jun 18 10:42:34 MDT 2008",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "To assist developing robust multithreaded software, we develop a thread monitoring system for multithreaded Java programs, which can trace or monitor running threads and synchronization. We design a monitoring system which has options to select interesting threads and synchronized actions. Using this tool, programmers can monitor only interesting threads and synchronization in more details by selecting options, and can detect a deadlock. It also provides profile information after execution, which summarizes behavior of running threads and synchronized actions during execution.
We implement the system based on code inlining, and presents some experimental results.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "Java; monitoring; synchronization; thread",
}

@Article{Naik:2006:ESR,
  author = "Mayur Naik and Alex Aiken and John Whaley",
  title = "Effective static race detection for {Java}",
  journal = j-SIGPLAN,
  volume = "41",
  number = "6",
  pages = "308--319",
  month = jun,
  year = "2006",
  CODEN = "SINODQ",
  DOI = "http://doi.acm.org/10.1145/1133255.1134018",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Wed Jun 18 10:42:48 MDT 2008",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "We present a novel technique for static race detection in Java programs, comprised of a series of stages that employ a combination of static analyses to successively reduce the pairs of memory accesses potentially involved in a race. We have implemented our technique and applied it to a suite of multi-threaded Java programs. Our experiments show that it is precise, scalable, and useful, reporting tens to hundreds of serious and previously unknown concurrency bugs in large, widely-used programs with few false alarms.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "concurrency; Java; multi-threading; static race detection; synchronization",
}

@Article{Nanda:2006:ISM,
  author = "Mangala Gowri Nanda and S.
Ramesh",
  title = "Interprocedural slicing of multithreaded programs with applications to {Java}",
  journal = j-TOPLAS,
  volume = "28",
  number = "6",
  pages = "1088--1144",
  month = nov,
  year = "2006",
  CODEN = "ATPSDT",
  DOI = "http://doi.acm.org/10.1145/1186632.1186636",
  ISSN = "0164-0925 (print), 1558-4593 (electronic)",
  ISSN-L = "0164-0925",
  bibdate = "Sat Apr 14 11:13:21 MDT 2007",
  bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM Transactions on Programming Languages and Systems",
}

@Article{Narayanasamy:2006:RSM,
  author = "Satish Narayanasamy and Cristiano Pereira and Brad Calder",
  title = "Recording shared memory dependencies using strata",
  journal = j-SIGPLAN,
  volume = "41",
  number = "11",
  pages = "229--240",
  month = nov,
  year = "2006",
  CODEN = "SINODQ",
  DOI = "http://doi.acm.org/10.1145/1168857.1168886",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Wed Jun 18 10:49:40 MDT 2008",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "Significant time is spent by companies trying to reproduce and fix bugs. BugNet and FDR are recent architecture proposals that provide architecture support for deterministic replay debugging. They focus on continuously recording information about the program's execution, which can be communicated back to the developer. Using that information, the developer can deterministically replay the program's execution to reproduce and fix the bugs. In this paper, we propose using Strata to efficiently capture the shared memory dependencies. A stratum creates a time layer across all the logs for the running threads, which separates all the memory operations executed before and after the stratum.
A strata log allows us to determine all the shared memory dependencies during replay and thereby supports deterministic replay debugging for multi-threaded programs.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "debugging; dependencies; logging; replay; shared memory; strata",
}

@Article{Parashar:2006:SSBa,
  author = "Angshuman Parashar and Anand Sivasubramaniam and Sudhanva Gurumurthi",
  title = "{SlicK}: slice-based locality exploitation for efficient redundant multithreading",
  journal = j-COMP-ARCH-NEWS,
  volume = "34",
  number = "5",
  pages = "95--105",
  month = dec,
  year = "2006",
  CODEN = "CANED2",
  ISSN = "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L = "0163-5964",
  bibdate = "Fri Oct 27 06:18:30 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGARCH Computer Architecture News",
}

@Article{Parashar:2006:SSBb,
  author = "Angshuman Parashar and Anand Sivasubramaniam and Sudhanva Gurumurthi",
  title = "{SlicK}: slice-based locality exploitation for efficient redundant multithreading",
  journal = j-OPER-SYS-REV,
  volume = "40",
  number = "5",
  pages = "95--105",
  month = dec,
  year = "2006",
  CODEN = "OSRED8",
  ISSN = "0163-5980",
  ISSN-L = "0163-5980",
  bibdate = "Fri Oct 27 06:18:30 MDT 2006",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGOPS Operating Systems Review",
}

@Article{Parashar:2006:SSBc,
  author = "Angshuman Parashar and Anand Sivasubramaniam and Sudhanva Gurumurthi",
  title = "{SlicK}: slice-based locality exploitation for efficient redundant multithreading",
  journal = j-SIGPLAN,
  volume = "41",
  number = "11",
  pages = "95--105",
  month = nov,
  year = "2006",
  CODEN = "SINODQ",
  DOI = "http://doi.acm.org/10.1145/1168857.1168870",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Wed Jun 18
10:49:40 MDT 2008",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "Transient faults are expected to be a major design consideration in future microprocessors. Recent proposals for transient fault detection in processor cores have revolved around the idea of redundant threading, which involves redundant execution of a program across multiple execution contexts. This paper presents a new approach to redundant threading by bringing together the concepts of slice-level execution and value and control-flow locality into a novel partial redundant threading mechanism called SlicK. The purpose of redundant execution is to check the integrity of the outputs propagating out of the core (typically through stores). SlicK implements redundancy at the granularity of backward-slices of these output instructions and exploits value and control-flow locality to avoid redundantly executing slices that lead to predictable outputs, thereby avoiding redundant execution of a significant fraction of instructions while maintaining extremely low vulnerabilities for critical processor structures. We propose the microarchitecture of a backward-slice extractor called SliceEM that is able to identify backward slices without interrupting the instruction flow, and show how this extractor and a set of predictors can be integrated into a redundant threading mechanism to form SlicK. Detailed simulations with SPEC CPU2000 benchmarks show that SlicK can provide around 10.2\% performance improvement over a well known redundant threading mechanism, buying back over 50\% of the loss suffered due to redundant execution. SlicK can keep the Architectural Vulnerability Factors of processor structures to typically 0\%-2\%.
More importantly, SlicK's slice-based mechanisms provide future opportunities for exploring interesting points in the performance-reliability design space based on market segment needs.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "backward slice extraction; microarchitecture; redundant threading; transient faults",
}

@Article{Pratikakis:2006:LCS,
  author = "Polyvios Pratikakis and Jeffrey S. Foster and Michael Hicks",
  title = "{LOCKSMITH}: context-sensitive correlation analysis for race detection",
  journal = j-SIGPLAN,
  volume = "41",
  number = "6",
  pages = "320--331",
  month = jun,
  year = "2006",
  CODEN = "SINODQ",
  DOI = "http://doi.acm.org/10.1145/1133255.1134019",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Wed Jun 18 10:42:48 MDT 2008",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "One common technique for preventing data races in multi-threaded programs is to ensure that all accesses to shared locations are consistently protected by a lock. We present a tool called LOCKSMITH for detecting data races in C programs by looking for violations of this pattern. We call the relationship between locks and the locations they protect consistent correlation, and the core of our technique is a novel constraint-based analysis that infers consistent correlation context-sensitively, using the results to check that locations are properly guarded by locks. We present the core of our algorithm for a simple formal language $\lambda_{>}$ which we have proven sound, and discuss how we scale it up to an algorithm that aims to be sound for all of C. We develop several techniques to improve the precision and performance of the analysis, including a sharing analysis for inferring thread locality; existential quantification for modeling locks in data structures; and heuristics for modeling unsafe features of C such as type casts.
When applied to several benchmarks, including multi-threaded servers and Linux device drivers, LOCKSMITH found several races while producing a modest number of false alarms.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "context-sensitivity; correlation; locksmith; multi-threaded programming; race detection; type inference",
}

@Article{Reddy:2006:UPB,
  author = "Vimal K. Reddy and Eric Rotenberg and Sailashri Parthasarathy",
  title = "Understanding prediction-based partial redundant threading for low-overhead, high-coverage fault tolerance",
  journal = j-SIGPLAN,
  volume = "41",
  number = "11",
  pages = "83--94",
  month = nov,
  year = "2006",
  CODEN = "SINODQ",
  DOI = "http://doi.acm.org/10.1145/1168917.1168869",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Wed Jun 18 10:49:40 MDT 2008",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "Redundant threading architectures duplicate all instructions to detect and possibly recover from transient faults. Several lighter weight Partial Redundant Threading (PRT) architectures have been proposed recently. (i) Opportunistic Fault Tolerance duplicates instructions only during periods of poor single-thread performance. (ii) ReStore does not explicitly duplicate instructions and instead exploits mispredictions among highly confident branch predictions as symptoms of faults. (iii) Slipstream creates a reduced alternate thread by replacing many instructions with highly confident predictions. We explore PRT as a possible direction for achieving the fault tolerance of full duplication with the performance of single-thread execution. Opportunistic and ReStore yield partial coverage since they are restricted to using only partial duplication or only confident predictions, respectively.
Previous analysis of Slipstream fault tolerance was cursory and concluded that only duplicated instructions are covered. In this paper, we attempt to better understand Slipstream's fault tolerance, conjecturing that the mixture of partial duplication and confident predictions actually closely approximates the coverage of full duplication. A thorough dissection of prediction scenarios confirms that faults in nearly 100\% of instructions are detectable. Fewer than 0.1\% of faulty instructions are not detectable due to coincident faults and mispredictions. Next we show that the current recovery implementation fails to leverage excellent detection capability, since recovery sometimes initiates belatedly, after already retiring a detected faulty instruction. We propose and evaluate a suite of simple microarchitectural alterations to recovery and checking. Using the best alterations, Slipstream can recover from faults in 99\% of instructions, compared to only 78\% of instructions without alterations. Both results are much higher than predicted by past research, which claims coverage for only duplicated instructions, or 65\% of instructions. On an 8-issue SMT processor, Slipstream performs within 1.3\% of single-thread execution whereas full duplication slows performance by 14\%. A key byproduct of this paper is a novel analysis framework in which every dynamic instruction is considered to be hypothetically faulty, thus not requiring explicit fault injection. 
Fault coverage is measured in terms of the fraction of candidate faulty instructions that are directly or indirectly detectable before.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "branch prediction; chip multiprocessor (CMP); redundant multithreading; simultaneous multithreading (SMT); slipstream processor; time redundancy; transient faults; value prediction",
}

@Article{Russell:2006:ESRa,
  author = "Kenneth Russell and David Detlefs",
  title = "Eliminating synchronization-related atomic operations with biased locking and bulk rebiasing",
  journal = j-SIGPLAN,
  volume = "41",
  number = "10",
  pages = "263--272",
  month = oct,
  year = "2006",
  CODEN = "SINODQ",
  DOI = "http://doi.acm.org/10.1145/1167515.1167496",
  ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L = "0362-1340",
  bibdate = "Wed Jun 18 10:47:35 MDT 2008",
  bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract = "The Java{\TM} programming language contains built-in synchronization primitives for use in constructing multithreaded programs. Efficient implementation of these synchronization primitives is necessary in order to achieve high performance. Recent research [9, 12, 10, 3, 7] has focused on the run-time elimination of the atomic operations required to implement object monitor synchronization primitives. This paper describes a novel technique called store-free biased locking which eliminates all synchronization-related atomic operations on uncontended object monitors. The technique supports the bulk transfer of object ownership from one thread to another, and the selective disabling of the optimization where unprofitable, using epoch-based bulk rebiasing and revocation. It has been implemented in the production version of the Java HotSpot{\TM} VM and has yielded significant performance improvements on a range of benchmarks and applications.
The technique is applicable to any virtual machine-based programming language implementation with mostly block-structured locking primitives.",
  acknowledgement = ack-nhfb,
  fjournal = "ACM SIGPLAN Notices",
  keywords = "atomic; bias; Java; lock; monitor; optimization; rebias; reservation; revoke; synchronization",
}

@Article{Sen:2006:OEP,
  author = "Koushik Sen and Grigore Rosu and Gul Agha",
  title = "Online efficient predictive safety analysis of multithreaded programs",
  journal = j-INT-J-SOFTW-TOOLS-TECHNOL-TRANSFER,
  volume = "8",
  number = "3",
  pages = "248--260",
  month = jun,
  year = "2006",
  CODEN = "????",
  DOI = "http://dx.doi.org/10.1007/s10009-005-0192-y",
  ISSN = "1433-2779 (print), 1433-2787 (electronic)",
  ISSN-L = "1433-2779",
  bibdate = "Wed Jul 9 18:12:21 MDT 2008",
  bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=1433-2779&volume=8&issue=3; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=1433-2779&volume=8&issue=3&spage=248",
  acknowledgement = ack-nhfb,
  fjournal = "International Journal on Software Tools for Technology Transfer: STTT",
  keywords = "JMPaX; Multithreaded analysis; Predictive analysis; Runtime monitoring; Vector clock",
}

@Article{Shin:2006:ADT,
  author = "Chulho Shin and Seong-Won Lee and Jean-Luc Gaudiot",
  title = "Adaptive dynamic thread scheduling for simultaneous multithreaded architectures with a detector thread",
  journal = j-J-PAR-DIST-COMP,
  volume = "66",
  number = "10",
  pages = "1304--1321",
  month = oct,
  year = "2006",
  CODEN = "JPDCER",
  ISSN = "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L = "0743-7315",
  bibdate = "Fri Jul 11 20:32:35 MDT 2008",
  bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal = "Journal of Parallel and Distributed Computing",
}

@Article{Trancoso:2006:CCM,
  author = "Pedro Trancoso and Paraskevas Evripidou
and Kyriakos Stavrou and Costas Kyriacou", title = "A Case for Chip Multiprocessors Based on the Data-Driven Multithreading Model", journal = j-INT-J-PARALLEL-PROG, volume = "34", number = "3", pages = "213--235", month = jun, year = "2006", CODEN = "IJPPE5", DOI = "http://dx.doi.org/10.1007/s10766-006-0016-z", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:05:59 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=34&issue=3; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=34&issue=3&spage=213", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", keywords = "Chip multiprocessor; data-driven execution; multithreading; parallel processing", } @Article{Vasconcelos:2006:TCM, author = "Vasco T. Vasconcelos and Simon J. Gay and Ant{\'o}nio Ravara", title = "Type checking a multithreaded functional language with session types", journal = j-THEOR-COMP-SCI, volume = "368", number = "1--2", pages = "64--87", day = "5", month = dec, year = "2006", CODEN = "TCSCDI", ISSN = "0304-3975 (print), 1879-2294 (electronic)", ISSN-L = "0304-3975", bibdate = "Tue Mar 29 08:55:29 MDT 2011", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/03043975", acknowledgement = ack-nhfb, fjournal = "Theoretical Computer Science", } @Article{Xu:2006:RTR, author = "Min Xu and Mark D. 
Hill and Rastislav Bodik", title = "A regulated transitive reduction {(RTR)} for longer memory race recording", journal = j-SIGPLAN, volume = "41", number = "11", pages = "49--60", month = nov, year = "2006", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1168919.1168865", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:49:40 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreaded deterministic replay has important applications in cyclic debugging, fault tolerance and intrusion analysis. Memory race recording is a key technology for multithreaded deterministic replay. In this paper, we considerably improve our previous always-on Flight Data Recorder (FDR) in four ways:\par \begin{itemize} \item Longer recording by reducing the log size growth rate to approximately one byte per thousand dynamic instructions. \item Lower hardware cost by reducing the cost to 24 KB per processor core. \item Simpler design by modifying only the cache coherence protocol, but not the cache. \item Broader applicability by supporting both Sequential Consistency (SC) and Total Store Order (TSO) memory consistency models (existing recorders support only SC). 
\end{itemize} These improvements stem from several ideas: (1) a Regulated Transitive Reduction (RTR) recording algorithm that creates stricter and vectorizable dependencies to reduce the log growth rate; (2) a Set/LRU timestamp approximation method that better approximates timestamps of uncached memory locations to reduce the hardware cost; (3) an order-value-hybrid recording method that explicitly logs the value of potential SC-violating load instructions to support multiprocessor systems with TSO.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "determinism; multithreading; race recording", } @Article{Ziarek:2006:SMC, author = "Lukasz Ziarek and Philip Schatz and Suresh Jagannathan", title = "Stabilizers: a modular checkpointing abstraction for concurrent functional programs", journal = j-SIGPLAN, volume = "41", number = "9", pages = "136--147", month = sep, year = "2006", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1160074.1159822", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:46:22 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Transient faults that arise in large-scale software systems can often be repaired by re-executing the code in which they occur. Ascribing a meaningful semantics for safe re-execution in multi-threaded code is not obvious, however. For a thread to correctly re-execute a region of code, it must ensure that all other threads which have witnessed its unwanted effects within that region are also reverted to a meaningful earlier state. If not done properly, data inconsistencies and other undesirable behavior may result. However, automatically determining what constitutes a consistent global checkpoint is not straightforward since thread interactions are a dynamic property of the program. 
In this paper, we present a safe and efficient checkpointing mechanism for Concurrent ML (CML) that can be used to recover from transient faults. We introduce a new linguistic abstraction called stabilizers that permits the specification of per-thread monitors and the restoration of globally consistent checkpoints. Safe global states are computed through lightweight monitoring of communication events among threads (e.g. message-passing operations or updates to shared variables). Our experimental results on several realistic, multithreaded, server-style CML applications, including a web server and a windowing toolkit, show that the overheads to use stabilizers are small, and lead us to conclude that they are a viable mechanism for defining safe checkpoints in concurrent functional programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "checkpointing; concurrent ML; concurrent programming; error recovery; exception handling; transactions", } @Article{Benner:2007:SLS, author = "Peter Benner and Maribel Castillo and Rafael Mayo and Enrique S. Quintana-Ort{\'\i} and Gregorio Quintana-Ort{\'\i}", title = "Stabilizing large-scale generalized systems on parallel computers using multithreading and message-passing", journal = j-CCPE, volume = "19", number = "4", pages = "531--542", day = "25", month = mar, year = "2007", CODEN = "CCPEBO", DOI = "http://dx.doi.org/10.1002/cpe.1148", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:11 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/ccpe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", onlinedate = "12 Dec 2006", } @Article{Bergstra:2007:SCE, author = "J. A. Bergstra and C. A. 
Middelburg", title = "Synchronous cooperation for explicit multi-threading", journal = j-ACTA-INFO, volume = "44", number = "7--8", pages = "525--569", month = dec, year = "2007", CODEN = "AINFA2", DOI = "http://dx.doi.org/10.1007/s00236-007-0057-9", ISSN = "0001-5903 (print), 1432-0525 (electronic)", ISSN-L = "0001-5903", bibdate = "Wed Jul 9 21:28:19 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0001-5903&volume=44&issue=7; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0001-5903&volume=44&issue=7&spage=525", acknowledgement = ack-nhfb, fjournal = "Acta Informatica", } @Article{Blundell:2007:MFC, author = "Colin Blundell and Joe Devietti and E. Christopher Lewis and Milo M. K. Martin", title = "Making the fast case common and the uncommon case simple in unbounded transactional memory", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "24--34", month = may, year = "2007", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1273440.1250667", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Hardware transactional memory has great potential to simplify the creation of correct and efficient multithreaded programs, allowing programmers to exploit more effectively the soon-to-be-ubiquitous multi-core designs. Several recent proposals have extended the original bounded transactional memory to unbounded transactional memory, a crucial step toward transactions becoming a general-purpose primitive. Unfortunately, supporting the concurrent execution of an unbounded number of unbounded transactions is challenging, and as a result, many proposed implementations are complex.\par This paper explores a different approach. 
First, we introduce the permissions-only cache to extend the bound at which transactions overflow to allow the fast, bounded case to be used as frequently as possible. Second, we propose OneTM to simplify the implementation of unbounded transactional memory by bounding the concurrency of transactions that overflow the cache. These mechanisms work synergistically to provide a simple and fast unbounded transactional memory system.\par The permissions-only cache efficiently maintains the coherence permissions --- but not data --- for blocks read or written transactionally that have been evicted from the processor's caches. By holding coherence permissions for these blocks, the regular cache coherence protocol can be used to detect transactional conflicts using only a few bits of on-chip storage per overflowed cache block. OneTM allows only one overflowed transaction at a time, relying on the permissions-only cache to ensure that overflow is infrequent. We present two implementations. In OneTM-Serialized, an overflowed transaction simply stalls all other threads in the application.\par In OneTM-Concurrent, non-overflowed transactions and non-transactional code can execute concurrently with the overflowed transaction, providing more concurrency while retaining OneTM's core simplifying assumption.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "concurrency; parallel programming; transactional memory; transactions", } @Article{Burckhardt:2007:CCC, author = "Sebastian Burckhardt and Rajeev Alur and Milo M. K. 
Martin", title = "{CheckFence}: checking consistency of concurrent data types on relaxed memory models", journal = j-SIGPLAN, volume = "42", number = "6", pages = "12--21", month = jun, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1250734.1250737", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Concurrency libraries can facilitate the development of multi-threaded programs by providing concurrent implementations of familiar data types such as queues or sets. There exist many optimized algorithms that can achieve superior performance on multiprocessors by allowing concurrent data accesses without using locks. Unfortunately, such algorithms can harbor subtle concurrency bugs. Moreover, they require memory ordering fences to function correctly on relaxed memory models.\par To address these difficulties, we propose a verification approach that can exhaustively check all concurrent executions of a given test program on a relaxed memory model and can verify that they are observationally equivalent to a sequential execution. Our CheckFence prototype automatically translates the C implementation code and the test program into a SAT formula, hands the latter to a standard SAT solver, and constructs counter example traces if there exist incorrect executions. Applying CheckFence to five previously published algorithms, we were able to (1) find several bugs (some not previously known), and (2) determine how to place memory ordering fences for relaxed memory models.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "concurrent data structures; lock-free synchronization; memory models; multi-threading; sequential consistency; shared-memory multiprocessors; software model checking", } @Article{Das:2007:FVT, author = "Dipankar Das and P. P. 
Chakrabarti and Rajeev Kumar", title = "Functional verification of task partitioning for multiprocessor embedded systems", journal = j-TODAES, volume = "12", number = "4", pages = "44:1--44:??", month = sep, year = "2007", CODEN = "ATASFO", DOI = "http://doi.acm.org/10.1145/1278349.1278357", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Thu Jun 12 18:09:35 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the advent of multiprocessor embedded platforms, application partitioning and mapping have gained primacy as a design step. The output of this design step is a multithreaded partitioned application where each thread is mapped to a processing element (processor or ASIC) in the multiprocessor platform. This partitioned application must be verified to be consistent with the native unpartitioned application. This verification task is called application (or task) partitioning verification. \par This work proposes a code-block-level containment-checking-based methodology for application partitioning verification. We use a UML-based code-block-level modeling language which is rich enough to model most designs. We formulate the application partitioning verification problem as a special case of the containment checking problem, which we call the complete containment checking problem. We propose a state space reduction technique specific to the containment checking, reachability analysis, and deadlock detection problems. We propose novel data structures and token propagation methodologies which enhance the efficiency of containment checking. We present an efficient containment checking algorithm for the application partitioning verification problem. We develop a containment checking tool called TraceMatch and present experimental results. 
We present a comparison of the state space reduction achieved by TraceMatch with that achieved by formal analysis and verification tools like Spin, PEP, PROD, and LoLA.", acknowledgement = ack-nhfb, articleno = "44", fjournal = "ACM Transactions on Design Automation of Electronic Systems (TODAES)", keywords = "Containment checking; multiprocessor embedded systems; state space reduction; UML activity diagrams", } @Article{Dou:2007:CCM, author = "Jialin Dou and Marcelo Cintra", title = "A compiler cost model for speculative parallelization", journal = j-TACO, volume = "4", number = "2", pages = "12:1--12:??", month = jun, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1250727.1250732", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Speculative parallelization is a technique that allows code sections that cannot be fully analyzed by the compiler to be aggressively executed in parallel. However, while speculative parallelization can potentially deliver significant speedups, several overheads associated with this technique can limit these speedups in practice. This paper proposes a novel compiler static cost model of speculative multithreaded execution that can be used to predict the resulting performance. This model attempts to predict the expected speedups, or slowdowns, of the candidate speculative sections based on the estimation of the combined runtime effects of various overheads, and taking into account the scheduling restrictions of most speculative execution environments. The model is based on estimating the likely execution duration of threads and considers all the possible permutations of these threads. 
This model also produces a quantitative estimate of the speedup, which is different from prior heuristics that only qualitatively estimate the benefits of speculative multithreaded execution. In previous work, a limited version of the framework was evaluated on a number of loops from a collection of SPEC benchmarks that suffer mainly from load imbalance and thread dispatch and commit overheads. In this work, an extended framework is also evaluated on loops that may suffer from data-dependence violations. Experimental results show that prediction accuracy is lower when loops with violations are included. Nevertheless, accuracy is still very high for a static model: the framework can identify, on average, 45\% of the loops that cause slowdowns and, on average, 96\% of the loops that lead to speedups; it predicts the speedups or slowdowns with an error of less than 20\% for an average of 28\% of the loops across the benchmarks and with an error of less than 50\% for an average of 80\% of the loops. 
Overall, the framework often outperforms, by as much as 25\%, a naive approach that attempts to speculatively parallelize all the loops considered, and is able to curb the large slowdowns caused in many cases by this naive approach.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", keywords = "speculative multithreading; speculative parallelization; thread-level speculation", } @Article{Elmas:2007:GRT, author = "Tayfun Elmas and Shaz Qadeer and Serdar Tasiran", title = "{Goldilocks}: a race and transaction-aware {Java} runtime", journal = j-SIGPLAN, volume = "42", number = "6", pages = "245--255", month = jun, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1273442.1250762", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Data races often result in unexpected and erroneous behavior. In addition to causing data corruption and leading programs to crash, the presence of data races complicates the semantics of an execution which might no longer be sequentially consistent. Motivated by these observations, we have designed and implemented a Java runtime system that monitors program executions and throws a DataRaceException when a data race is about to occur. Analogous to other runtime exceptions, the DataRaceException provides two key benefits. First, accesses causing race conditions are interrupted and handled before they cause errors that may be difficult to diagnose later. Second, if no DataRaceException is thrown in an execution, it is guaranteed to be sequentially consistent. This strong guarantee helps to rule out many concurrency-related possibilities as the cause of erroneous behavior. 
When a DataRaceException is caught, the operation, thread, or program causing it can be terminated gracefully. Alternatively, the DataRaceException can serve as a conflict-detection mechanism in optimistic uses of concurrency.\par We start with the definition of data-race-free executions in the Java memory model. We generalize this definition to executions that use transactions in addition to locks and volatile variables for synchronization. We present a precise and efficient algorithm for dynamically verifying that an execution is free of data races. This algorithm generalizes the Goldilocks algorithm for data-race detection by handling transactions and providing the ability to distinguish between read and write accesses. We have implemented our algorithm and the DataRaceException in the Kaffe Java Virtual Machine. We have evaluated our system on a variety of publicly available Java benchmarks and a few microbenchmarks that combine lock-based and transaction-based synchronization. Our experiments indicate that our implementation has reasonable overhead. Therefore, we believe that in addition to being a debugging tool, the DataRaceException may be a viable mechanism to enforce the safety of executions of multithreaded Java programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "data-race detection; Java runtime; runtime monitoring; software transactions", } @Article{Emmi:2007:LA, author = "Michael Emmi and Jeffrey S. 
Fischer and Ranjit Jhala and Rupak Majumdar", title = "Lock allocation", journal = j-SIGPLAN, volume = "42", number = "1", pages = "291--296", month = jan, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1190216.1190260", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:53:14 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We introduce lock allocation, an automatic technique that takes a multi-threaded program annotated with atomic sections (that must be executed atomically), and infers a lock assignment from global variables to locks and a lock instrumentation that determines where each lock should be acquired and released such that the resulting instrumented program is guaranteed to preserve atomicity and deadlock freedom (provided all shared state is accessed only within atomic sections). Our algorithm works in the presence of pointers and procedures, and sets up the lock allocation problem as a 0-1 ILP which minimizes the conflict cost between atomic sections while simultaneously minimizing the number of locks. We have implemented our algorithm for both C with pthreads and Java, and have applied it to infer locks in 15K lines of AOLserver code. Our automatic allocation produces the same results as hand annotations for most of this code, while solving the optimization instances within a second for most programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "atomicity; ILP; lock inference", } @Article{Eytani:2007:TFB, author = "Yaniv Eytani and Klaus Havelund and Scott D. 
Stoller and Shmuel Ur", title = "Towards a framework and a benchmark for testing tools for multi-threaded programs", journal = j-CCPE, volume = "19", number = "3", pages = "267--279", day = "10", month = mar, year = "2007", CODEN = "CCPEBO", DOI = "http://dx.doi.org/10.1002/cpe.1068", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:10 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/ccpe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", onlinedate = "1 Aug 2006", } @Article{Gabor:2007:FES, author = "Ron Gabor and Shlomo Weiss and Avi Mendelson", title = "Fairness enforcement in switch on event multithreading", journal = j-TACO, volume = "4", number = "3", pages = "15:1--15:??", month = sep, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1275937.1275939", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The need to reduce power and complexity will increase the interest in Switch On Event multithreading (coarse-grained multithreading). Switch On Event multithreading is a low-power and low-complexity mechanism to improve processor throughput by switching threads on execution stalls. Fairness may, however, become a problem in a multithreaded processor. Unless fairness is properly handled, some threads may starve while others consume all of the processor cycles. Heuristics that were devised in order to improve fairness in simultaneous multithreading are not applicable to Switch On Event multithreading. This paper defines the fairness metric using the ratio of the individual threads' speedups and shows how it can be enforced in Switch On Event multithreading. 
Fairness is controlled by forcing additional thread switch points. These switch points are determined dynamically by runtime estimation of the single threaded performance of each of the individual threads. We analyze the impact of the fairness enforcement mechanism on aggregate IPC and weighted speedup. We present simulation results of the performance of Switch On Event multithreading. Switch On Event multithreading achieves an average aggregate IPC increase of 26\% over single thread and 12\% weighted speedup when no fairness is enforced. In this case, a sixth of our runs resulted in poor fairness in which one thread ran extremely slowly (10 to 100 times slower than its single-thread performance), while the other thread's performance was hardly affected. By using the proposed mechanism, we can guarantee fairness at different levels of strictness and, in most cases, even improve the weighted speedup.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", keywords = "coarse-grained multithreading; fairness; multithreading; performance; SOE; Switch on Event multithreading; throughput; weighted speedup", } @Article{Ghoting:2007:CCF, author = "Amol Ghoting and Gregory Buehrer and Srinivasan Parthasarathy and Daehyun Kim and Anthony Nguyen and Yen-Kuang Chen and Pradeep Dubey", title = "Cache-conscious frequent pattern mining on modern and emerging processors", journal = j-VLDB-J, volume = "16", number = "1", pages = "77--96", month = jan, year = "2007", CODEN = "VLDBFR", ISSN = "1066-8888 (print), 0949-877X (electronic)", ISSN-L = "1066-8888", bibdate = "Mon Jun 23 10:51:22 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Algorithms are typically designed to exploit the current state of the art in processor technology. 
However, as processor technology evolves, said algorithms are often unable to derive the maximum achievable performance on these modern architectures. In this paper, we examine the performance of frequent pattern mining algorithms on a modern processor. A detailed performance study reveals that even the best frequent pattern mining implementations, with highly efficient memory managers, still grossly under-utilize a modern processor. The primary performance bottlenecks are {\em poor data locality\/} and {\em low instruction level parallelism (ILP)}. We propose a {\em cache-conscious prefix tree\/} to address this problem. The resulting tree improves spatial locality and also enhances the benefits from hardware cache line prefetching. Furthermore, the design of this data structure allows the use of {\em path tiling}, a novel tiling strategy, to improve temporal locality. The result is an overall speedup of up to 3.2 when compared with state of the art implementations. We then show how these algorithms can be improved further by realizing a non-naive thread-based decomposition that targets {\em simultaneously multi-threaded processors (SMT)}. A key aspect of this decomposition is to ensure cache re-use between threads that are co-scheduled at a fine granularity. This optimization affords an additional speedup of 50\%, resulting in an overall speedup of up to 4.8. 
The proposed optimizations also provide performance improvements on SMPs, and will most likely be beneficial on emerging processors.", acknowledgement = ack-nhfb, fjournal = "VLDB Journal: Very Large Data Bases", keywords = "architecture-conscious algorithms; association rule mining; cache-conscious data mining; frequent itemset mining; frequent pattern mining", } @Article{Gotsman:2007:TMS, author = "Alexey Gotsman and Josh Berdine and Byron Cook and Mooly Sagiv", title = "Thread-modular shape analysis", journal = j-SIGPLAN, volume = "42", number = "6", pages = "266--277", month = jun, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1273442.1250765", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present the first shape analysis for multithreaded programs that avoids the explicit enumeration of execution-interleavings. Our approach is to automatically infer a resource invariant associated with each lock that describes the part of the heap protected by the lock. This allows us to use a sequential shape analysis on each thread. We show that resource invariants of a certain class can be characterized as least fixed points and computed via repeated applications of shape analysis only on each individual thread. Based on this approach, we have implemented a thread-modular shape analysis tool and applied it to concurrent heap-manipulating code from Windows device drivers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "abstract interpretation; concurrent programming; shape analysis; static analysis", } @Article{Gravvanis:2007:PPA, author = "George A. Gravvanis and Victor N. Epitropou and Konstantinos M. 
Giannoutakis", title = "On the performance of parallel approximate inverse preconditioning using {Java} multithreading techniques", journal = j-APPL-MATH-COMP, volume = "190", number = "1", pages = "255--270", day = "1", month = jul, year = "2007", CODEN = "AMHCBQ", ISSN = "0096-3003 (print), 1873-5649 (electronic)", ISSN-L = "0096-3003", bibdate = "Sat Jul 12 09:03:06 MDT 2008", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/00963003", acknowledgement = ack-nhfb, fjournal = "Applied Mathematics and Computation", } @Article{Hur:2007:MSM, author = "Ibrahim Hur and Calvin Lin", title = "Memory scheduling for modern microprocessors", journal = j-TOCS, volume = "25", number = "4", pages = "10:1--10:??", month = dec, year = "2007", CODEN = "ACSYEC", DOI = "http://doi.acm.org/10.1145/1314299.1314301", ISSN = "0734-2071", ISSN-L = "0734-2071", bibdate = "Mon Jun 16 17:52:15 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The need to carefully schedule memory operations has increased as memory performance has become increasingly important to overall system performance. This article describes the adaptive history-based (AHB) scheduler, which uses the history of recently scheduled operations to provide three conceptual benefits: (1) it allows the scheduler to better reason about the delays associated with its scheduling decisions, (2) it provides a mechanism for combining multiple constraints, which is important for increasingly complex DRAM structures, and (3) it allows the scheduler to select operations so that they match the program's mixture of Reads and Writes, thereby avoiding certain bottlenecks within the memory controller.\par We have previously evaluated this scheduler in the context of the IBM Power5. 
When compared with the state of the art, this scheduler improves performance by 15.6\%, 9.9\%, and 7.6\% for the Stream, NAS, and commercial benchmarks, respectively. This article expands our understanding of the AHB scheduler in a variety of ways. Looking backwards, we describe the scheduler in the context of prior work that focused exclusively on avoiding bank conflicts, and we show that the AHB scheduler is superior for the IBM Power5, which we argue will be representative of future microprocessor memory controllers. Looking forwards, we evaluate this scheduler in the context of future systems by varying a number of microarchitectural features and hardware parameters. For example, we show that the benefit of this scheduler increases as we move to multithreaded environments.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "ACM Transactions on Computer Systems", keywords = "adaptive history-based scheduling; memory scheduling; memory system performance", } @InBook{Kollias:2007:APC, author = "Giorgos Kollias and Efstratios Gallopoulos", title = "Asynchronous {PageRank} computation in an interactive multithreading environment", volume = "07071", publisher = "Internationales Begegnungs- und Forschungszentrum f{\"u}r Informatik", address = "Wadern, Germany", pages = "????", year = "2007", ISBN = "????", ISBN-13 = "????", bibdate = "Fri Feb 19 15:32:30 2010", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/pagerank.bib", series = "Dagstuhl seminar proceedings", URL = "http://drops.dagstuhl.de/opus/volltexte/2007/1065/pdf/07071.KolliasGiorgios.Paper.1065", acknowledgement = ack-nhfb, } @Article{Kumar:2007:ESI, author = "Nagendra J. Kumar and Vasanth Asokan and Siddhartha Shivshankar and Alexander G. 
Dean", title = "Efficient software implementation of embedded communication protocol controllers using asynchronous software thread integration with time- and space-efficient procedure calls", journal = j-TECS, volume = "6", number = "1", pages = "2:1--2:??", month = feb, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1210268.1210270", ISSN = "1539-9087", ISSN-L = "1539-9087", bibdate = "Thu Jun 12 15:20:58 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The overhead of context switching limits efficient scheduling of multiple concurrent threads on a uniprocessor when real-time requirements exist. A software-implemented protocol controller may be crippled by this problem. The available idle time may be too short to recover through context switching, so only the primary thread can execute during message activity, slowing the secondary threads and potentially missing deadlines. Asynchronous software thread integration (ASTI) uses coroutine calls and integration, letting threads make independent progress efficiently, and reducing the needed context switches. 
We demonstrate the methods with a software implementation of an automotive communication protocol (J1850) and several secondary threads.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "ACM Transactions on Embedded Computing Systems", keywords = "asynchronous software thread integration; fine-grain concurrency; hardware to software migration; J1850; software-implemented communication protocol controllers", } @Article{Laudon:2007:CWM, author = "James Laudon and Lawrence Spracklen", title = "The Coming Wave of Multithreaded Chip Multiprocessors", journal = j-INT-J-PARALLEL-PROG, volume = "35", number = "3", pages = "299--330", month = jun, year = "2007", CODEN = "IJPPE5", DOI = "http://dx.doi.org/10.1007/s10766-007-0033-6", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:06:21 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=35&issue=3; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=35&issue=3&spage=299", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", keywords = "Chip multiprocessing; multithreading; parallel programming; performance", } @Article{Le:2007:IPM, author = "H. Q. Le and W. J. Starke and J. S. Fields and F. P. O'Connell and D. Q. Nguyen and B. J. Ronchetti and W. M. Sauer and E. M. Schwarz and M. T. 
Vaden", title = "{IBM POWER6} microarchitecture", journal = j-IBM-JRD, volume = "51", number = "6", pages = "639--??", month = nov, year = "2007", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Mon Jul 7 21:49:07 MDT 2008", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/rd/516/le.html", abstract = "This paper describes the implementation of the IBM POWER6 microprocessor, a two-way simultaneous multithreaded (SMT) dual-core chip whose key features include binary compatibility with IBM POWER5 microprocessor-based systems; increased functional capabilities, such as decimal floating-point and vector multimedia extensions; significant reliability, availability, and serviceability enhancements; and robust scalability with up to 64 physical processors. Based on a new industry-leading high-frequency core architecture with enhanced SMT and driven by a high-throughput symmetric multiprocessing (SMP) cache and memory subsystem, the POWER6 chip achieves a significant performance boost compared with its predecessor, the POWER5 chip. 
Key extensions to the coherence protocol enable POWER6 microprocessor-based systems to achieve better SMP scalability while enabling reductions in system packaging complexity and cost.", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", } @Article{Li:2007:CET, author = "Peng Li and Steve Zdancewic", title = "Combining events and threads for scalable network services implementation and evaluation of monadic, application-level concurrency primitives", journal = j-SIGPLAN, volume = "42", number = "6", pages = "189--199", month = jun, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1273442.1250756", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper proposes to combine two seemingly opposed programming models for building massively concurrent network services: the event-driven model and the multithreaded model. The result is a hybrid design that offers the best of both worlds--the ease of use and expressiveness of threads and the flexibility and performance of events.\par This paper shows how the hybrid model can be implemented entirely at the application level using concurrency monads in Haskell, which provides type-safe abstractions for both events and threads. This approach simplifies the development of massively concurrent software in a way that scales to real-world network services. The Haskell implementation supports exceptions, symmetrical multiprocessing, software transactional memory, asynchronous I/O mechanisms and application-level network protocol stacks. Experimental results demonstrate that this monad-based approach has good performance: the threads are extremely lightweight (scaling to ten million threads), and the I/O performance compares favorably to that of Linux NPTL. 
tens of thousands of simultaneous, mostly-idle client connections. Such massively-concurrent programs are difficult to implement, especially when other requirements, such as high performance and strong security, must also be met.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "concurrency; event; Haskell; implementation; monad; networking; programming; scalability; thread", } @Article{Mahesri:2007:HSS, author = "Aqeel Mahesri and Nicholas J. Wang and Sanjay J. Patel", title = "Hardware support for software controlled multithreading", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "3--12", month = mar, year = "2007", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1241601.1241606", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Chip multi-processors have emerged as one of the most effective uses of the huge number of transistors available today and in the future, but questions remain as to the best way to leverage CMPs to accelerate single threaded applications. Previous approaches rely on significant speculation to accomplish this goal. Our proposal, NXA, is less speculative than previous proposals, relying heavily on software to guarantee thread correctness, though still allowing parallelism in the presence of ambiguous dependences. It divides a single thread of execution into multiple using the master-worker paradigm where some set of master threads execute code that spawns tasks for other, worker threads. The master threads generally consist of performance critical instructions that can prefetch data, compute critical control decisions, or compute performance critical dataflow slices. 
This prevents non-critical instructions from competing with critical instructions for processor resources, allowing the critical thread (and thus the workload) to complete faster. Empirical results from performance simulation show a 20\% improvement in performance on a 2-way CMP machine, demonstrating that software controlled multithreading can indeed provide a benefit in the presence of hardware support.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", remark = "{DASCMP'06}", } @Article{Marowka:2007:PCD, author = "Ami Marowka", title = "Parallel computing on any desktop", journal = j-CACM, volume = "50", number = "9", pages = "74--78", month = sep, year = "2007", CODEN = "CACMA2", DOI = "http://doi.acm.org/10.1145/1284621.1284622", ISSN = "0001-0782 (print), 1557-7317 (electronic)", ISSN-L = "0001-0782", bibdate = "Mon Jun 16 18:32:57 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Parallelization lets applications exploit the high throughput of new multicore processors, and the OpenMP parallel programming model helps developers create multithreaded applications.", acknowledgement = ack-nhfb, fjournal = "Communications of the ACM", } @Article{Minh:2007:EHT, author = "Chi Cao Minh and Martin Trautmann and JaeWoong Chung and Austen McDonald and Nathan Bronson and Jared Casper and Christos Kozyrakis and Kunle Olukotun", title = "An effective hybrid transactional memory system with strong isolation guarantees", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "69--80", month = may, year = "2007", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1250662.1250673", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We propose signature-accelerated transactional memory (SigTM), a 
hybrid TM system that reduces the overhead of software transactions. SigTM uses hardware signatures to track the read-set and write-set for pending transactions and perform conflict detection between concurrent threads. All other transactional functionality, including data versioning, is implemented in software. Unlike previously proposed hybrid TM systems, SigTM requires no modifications to the hardware caches, which reduces hardware cost and simplifies support for nested transactions and multithreaded processor cores. SigTM is also the first hybrid TM system to provide strong isolation guarantees between transactional blocks and non-transactional accesses without additional read and write barriers in non-transactional code.\par Using a set of parallel programs that make frequent use of coarse-grain transactions, we show that SigTM accelerates software transactions by 30\% to 280\%. For certain workloads, SigTM can match the performance of a full-featured hardware TM system, while for workloads with large read-sets it can be up to two times slower. 
Overall, we show that SigTM combines the performance characteristics and strong isolation guarantees of hardware TM implementations with the low cost and flexibility of software TM systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "multi-core architectures; parallel programming; strong isolation; transactional memory", } @Article{Morandini:2007:UDS, author = "Marco Morandini and Paolo Mantegazza", title = "Using dense storage to solve small sparse linear systems", journal = j-TOMS, volume = "33", number = "1", pages = "5:1--5:??", month = mar, year = "2007", CODEN = "ACMSCU", DOI = "http://doi.acm.org/10.1145/1206040.1206045", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Sat Apr 14 09:48:58 MDT 2007", bibsource = "http://www.acm.org/pubs/contents/journals/toms/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A data structure is used to build a linear solver specialized for relatively small sparse systems. The proposed solver, optimized for run-time performance at the expense of memory footprint, outperforms widely used direct and sparse solvers for systems with between 100 and 3000 equations. 
A multithreaded version of the solver is shown to give some speedups for problems with medium fill-in, while it does not give any benefit for very sparse problems.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "ACM Transactions on Mathematical Software", } @Article{Musuvathi:2007:ICB, author = "Madanlal Musuvathi and Shaz Qadeer", title = "Iterative context bounding for systematic testing of multithreaded programs", journal = j-SIGPLAN, volume = "42", number = "6", pages = "446--455", month = jun, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1273442.1250785", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multithreaded programs are difficult to get right because of unexpected interaction between concurrently executing threads. Traditional testing methods are inadequate for catching subtle concurrency errors which manifest themselves late in the development cycle and post-deployment. Model checking or systematic exploration of program behavior is a promising alternative to traditional testing methods. However, it is difficult to perform systematic search on large programs as the number of possible program behaviors grows exponentially with the program size. Confronted with this state-explosion problem, traditional model checkers perform iterative depth-bounded search. Although effective for message-passing software, iterative depth-bounding is inadequate for multithreaded software.\par This paper proposes iterative context-bounding, a new search algorithm that systematically explores the executions of a multithreaded program in an order that prioritizes executions with fewer context switches. 
We distinguish between preempting and nonpreempting context switches, and show that bounding the number of preempting context switches to a small number significantly alleviates the state explosion, without limiting the depth of explored executions. We show both theoretically and empirically that context-bounded search is an effective method for exploring the behaviors of multithreaded programs. We have implemented our algorithm in two model checkers and applied it to a number of real-world multithreaded programs. Our implementation uncovered 9 previously unknown bugs in our benchmarks, each of which was exposed by an execution with at most 2 preempting context switches. Our initial experience with the technique is encouraging and demonstrates that iterative context-bounding is a significant improvement over existing techniques for testing multithreaded programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "concurrency; context-bounding; model checking; multithreading; partial-order reduction; shared-memory programs; software testing", } @Article{Naik:2007:CMA, author = "Mayur Naik and Alex Aiken", title = "Conditional must not aliasing for static race detection", journal = j-SIGPLAN, volume = "42", number = "1", pages = "327--338", month = jan, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1190216.1190265", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:53:14 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Race detection algorithms for multi-threaded programs using the common lock-based synchronization idiom must correlate locks with the memory locations they guard. The heart of a proof of race freedom is showing that if two locks are distinct, then the memory locations they guard are also distinct. 
This is an example of a general property we call conditional must not aliasing: Under the assumption that two objects are not aliased, prove that two other objects are not aliased. This paper introduces and gives an algorithm for conditional must not alias analysis and discusses experimental results for sound race detection of Java programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "concurrency; Java; multi-threading; static race detection; synchronization", } @Article{Narayanasamy:2007:ACB, author = "Satish Narayanasamy and Zhenghao Wang and Jordan Tigani and Andrew Edwards and Brad Calder", title = "Automatically classifying benign and harmful data races using replay analysis", journal = j-SIGPLAN, volume = "42", number = "6", pages = "22--31", month = jun, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1250734.1250738", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Many concurrency bugs in multi-threaded programs are due to data races. There have been many efforts to develop static and dynamic mechanisms to automatically find the data races. Most of the prior work has focused on finding the data races and eliminating the false positives.\par In this paper, we instead focus on a dynamic analysis technique to automatically classify the data races into two categories --- the data races that are potentially benign and the data races that are potentially harmful. A harmful data race is a real bug that needs to be fixed. This classification is needed to focus the triaging effort on those data races that are potentially harmful. Without prioritizing the data races we have found that there are too many data races to triage. 
Our second focus is to automatically provide to the developer a reproducible scenario of the data race, which allows the developer to understand the different effects of a harmful data race on a program's execution.\par To achieve the above, we record a multi-threaded program's execution in a replay log. The replay log is used to replay the multi-threaded program, and during replay we find the data races using a happens-before based algorithm. To automatically classify if a data race that we find is potentially benign or potentially harmful, we replay the execution twice for a given data race --- one for each possible order between the conflicting memory operations. If the two replays for the two orders produce the same result, then we classify the data race to be potentially benign. We discuss our experiences in using our replay based dynamic data race checker on several Microsoft applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "benign data races; concurrency bugs; replay", } @Article{Ostler:2007:IHT, author = "Chris Ostler and Karam S. Chatha and Vijay Ramamurthi and Krishnan Srinivasan", title = "{ILP} and heuristic techniques for system-level design on network processor architectures", journal = j-TODAES, volume = "12", number = "4", pages = "48:1--48:??", month = sep, year = "2007", CODEN = "ATASFO", DOI = "http://doi.acm.org/10.1145/1278349.1278361", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Thu Jun 12 18:09:35 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Network processors incorporate several architectural features, including symmetric multiprocessing (SMP), block multithreading, and multiple memory elements, to support the high-performance requirements of current day applications. 
This article presents automated system-level design techniques for application development on such architectures. We propose integer linear programming formulations and heuristic techniques for process allocation and data mapping on SMP and block-multithreading-based network processors. The techniques incorporate process transformations and multithreading-aware data mapping to maximize the throughput of the application. The article presents experimental results that evaluate the techniques by implementing network processing applications on the Intel IXP 2400 architecture.", acknowledgement = ack-nhfb, articleno = "48", fjournal = "ACM Transactions on Design Automation of Electronic Systems (TODAES)", keywords = "block multithreading; multiprocessor", } @Article{Park:2007:MEP, author = "Soyeon Park and Weihang Jiang and Yuanyuan Zhou and Sarita Adve", title = "Managing energy-performance tradeoffs for multithreaded applications on multiprocessor architectures", journal = j-SIGMETRICS, volume = "35", number = "1", pages = "169--180", month = jun, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1254882.1254902", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Fri Jun 27 09:42:48 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In modern computers, non-performance metrics such as energy consumption have become increasingly important, requiring tradeoff with performance. A recent work has proposed performance-guaranteed energy management, but it is designed specifically for sequential applications and cannot be used to a large class of multithreaded applications running on high end computers and data servers.\par To address the above problem, this paper makes the first attempt to provide performance-guaranteed energy management for multithreaded applications on multiprocessor architectures. 
We first conduct a comprehensive study on the effects of energy adaptation on thread synchronizations and show that a multithreaded application suffers from not only local slowdowns due to energy adaptation, but also significant slowdowns propagated from other threads because of synchronization. Based on these findings, we design three Synchronization-Aware (SA) algorithms, LWT (Lock Waiting Time-based), CSL (Critical Section Length-based) and ODP (Operation Delay Propagation-based) algorithms, to estimate the energy adaptation-induced slowdowns on each thread. The local slowdowns are then combined across multiple threads via three aggregation methods (MAX, AVG and SUM) to estimate the overall application slowdown.\par We evaluate our methods using a large multithreaded commercial application, IBM DB2 with industrial-strength online transaction processing (OLTP) workloads, and six SPLASH parallel scientific applications. Our experimental results show that LWT combined with the MAX aggregation method not only controls the performance slow down within the specified limits but also conserves the most energy.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", keywords = "energy and performance tradeoffs; low power design; memory energy management; multithreaded applications", } @Article{Permandla:2007:TSP, author = "Pratibha Permandla and Michael Roberson and Chandrasekhar Boyapati", title = "A type system for preventing data races and deadlocks in the {Java Virtual Machine} language: 1", journal = j-SIGPLAN, volume = "42", number = "7", pages = "10--10", month = jul, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1254766.1254768", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:57:50 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In previous work on SafeJava we presented a 
type system extension to the Java source language that statically prevents data races and deadlocks in multithreaded programs. SafeJava is expressive enough to support common programming patterns, its type checking is fast and scalable, and it requires little programming overhead. SafeJava thus offers a promising approach for making multithreaded programs more reliable. This paper presents a corresponding type system extension for the Java virtual machine language (JVML). We call the resulting language SafeJVML. Well-typed SafeJVML programs are guaranteed to be free of data races and deadlocks. Designing a corresponding type system for JVML is important because most Java code is shipped in the JVML format. Designing a corresponding type system for JVML is nontrivial because of important differences between Java and JVML. In particular, the absence of block structure in JVML programs and the fact that they do not use named local variables the way Java programs do make the type systems for Java and JVML significantly different. For example, verifying absence of races and deadlocks in JVML programs requires performing an alias analysis, something that was not necessary for verifying absence of races and deadlocks in Java programs. This paper presents static and dynamic semantics for SafeJVML. It also includes a proof that the SafeJVML type system is sound and that it prevents data races and deadlocks. 
To the best of our knowledge, this is the first type system for JVML that statically ensures absence of synchronization errors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "data races; deadlocks; ownership types; SafeJava", } @Article{Pozniansky:2007:MEF, author = "Eli Pozniansky and Assaf Schuster", title = "{MultiRace}: efficient on-the-fly data race detection in multithreaded {C++} programs", journal = j-CCPE, volume = "19", number = "3", pages = "327--340", day = "10", month = mar, year = "2007", CODEN = "CCPEBO", DOI = "http://dx.doi.org/10.1002/cpe.1064", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:10 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/ccpe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", onlinedate = "1 Aug 2006", } @Article{Rosu:2007:ITO, author = "Grigore Ro{\c{s}}u and Koushik Sen", title = "An instrumentation technique for online analysis of multithreaded programs", journal = j-CCPE, volume = "19", number = "3", pages = "311--325", day = "10", month = mar, year = "2007", CODEN = "CCPEBO", DOI = "http://dx.doi.org/10.1002/cpe.1066", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:10 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/ccpe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", onlinedate = "1 Aug 2006", } @Article{Shi:2007:CCP, author = "Xudong Shi and Feiqi Su and Jih-kwon Peir and Ye Xia and Zhen Yang", title = "{CMP} cache performance projection: accessibility vs. 
capacity", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "13--20", month = mar, year = "2007", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1241601.1241607", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Efficient utilizing on-chip storage space on Chip-Multiprocessors (CMPs) has become an important research topic. Tradeoffs between data accessibility and effective on-chip capacity have been studied extensively. It requires costly simulations to understand a wide-spectrum of the design space. In this paper, we first develop an abstract model for understanding the performance impact with respect to data replication. To overcome the lack of real-time interactions among multiple cores in the abstract model, we propose a global stack simulation strategy to study the performance of a variety of cache organizations on CMPs. The global stack logically incorporates a shared stack and per-core private stacks to collect shared/private reuse (stack) distances for every memory reference in a single simulation pass. With the collected reuse distances, performance in terms of hits/misses and average memory access times can be calculated for various cache organizations. We verify the stack results against individual execution-driven simulations that consider realistic cache parameters and delays using a set of commercial multithreaded workloads. The results show that stack simulations can accurately model the performance of various cache organizations. 
The single-pass stack simulation results demonstrate that the effectiveness of various techniques for optimizing the CMP on-chip storage is closely related to the working sets of the workloads as well as to the total cache sizes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "CMP caches; data replication; performance modeling and projection; stack simulation", remark = "{DASCMP'06}", } @Article{Smaragdakis:2007:TIC, author = "Yannis Smaragdakis and Anthony Kay and Reimer Behrends and Michal Young", title = "Transactions with isolation and cooperation", journal = j-SIGPLAN, volume = "42", number = "10", pages = "191--210", month = oct, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1297027.1297042", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:00:28 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present the TIC (Transactions with Isolation and Cooperation) model for concurrent programming. TIC adds to standard transactional memory the ability for a transaction to observe the effects of other threads at selected points. This allows transactions to cooperate, as well as to invoke nonrepeatable or irreversible operations, such as I/O. Cooperating transactions run the danger of exposing intermediate state and of having other threads change the transaction's state. The TIC model protects against unanticipated interference by having the type system keep track of all operations that may (transitively) violate the atomicity of a transaction and require the programmer to establish consistency at appropriate points. The result is a programming model that is both general and simple. We have used the TIC model to re-engineer existing lock-based applications including a substantial multi-threaded web mail server and a memory allocator with coarse-grained locking. 
Our experience confirms the features of the TIC model: It is convenient for the programmer, while maintaining the benefits of transactional memory.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "nested transactions; open-nesting; punctuation; TIC; transactional memory", } @Book{Sweetman:2007:SMR, author = "Dominic Sweetman", title = "See {MIPS} run", publisher = pub-MORGAN-KAUFMANN, address = pub-MORGAN-KAUFMANN:adr, edition = "Second", pages = "xix + 492", year = "2007", ISBN = "0-12-088421-6", ISBN-13 = "978-0-12-088421-6", LCCN = "QA76.9.A73 S88 2007", bibdate = "Thu Jun 20 10:21:55 2002", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Embedded computer systems --- Programming; MIPS (Computer architecture); RISC microprocessors", libnote = "Not yet in my library.", tableofcontents = "Ch. 1. RISCs and MIPS architectures p. 1\\ Ch. 2. MIPS architecture p. 29\\ Ch. 3. Coprocessor 0 : MIPS processor control p. 53\\ Ch. 4. How caches work on MIPS processors p. 79\\ Ch. 5. Exceptions, interrupts, and initialization p. 105\\ Ch. 6. Low-level memory management and the TLB p. 131\\ Ch. 7. Floating-point support p. 151\\ Ch. 8. Complete guide to the MIPS instruction set p. 183\\ Ch. 9. Reading MIPS assembly language p. 263\\ Ch. 10. Porting software to the MIPS architecture p. 279\\ Ch. 11. MIPS software standards (ABIs) p. 311\\ Ch. 12. Debugging MIPS designs - debug and profiling features p. 339\\ Ch. 13. GNU/Linux from eight miles high p. 363\\ Ch. 14. How hardware and software work together p. 371\\ Ch. 15. MIPS specific issues in the Linux kernel p. 399\\ Ch. 16. Linux application code, PIC, and libraries p. 409\\ App. A. MIPS multithreading p. 415\\ App. B. 
Other optional extensions to the MIPS instruction set", } @Article{Tam:2007:TCS, author = "David Tam and Reza Azimi and Michael Stumm", title = "Thread clustering: sharing-aware scheduling on {SMP--CMP--SMT} multiprocessors", journal = j-OPER-SYS-REV, volume = "41", number = "3", pages = "47--58", month = jun, year = "2007", CODEN = "OSRED8", DOI = "http://doi.acm.org/10.1145/1272996.1273004", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:16:31 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The major chip manufacturers have all introduced chip multiprocessing (CMP) and simultaneous multithreading (SMT) technology into their processing units. As a result, even low-end computing systems and game consoles have become shared memory multiprocessors with L1 and L2 cache sharing within a chip. Mid- and large-scale systems will have multiple processing chips and hence consist of an SMP-CMP-SMT configuration with non-uniform data sharing overheads. Current operating system schedulers are not aware of these new cache organizations, and as a result, distribute threads across processors in a way that causes many unnecessary, long-latency cross-chip cache accesses.\par In this paper we describe the design and implementation of a scheme to schedule threads based on sharing patterns detected online using features of standard performance monitoring units (PMUs) available in today's processing units. The primary advantage of using the PMU infrastructure is that it is fine-grained (down to the cache line) and has relatively low overhead. We have implemented our scheme in Linux running on an 8-way Power5 SMP-CMP-SMT multi-processor. For commercial multithreaded server workloads (VolanoMark, SPECjbb, and RUBiS), we are able to demonstrate reductions in cross-chip cache accesses of up to 70\%.
These reductions lead to application-reported performance improvements of up to 7\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "affinity scheduling; cache behavior; cache locality; CMP; detecting sharing; hardware performance counters; hardware performance monitors; multithreading; performance monitoring unit; resource allocation; shared caches; sharing; simultaneous multithreading; single-chip multiprocessors; SMP; SMT; thread migration; thread placement; thread scheduling", } @Article{Walcott:2007:DPA, author = "Kristen R. Walcott and Greg Humphreys and Sudhanva Gurumurthi", title = "Dynamic prediction of architectural vulnerability from microarchitectural state", journal = j-COMP-ARCH-NEWS, volume = "35", number = "2", pages = "516--527", month = may, year = "2007", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1250662.1250726", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:48:43 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Transient faults due to particle strikes are a key challenge in microprocessor design. Driven by exponentially increasing transistor counts, per-chip faults are a growing burden. To protect against soft errors, redundancy techniques such as redundant multithreading (RMT) are often used. However, these techniques assume that the probability that a structural fault will result in a soft error (i.e., the Architectural Vulnerability Factor (AVF)) is 100 percent, unnecessarily draining processor resources. Due to the high cost of redundancy, there have been efforts to throttle RMT at runtime. To date, these methods have not incorporated an AVF model and therefore tend to be ad hoc. 
Unfortunately, computing the AVF of complex microprocessor structures (e.g., the ISQ) can be quite involved.\par To provide probabilistic guarantees about fault tolerance, we have created a rigorous characterization of AVF behavior that can be easily implemented in hardware. We experimentally demonstrate AVF variability within and across the SPEC2000 benchmarks and identify strong correlations between structural AVF values and a small set of processor metrics. Using these simple indicators as predictors, we create a proof-of-concept RMT implementation that demonstrates that AVF prediction can be used to maintain a low fault tolerance level without significant performance impact.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "architecture vulnerability factor; microarchitecture; performance; redundant multithreading; reliability", } @Article{Wang:2007:EAP, author = "Perry H. Wang and Jamison D. Collins and Gautham N. Chinya and Hong Jiang and Xinmin Tian and Milind Girkar and Nick Y. Yang and Guei-Yuan Lueh and Hong Wang", title = "{EXOCHI}: architecture and programming environment for a heterogeneous multi-core multithreaded system", journal = j-SIGPLAN, volume = "42", number = "6", pages = "156--166", month = jun, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1250734.1250753", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:55:30 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Future mainstream microprocessors will likely integrate specialized accelerators, such as GPUs, onto a single die to achieve better performance and power efficiency. 
However, it remains a keen challenge to program such a heterogeneous multicore platform, since these specialized accelerators feature ISAs and functionality that are significantly different from the general purpose CPU cores. In this paper, we present EXOCHI: (1) Exoskeleton Sequencer (EXO), an architecture to represent heterogeneous accelerators as ISA-based MIMD architecture resources, and a shared virtual memory heterogeneous multithreaded program execution model that tightly couples specialized accelerator cores with general-purpose CPU cores, and (2) C for Heterogeneous Integration (CHI), an integrated C/C++ programming environment that supports accelerator-specific inline assembly and domain-specific languages. The CHI compiler extends the OpenMP pragma for heterogeneous multithreading programming, and produces a single fat binary with code sections corresponding to different instruction sets. The runtime can judiciously spread parallel computation across the heterogeneous cores to optimize performance and power.\par We have prototyped the EXO architecture on a physical heterogeneous platform consisting of an Intel{\reg} Core{\TM} 2 Duo processor and an 8-core 32-thread Intel{\reg} Graphics Media Accelerator X3000. In addition, we have implemented the CHI integrated programming environment with the Intel{\reg} C++ Compiler, runtime toolset, and debugger. 
On the EXO prototype system, we have enhanced a suite of production-quality media kernels for video and image processing to utilize the accelerator through the CHI programming interface, achieving significant speedup (1.41X to 10.97X) over execution on the IA32 CPU alone.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "GPU; heterogeneous multi-cores; openMP", } @Article{Wang:2007:OSC, author = "Qin Wang and Junpu Chen and Weihua Zhang and Min Yang and Binyu Zang", title = "Optimizing software cache performance of packet processing applications", journal = j-SIGPLAN, volume = "42", number = "7", pages = "227--236", month = jul, year = "2007", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1273444.1254808", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 10:57:50 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Network processors (NPs) are widely used in many types of networking equipment due to their high performance and flexibility. For most NPs, software cache is used instead of hardware cache due to the chip area, cost and power constraints. Therefore, programmers should take full responsibility for software cache management which is neither intuitive nor easy to most of them. Actually, without an effective use of it, long memory access latency will be a critical limiting factor to overall applications. Prior researches like hardware multi-threading, wide-word accesses and packet access combination for caching have already been applied to help programmers to overcome this bottleneck. However, most of them do not make enough use of the characteristics of packet processing applications and often perform intraprocedural optimizations only. As a result, the binary codes generated by those techniques often get lower performance than that comes from hand-tuned assembly programming for some applications.
In this paper, we propose an algorithm including two techniques --- Critical Path Based Analysis (CPBA) and Global Adaptive Localization (GAL), to optimize the software cache performance of packet processing applications. Packet processing applications usually have several hot paths and CPBA tries to insert localization instructions according to their execution frequencies. For further optimizations, GAL eliminates some redundant localization instructions by interprocedural analysis and optimizations. Our algorithm is applied on some representative applications. Experiment results show that it leads to an average speedup by a factor of 1.974.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "local memory; network processor; optimization", } @Article{Yan:2007:HMC, author = "Jun Yan and Wei Zhang", title = "Hybrid multi-core architecture for boosting single-threaded performance", journal = j-COMP-ARCH-NEWS, volume = "35", number = "1", pages = "141--148", month = mar, year = "2007", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1241601.1241603", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:47:26 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The scaling of technology and the diminishing return of complicated uniprocessors have driven the industry towards multicore processors. While multithreaded applications can naturally leverage the enhanced throughput of multi-core processors, a large number of important applications are single-threaded, which cannot automatically harness the potential of multi-core processors. 
In this paper, we propose a compiler-driven heterogeneous multicore architecture, consisting of tightly-integrated VLIW (Very Long Instruction Word) and superscalar processors on a single chip, to automatically boost the performance of single-threaded applications without compromising the capability to support multithreaded programs. In the proposed multi-core architecture, while the high-performance VLIW core is used to run code segments with high instruction-level parallelism (ILP) extracted by the compiler; the superscalar core can be exploited to deal with the runtime events that are typically difficult for the VLIW core to handle, such as L2 cache misses. Our initial experimental results by running the preexecution thread on the superscalar core to mitigate the L2 cache misses of the main thread on the VLIW core indicate that the proposed VLIW/superscalar multi-core processor can automatically improve the performance of single-threaded general-purpose applications by up to 40.8\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Yang:2007:RUL, author = "Jin-Min Yang and Da-Fang Zhang and Xue-Dong Yang and Wen-Wei Li", title = "Reliable user-level rollback recovery implementation for multithreaded processes on {Windows}", journal = j-SPE, volume = "37", number = "3", pages = "331--346", month = mar, year = "2007", CODEN = "SPEXBL", DOI = "http://dx.doi.org/10.1002/spe.771", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Wed Oct 17 18:33:14 MDT 2007", bibsource = "http://www.interscience.wiley.com/jpages/0038-0644; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www3.interscience.wiley.com/journalfinder.html", acknowledgement = ack-nhfb, fjournal = "Software---Practice and Experience", onlinedate = "24 Oct 2006", } @Article{Abdulla:2008:MCR, author = "Parosh Aziz Abdulla and Fr{\'e}d{\'e}ric Haziza and Mats Kindahl", title = "Model checking race-freeness", journal =
j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "72--79", month = dec, year = "2008", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1556444.1556454", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the introduction of highly concurrent systems in standard desktop computers, ensuring correctness of industrial-size concurrent programs is becoming increasingly important. One of the most important standards in use for developing multi-threaded programs is the POSIX Threads standard, commonly known as PThreads. Of particular importance, the analysis of industrial code should, as far as possible, be automatic and not require annotations or other forms of specifications of the code.\par Model checking has been one of the most successful approaches to program verification during the last two decades. The size and complexity of applications which can be handled have increased rapidly through integration with symbolic techniques. These methods are designed to work on finite (but large) state spaces. This framework fails to deal with several essential aspects of behaviours for multithreaded programs: there is no bound a priori on the number of threads which may arise in a given run of the system; each thread manipulates local variables which often range over unbounded domains; and the system has a dynamic structure in the sense that threads can be created and killed throughout execution of the system. In this paper we concentrate on checking a particular class of properties for concurrent programs, namely safety properties. 
In particular, we focus on race-freeness, that is, the absence of race conditions (also known as data races) in shared-variable pthreaded programs.\par We will follow a particular methodology which we have earlier developed for model checking general classes of infinite-state systems [1, 3, 6, 8, 9] and apply a symbolic backward reachability analysis to verify the safety property. Since we construct a model as an over-approximation of the original program, proving the safety property in the model implies that the property also holds in the original system. Surprisingly, it leads to a quite efficient analysis which can be carried out fully automatically.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Adams:2008:ENE, author = "Michael D. Adams and R. Kent Dybvig", title = "Efficient nondestructive equality checking for trees and graphs", journal = j-SIGPLAN, volume = "43", number = "9", pages = "179--188", month = sep, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1411203.1411230", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Sep 23 17:31:25 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The Revised$^6$ Report on Scheme requires its generic equivalence predicate, equal?, to terminate even on cyclic inputs. While the terminating equal? can be implemented via a DFA-equivalence or union-find algorithm, these algorithms usually require an additional pointer to be stored in each object, are not suitable for multithreaded code due to their destructive nature, and may be unacceptably slow for the small acyclic values that are the most likely inputs to the predicate.\par This paper presents a variant of the union-find algorithm for equal? that addresses these issues. 
It performs well on large and small, cyclic and acyclic inputs by interleaving a low-overhead algorithm that terminates only for acyclic inputs with a more general algorithm that handles cyclic inputs. The algorithm terminates for all inputs while never being more than a small factor slower than whichever of the acyclic or union-find algorithms would have been faster. Several intermediate algorithms are also presented, each of which might be suitable for use in a particular application, though only the final algorithm is suitable for use in a library procedure, like equal?, that must work acceptably well for all inputs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "dfa equivalence; eq hash tables; equality; scheme; union-find", } @Article{Agrawal:2008:AWS, author = "Kunal Agrawal and Charles E. Leiserson and Yuxiong He and Wen Jing Hsu", title = "Adaptive work-stealing with parallelism feedback", journal = j-TOCS, volume = "26", number = "3", pages = "7:1--7:32", month = sep, year = "2008", CODEN = "ACSYEC", DOI = "http://doi.acm.org/10.1145/1394441.1394443", ISSN = "0734-2071", ISSN-L = "0734-2071", bibdate = "Wed Sep 17 14:28:13 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multiprocessor scheduling in a shared multiprogramming environment can be structured as two-level scheduling, where a kernel-level job scheduler allots processors to jobs and a user-level thread scheduler schedules the work of a job on its allotted processors. We present a randomized work-stealing thread scheduler for fork-join multithreaded jobs that provides continual parallelism feedback to the job scheduler in the form of requests for processors. 
Our A-STEAL algorithm is appropriate for large parallel servers where many jobs share a common multiprocessor resource and in which the number of processors available to a particular job may vary during the job's execution. Assuming that the job scheduler never allots a job more processors than requested by the job's thread scheduler, A-STEAL guarantees that the job completes in near-optimal time while utilizing at least a constant fraction of the allotted processors.\par We model the job scheduler as the thread scheduler's adversary, challenging the thread scheduler to be robust to the operating environment as well as to the job scheduler's administrative policies. For example, the job scheduler might make a large number of processors available exactly when the job has little use for them. To analyze the performance of our adaptive thread scheduler under this stringent adversarial assumption, we introduce a new technique called {\em trim analysis,\/} which allows us to prove that our thread scheduler performs poorly on no more than a small number of time steps, exhibiting near-optimal behavior on the vast majority.\par More precisely, suppose that a job has work $T_1$ and span $T_\infty$. On a machine with $P$ processors, A-STEAL completes the job in an expected duration of $O(T_1 / \tilde{P} + T_\infty + L \lg P)$ time steps, where $L$ is the length of a scheduling quantum, and $\tilde{P}$ denotes the $O(T_\infty + L \lg P)$-trimmed availability. This quantity is the average of the processor availability over all time steps except the $O(T_\infty + L \lg P)$ time steps that have the highest processor availability. When the job's parallelism dominates the trimmed availability, that is, $\tilde{P} \ll T_1 / T_\infty$, the job achieves nearly perfect linear speedup.
Conversely, when the trimmed mean dominates the parallelism, the asymptotic running time of the job is nearly the length of its span, which is optimal.\par We measured the performance of A-STEAL on a simulated multiprocessor system using synthetic workloads. For jobs with sufficient parallelism, our experiments confirm that A-STEAL provides almost perfect linear speedup across a variety of processor availability profiles. We compared A-STEAL with the ABP algorithm, an adaptive work-stealing thread scheduler developed by Arora et al. [1998] which does not employ parallelism feedback. On moderately to heavily loaded machines with large numbers of processors, A-STEAL typically completed jobs more than twice as quickly as ABP, despite being allotted the same number or fewer processors on every step, while wasting only 10\% of the processor cycles wasted by ABP.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "ACM Transactions on Computer Systems", keywords = "adaptive scheduling; adversary; instantaneous parallelism; job scheduling; multiprocessing; multiprogramming; parallel computation; parallelism feedback; processor allocation; randomized algorithm; space sharing; span; thread scheduling; trim analysis; two-level scheduling; work; work-stealing", } @Article{Anderson:2008:SCD, author = "Zachary Anderson and David Gay and Rob Ennals and Eric Brewer", title = "{SharC}: checking data sharing strategies for multithreaded {C}", journal = j-SIGPLAN, volume = "43", number = "6", pages = "149--158", month = jun, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1379022.1375600", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:53 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Unintended or unmediated data sharing is a frequent cause of insidious bugs in multithreaded programs.
We present a tool called SharC (short for Sharing Checker) that allows a user to write lightweight annotations to declare how they believe objects are being shared between threads in their program. SharC uses a combination of static and dynamic analyses to check that the program conforms to this specification.\par SharC allows any type to have one of five 'sharing modes' -- private to the current thread, read-only, shared under the control of a specified lock, intentionally racy, or checked dynamically. The dynamic mode uses run-time checking to verify that objects are either read-only, or only accessed by one thread. This allows us to check programs that would be difficult to check with a purely static system. If the user does not give a type an explicit annotation, then SharC uses a static type-qualifier analysis to infer that it is either private or should be checked dynamically.\par SharC allows objects to move between different sharing modes at runtime by using reference counting to check that there are no other references to the objects when they change mode.\par SharC's baseline dynamic analysis can check any C program, but is slow, and will generate false warnings about intentional data sharing. As the user adds more annotations, false warnings are reduced, and performance improves. We have found in practice that very few annotations are needed to describe all sharing and give reasonable performance. 
We ran SharC on 6 legacy C programs, summing to over 600k lines of code, and found that a total of only 60 simple annotations were needed to remove all false positives and to reduce performance overhead to only 2-14\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "data-race", } @Article{Athanasaki:2008:EPL, author = "Evangelia Athanasaki and Nikos Anastopoulos and Kornilios Kourtis and Nectarios Koziris", title = "Exploring the performance limits of simultaneous multithreading for memory intensive applications", journal = j-J-SUPERCOMPUTING, volume = "44", number = "1", pages = "64--97", month = apr, year = "2008", CODEN = "JOSUED", DOI = "http://dx.doi.org/10.1007/s11227-007-0149-x", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Wed Jul 9 17:32:34 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=44&issue=1; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=44&issue=1&spage=64", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", keywords = "Instruction-level parallelism; Performance analysis; Simultaneous multithreading; Software prefetching; Speculative precomputation; Thread-level parallelism", } @Article{Auerbach:2008:FTG, author = "Joshua Auerbach and David F. 
Bacon and Rachid Guerraoui and Jesper Honig Spring and Jan Vitek", title = "Flexible task graphs: a unified restricted thread programming model for {Java}", journal = j-SIGPLAN, volume = "43", number = "7", pages = "1--11", month = jul, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1375657.1375659", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:05:54 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The disadvantages of unconstrained shared-memory multi-threading in Java, especially with regard to latency and determinism in realtime systems, have given rise to a variety of language extensions that place restrictions on how threads allocate, share, and communicate memory, leading to order-of-magnitude reductions in latency and jitter. However, each model makes different trade-offs with respect to expressiveness, efficiency, enforcement, and latency, and no one model is best for all applications.\par In this paper we present Flexible Task Graphs (Flexotasks), a single system that allows different isolation policies and mechanisms to be combined in an orthogonal manner, subsuming four previously proposed models as well as making it possible to use new combinations best suited to the needs of particular applications. We evaluate our implementation on top of the IBM Web-Sphere Real Time Java virtual machine using both a microbenchmark and a 30 KLOC avionics collision detector. 
We show that Flexotasks are capable of executing periodic threads at 10 KHz with a standard deviation of 1.2$\mu$s and that it achieves significantly better performance than RTSJ's scoped memory constructs while remaining impervious to interference from global garbage collection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "Java Virtual Machine; memory management; ownership types; real-time systems", } @Article{Bahmann:2008:EFK, author = "Helge Bahmann and Konrad Froitzheim", title = "Extending futex for kernel to user notification", journal = j-OPER-SYS-REV, volume = "42", number = "5", pages = "18--26", month = jul, year = "2008", CODEN = "OSRED8", DOI = "http://doi.acm.org/10.1145/1400097.1400100", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Wed Aug 6 16:54:12 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Threads in reactive applications need to service a multitude of events from different sources such as device drivers, communication channels or cooperating threads. While notification about these events can conceptually be understood as a form of 'synchronization', most operating systems (including Linux) do not provide a unified abstraction. This paper proposes to separate event delivery and notification, and to provide unified event notification through general-purpose synchronization objects. It demonstrates how this unified mechanism can be implemented in Linux as an extension of the futex mechanism to allow notification from kernel-space. Required modifications are discussed and their impact is assessed. 
The new event notification mechanism allows to move many thread activation policy decisions into user-space, with benefits for multi-threaded reactive applications: This is demonstrated in a modification of the leader/followers pattern with considerable performance benefits.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "event notification; followers; futex; leader {\&} synchronization", } @Article{Boehm:2008:FCC, author = "Hans-J. Boehm and Sarita V. Adve", title = "Foundations of the {C++} concurrency memory model", journal = j-SIGPLAN, volume = "43", number = "6", pages = "68--78", month = jun, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1379022.1375591", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:53 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Currently multi-threaded C or C++ programs combine a single-threaded programming language with a separate threads library. This is not entirely sound [7].\par We describe an effort, currently nearing completion, to address these issues by explicitly providing semantics for threads in the next revision of the C++ standard. Our approach is similar to that recently followed by Java [25], in that, at least for a well-defined and interesting subset of the language, we give sequentially consistent semantics to programs that do not contain data races. Nonetheless, a number of our decisions are often surprising even to those familiar with the Java effort:\par We (mostly) insist on sequential consistency for race-free programs, in spite of implementation issues that came to light after the Java work.\par We give no semantics to programs with data races. 
There are no benign C++ data races.\par We use weaker semantics for trylock than existing languages or libraries, allowing us to promise sequential consistency with an intuitive race definition, even for programs with trylock.\par This paper describes the simple model we would like to be able to provide for C++ threads programmers, and explain how this, together with some practical, but often under-appreciated implementation constraints, drives us towards the above decisions.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "c++; data race; memory consistency; memory model; sequential consistency; trylock", } @Article{Boneti:2008:SCP, author = "Carlos Boneti and Francisco J. Cazorla and Roberto Gioiosa and Alper Buyuktosunoglu and Chen-Yong Cher and Mateo Valero", title = "Software-Controlled Priority Characterization of {POWER5} Processor", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "415--426", month = jun, year = "2008", CODEN = "CANED2", DOI = "http://dx.doi.org/10.1109/ISCA.2008.8", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Due to the limitations of instruction-level parallelism, thread-level parallelism has become a popular way to improve processor performance. One example is the IBM POWER5{\TM} processor, a two-context simultaneous-multithreaded dual-core chip. In each SMT core, the IBM POWER5 features two levels of thread resource balancing and prioritization. The first level provides automatic in-hardware resource balancing, while the second level is a software-controlled priority mechanism that presents eight levels of thread priorities. Currently, software-controlled prioritization is only used in limited number of cases in the software platforms due to lack of performance characterization of the effects of this mechanism.
In this work, we characterize the effects of the software-based prioritization on several different workloads. We show that the impact of the prioritization significantly depends on the workloads coscheduled on a core. By prioritizing the right task, it is possible to obtain more than two times of throughput improvement for synthetic workloads compared to the baseline. We also present two application case studies targeting two different performance metrics: the first case study improves overall throughput by 23.7\% and the second case study reduces the total execution time by 9.3\%. In addition, we show the circumstances when a background thread can be run transparently without affecting the performance of the foreground thread.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "IBM POWER5; performance characterization; simultaneous multithreading; SMT; software-controlled prioritization", } @Article{Campanoni:2008:PDC, author = "Simone Campanoni and Giovanni Agosta and Stefano Crespi Reghizzi", title = "A parallel dynamic compiler for {CIL} bytecode", journal = j-SIGPLAN, volume = "43", number = "4", pages = "11--20", month = apr, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1374752.1374754", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:46 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multi-core technology is being employed in most recent high-performance architectures. 
Such architectures need specifically designed multi-threaded software to exploit all the potentialities of their hardware parallelism.\par At the same time, object code virtualization technologies are achieving a growing popularity, as they allow higher levels of software portability and reuse.\par Thus, a virtual execution environment running on a multi-core processor has to run complex, high-level applications and to exploit as much as possible the underlying parallel hardware. We propose an approach that leverages on CMP features to expose a novel pipeline synchronization model for the internal threads of the dynamic compiler.\par Thanks to compilation latency masking effect of the pipeline organization, our dynamic compiler, ILDJIT, is able to achieve significant speedups (26\% on average) with respect to the baseline, when the underlying hardware exposes at least two cores.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "dynamic compilation; parallel virtual machine; virtual execution system", } @Article{Chugh:2008:DAC, author = "Ravi Chugh and Jan W. Voung and Ranjit Jhala and Sorin Lerner", title = "Dataflow analysis for concurrent programs using datarace detection", journal = j-SIGPLAN, volume = "43", number = "6", pages = "316--326", month = jun, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1375581.1375620", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:53 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Dataflow analyses for concurrent programs differ from their single-threaded counterparts in that they must account for shared memory locations being overwritten by concurrent threads. 
Existing dataflow analysis techniques for concurrent programs typically fall at either end of a spectrum: at one end, the analysis conservatively kills facts about all data that might possibly be shared by multiple threads; at the other end, a precise thread-interleaving analysis determines which data may be shared, and thus which dataflow facts must be invalidated. The former approach can suffer from imprecision, whereas the latter does not scale.\par We present RADAR, a framework that automatically converts a dataflow analysis for sequential programs into one that is correct for concurrent programs. RADAR uses a race detection engine to kill the dataflow facts, generated and propagated by the sequential analysis, that become invalid due to concurrent writes. Our approach of factoring all reasoning about concurrency into a race detection engine yields two benefits. First, to obtain analyses for code using new concurrency constructs, one need only design a suitable race detection engine for the constructs. Second, it gives analysis designers an easy way to tune the scalability and precision of the overall analysis by only modifying the race detection engine. We describe the RADAR framework and its implementation using a pre-existing race detection engine. We show how RADAR was used to generate a concurrent version of a null-pointer dereference analysis, and we analyze the result of running the generated concurrent analysis on several benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "interprocedural analysis; locksets; multithreaded programs; summaries", } @Article{Flanagan:2008:ADA, author = "Cormac Flanagan and Stephen N. 
Freund", title = "{Atomizer}: {A} dynamic atomicity checker for multithreaded programs", journal = j-SCI-COMPUT-PROGRAM, volume = "71", number = "2", pages = "89--109", day = "1", month = apr, year = "2008", CODEN = "SCPGD4", ISSN = "0167-6423", ISSN-L = "0167-6423", bibdate = "Fri Apr 1 18:39:19 MDT 2011", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/01676423", acknowledgement = ack-nhfb, fjournal = "Science of Computer Programming", } @Article{Flanagan:2008:TAS, author = "Cormac Flanagan and Stephen N. Freund and Marina Lifshin and Shaz Qadeer", title = "Types for atomicity: {Static} checking and inference for {Java}", journal = j-TOPLAS, volume = "30", number = "4", pages = "20:1--20:52", month = jul, year = "2008", CODEN = "ATPSDT", DOI = "http://doi.acm.org/10.1145/1377492.1377495", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Aug 5 19:14:53 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Atomicity is a fundamental correctness property in multithreaded programs. A method is atomic if, for every execution, there is an equivalent serial execution in which the actions of the method are not interleaved with actions of other threads. Atomic methods are amenable to sequential reasoning, which significantly facilitates subsequent analysis and verification.\par This article presents a type system for specifying and verifying the atomicity of methods in multithreaded Java programs using a synthesis of Lipton's theory of reduction and type systems for race detection. The type system supports guarded, write-guarded, and unguarded fields, as well as thread-local data, parameterized classes and methods, and protected locks. 
We also present an algorithm for verifying atomicity via type inference.\par We have applied our type checker and type inference tools to a number of commonly used Java library classes and programs. These tools were able to verify the vast majority of methods in these benchmarks as atomic, indicating that atomicity is a widespread methodology for multithreaded programming. In addition, reported atomicity violations revealed some subtle errors in the synchronization disciplines of these programs.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "ACM Transactions on Programming Languages and Systems", keywords = "Atomicity; concurrent programs; type inference; type systems", } @Article{Flanagan:2008:VSC, author = "Cormac Flanagan and Stephen N. Freund and Jaeheon Yi", title = "{Velodrome}: a sound and complete dynamic atomicity checker for multithreaded programs", journal = j-SIGPLAN, volume = "43", number = "6", pages = "293--303", month = jun, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1375581.1375618", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:53 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Atomicity is a fundamental correctness property in multithreaded programs, both because atomic code blocks are amenable to sequential reasoning (which significantly simplifies correctness arguments), and because atomicity violations often reveal defects in a program's synchronization structure. Unfortunately, all atomicity analyses developed to date are incomplete in that they may yield false alarms on correctly synchronized programs, which limits their usefulness.\par We present the first dynamic analysis for atomicity that is both sound and complete. 
The analysis reasons about the exact dependencies between operations in the observed trace of the target program, and it reports error messages if and only if the observed trace is not conflict-serializable. Despite this significant increase in precision, the performance and coverage of our analysis is competitive with earlier incomplete dynamic analyses for atomicity.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "atomicity; dynamic analysis; serializability", } @Article{Gidenstam:2008:LLF, author = "Anders Gidenstam and Marina Papatriantafilou", title = "{LFTHREADS}: a lock-free thread library", journal = j-COMP-ARCH-NEWS, volume = "36", number = "5", pages = "88--92", month = dec, year = "2008", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1556444.1556456", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Fri Jun 26 11:50:56 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This extended abstract presents LFTHREADS, a thread library entirely based on lock-free methods, i.e. no spinlocks or similar synchronization mechanisms are employed in the implementation of the multithreading. Since lock-freedom is highly desirable in multiprocessors/multicores due to its advantages in parallelism, fault-tolerance, convoy-avoidance and more, there is an increased demand in lock-free methods in parallel applications, hence also in multiprocessor/multicore system services. LFTHREADS is the first thread library that provides a lock-free implementation of blocking synchronization primitives for application threads; although the latter may sound like a contradicting goal, such objects have several benefits: e.g. library operations that block and unblock threads on the same synchronization object can make progress in parallel while maintaining the desired thread-level semantics and without having to wait for any 'slow' operations among them. 
Besides, as no spin-locks or similar synchronization mechanisms are employed, memory contention can be reduced and processors/cores are able to do useful work. As a consequence, applications, too, can enjoy enhanced parallelism and fault-tolerance. For the synchronization in LFTHREADS we have introduced a new method, which we call responsibility hand-off (RHO), that does not need any special kernel support. The RHO method is also of independent interest, as it can also serve as a tool for lock-free token passing, management of contention and interaction between scheduling and synchronization. This paper gives an outline and the context of LFTHREADS. For more details the reader is referred to [7] and [8].", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Gravvanis:2008:JMB, author = "George A. Gravvanis and Victor N. Epitropou", title = "{Java} multithreading-based parallel approximate arrow-type inverses", journal = j-CCPE, volume = "20", number = "10", pages = "1151--1172", month = jul, year = "2008", CODEN = "CCPEBO", DOI = "http://dx.doi.org/10.1002/cpe.1262", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:25 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/ccpe.bib; http://www.math.utah.edu/pub/tex/bib/java2000.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", onlinedate = "18 Sep 2007", } @Article{Hassanein:2008:AEH, author = "Wessam M. Hassanein and Layali K. Rashid and Moustafa A. 
Hammad", title = "Analyzing the Effects of Hyperthreading on the Performance of Data Management Systems", journal = j-INT-J-PARALLEL-PROG, volume = "36", number = "2", pages = "206--225", month = apr, year = "2008", CODEN = "IJPPE5", DOI = "http://dx.doi.org/10.1007/s10766-007-0066-x", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:07:03 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=206", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", keywords = "Data management systems; Databases; Hyper-threaded architectures; Performance; Simultaneous multithreading", } @Article{He:2008:COD, author = "Bingsheng He and Qiong Luo", title = "Cache-oblivious databases: {Limitations} and opportunities", journal = j-TODS, volume = "33", number = "2", pages = "8:1--8:??", month = jun, year = "2008", CODEN = "ATDSD3", DOI = "http://doi.acm.org/10.1145/1366102.1366105", ISSN = "0362-5915 (print), 1557-4644 (electronic)", ISSN-L = "0362-5915", bibdate = "Wed Jun 25 08:39:17 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/tods/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Cache-oblivious techniques, proposed in the theory community, have optimal asymptotic bounds on the amount of data transferred between any two adjacent levels of an arbitrary memory hierarchy. Moreover, this optimal performance is achieved without any hardware platform specific tuning. 
These properties are highly attractive to autonomous databases, especially because the hardware architectures are becoming increasingly complex and diverse.\par In this article, we present our design, implementation, and evaluation of the first cache-oblivious in-memory query processor, EaseDB. Moreover, we discuss the inherent limitations of the cache-oblivious approach as well as the opportunities given by the upcoming hardware architectures. Specifically, a cache-oblivious technique usually requires sophisticated algorithm design to achieve a comparable performance to its cache-conscious counterpart. Nevertheless, this development-time effort is compensated by the automaticity of performance achievement and the reduced ownership cost. Furthermore, this automaticity enables cache-oblivious techniques to outperform their cache-conscious counterparts in multi-threading processors.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Database Systems", keywords = "cache-conscious; cache-oblivious; chip multiprocessors; data caches; simultaneous multithreading", } @Article{Jacobs:2008:PMC, author = "Bart Jacobs and Frank Piessens and Jan Smans and K. Rustan M. Leino and Wolfram Schulte", title = "A programming model for concurrent object-oriented programs", journal = j-TOPLAS, volume = "31", number = "1", pages = "1:1--1:48", month = dec, year = "2008", CODEN = "ATPSDT", DOI = "http://doi.acm.org/10.1145/1452044.1452045", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Tue Dec 23 11:52:52 MST 2008", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Reasoning about multithreaded object-oriented programs is difficult, due to the nonlocal nature of object aliasing and data races. 
We propose a programming regime (or {\em programming model\/}) that rules out data races, and enables local reasoning in the presence of object aliasing and concurrency. Our programming model builds on the multithreading and synchronization primitives as they are present in current mainstream programming languages. Java or C\# programs developed according to our model can be annotated by means of stylized comments to make the use of the model explicit. We show that such annotated programs can be formally verified to comply with the programming model. If the annotated program verifies, the underlying Java or C\# program is guaranteed to be free from data races, and it is sound to reason locally about program behavior. Verification is modular: a program is valid if all methods are valid, and validity of a method does not depend on program elements that are not visible to the method. We have implemented a verifier for programs developed according to our model in a custom build of the Spec\# programming system, and we have validated our approach on a case study.", acknowledgement = ack-nhfb, articleno = "1", fjournal = "ACM Transactions on Programming Languages and Systems", keywords = "Aliasing; data races; local reasoning; modular reasoning; ownership; verification condition generation", } @Article{Jaisson:2008:IPM, author = "Pascal Jaisson and Florian {De Vuyst}", title = "An innovating {PDE} model based on fluid flow paradigm for multithread systems", journal = j-COMP-NET-AMSTERDAM, volume = "52", number = "18", pages = "3318--3324", day = "22", month = dec, year = "2008", CODEN = "????", ISSN = "1389-1286", ISSN-L = "1389-1286", bibdate = "Sat Apr 2 08:42:29 MDT 2011", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/13891286", acknowledgement = ack-nhfb, fjournal = "Computer Networks (Amsterdam, Netherlands: 1999)", } @Article{Kang:2008:ISE, author = "Dongsoo Kang and Chen Liu and Jean-Luc 
Gaudiot", title = "The Impact of Speculative Execution on {SMT} Processors", journal = j-INT-J-PARALLEL-PROG, volume = "36", number = "4", pages = "361--385", month = aug, year = "2008", CODEN = "IJPPE5", DOI = "http://dx.doi.org/10.1007/s10766-007-0052-3", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:07:14 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=4; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=4&spage=361", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", keywords = "Confidence estimator; Simultaneous multithreading; Speculation control; Thread scheduling", } @Article{Kgil:2008:PUS, author = "Taeho Kgil and Ali Saidi and Nathan Binkert and Steve Reinhardt and Krisztian Flautner and Trevor Mudge", title = "{PicoServer}: {Using} {$3$D} stacking technology to build energy efficient servers", journal = j-JETC, volume = "4", number = "4", pages = "16:1--16:??", month = oct, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1412587.1412589", ISSN = "1550-4832", ISSN-L = "1550-4832", bibdate = "Wed Mar 17 14:22:55 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/jetc/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This article extends our prior work to show that a straightforward use of 3D stacking technology enables the design of compact energy-efficient servers. Our proposed architecture, called PicoServer, employs 3D technology to bond one die containing several simple, slow processing cores to multiple memory dies sufficient for a primary memory. The multiple memory dies are composed of DRAM. This use of 3D stacks readily facilitates wide low-latency buses between processors and memory. 
These remove the need for an L2 cache allowing its area to be re-allocated to additional simple cores. The additional cores allow the clock frequency to be lowered without impairing throughput. Lower clock frequency means that thermal constraints, a concern with 3D stacking, are easily satisfied. We extend our original analysis on PicoServer to include: (1) a wider set of server workloads, (2) the impact of multithreading, and (3) the on-chip DRAM architecture and system memory usage. PicoServer is intentionally simple, requiring only the simplest form of 3D technology where die are stacked on top of one another. Our intent is to minimize risk of introducing a new technology (3D) to implement a class of low-cost, low-power compact server architectures.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Journal on Emerging Technologies in Computing Systems (JETC)", keywords = "3D stacking technology; chip multiprocessor; full-system simulation; Low power; Tier-1/2/3 server", } @Article{Krashinsky:2008:ISV, author = "Ronny Krashinsky and Christopher Batten and Krste Asanovi{\'c}", title = "Implementing the {Scale} vector-thread processor", journal = j-TODAES, volume = "13", number = "3", pages = "41:1--41:??", month = jul, year = "2008", CODEN = "ATASFO", DOI = "http://doi.acm.org/10.1145/1367045.1367050", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Tue Aug 5 18:41:27 MDT 2008", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The Scale vector-thread processor is a complexity-effective solution for embedded computing which flexibly supports both vector and highly multithreaded processing. The 7.1-million transistor chip has 16 decoupled execution clusters, vector load and store units, and a nonblocking 32KB cache. 
An automated and iterative design and verification flow enabled a performance-, power-, and area-efficient implementation with two person-years of development effort. Scale has a core area of 16.6 mm$^2$ in 180 nm technology, and it consumes 400 mW--1.1 W while running at 260 MHz.", acknowledgement = ack-nhfb, articleno = "41", fjournal = "ACM Transactions on Design Automation of Electronic Systems (TODAES)", keywords = "hybrid C++/Verilog simulation; iterative VLSI design flow; multithreaded processors; procedural datapath pre-placement; vector processors; vector-thread processors", } @Article{Kumar:2008:AVO, author = "Sanjeev Kumar and Daehyun Kim and Mikhail Smelyanskiy and Yen-Kuang Chen and Jatin Chhugani and Christopher J. Hughes and Changkyu Kim and Victor W. Lee and Anthony D. Nguyen", title = "Atomic Vector Operations on Chip Multiprocessors", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "441--452", month = jun, year = "2008", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1394608.1382154", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The current trend is for processors to deliver dramatic improvements in parallel performance while only modestly improving serial performance. Parallel performance is harvested through vector/SIMD instructions as well as multithreading (through both multithreaded cores and chip multiprocessors). Vector parallelism can be more efficiently supported than multithreading, but is often harder for software to exploit. In particular, code with sparse data access patterns cannot easily utilize the vector/SIMD instructions of mainstream processors. Hardware to scatter and gather sparse data has previously been proposed to enable vector execution for these codes. 
However, on multithreaded architectures, a number of applications spend significant time on atomic operations (e.g., parallel reductions), which cannot be vectorized using previously proposed schemes. This paper proposes architectural support for atomic vector operations (referred to as GLSC) that addresses this limitation. GLSC extends scatter-gather hardware to support atomic memory operations. Our experiments show that the GLSC provides an average performance improvement on a set of important RMS kernels of 54\% for 4-wide SIMD.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "locks; multiprocessors; reductions; SIMD; vector", } @Article{Liu:2008:HPP, author = "Duo Liu and Zheng Chen and Bei Hua and Nenghai Yu and Xinan Tang", title = "High-performance packet classification algorithm for multithreaded {IXP} network processor", journal = j-TECS, volume = "7", number = "2", pages = "16:1--16:??", month = feb, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1331331.1331340", ISSN = "1539-9087", ISSN-L = "1539-9087", bibdate = "Thu Jun 12 15:22:00 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Packet classification is crucial for the Internet to provide more value-added services and guaranteed quality of service. Besides hardware-based solutions, many software-based classification algorithms have been proposed. However, classifying at 10 Gbps speed or higher is a challenging problem and it is still one of the performance bottlenecks in core routers. In general, classification algorithms face the same challenge of balancing between high classification speed and low memory requirements. This paper proposes a modified recursive flow classification (RFC) algorithm, Bitmap-RFC, which significantly reduces the memory requirements of RFC by applying a bitmap compression technique. 
To speed up classifying speed, we exploit the multithreaded architectural features in various algorithm development stages from algorithm design to algorithm implementation. As a result, Bitmap-RFC strikes a good balance between speed and space. It can significantly keep both high classification speed and reduce memory space consumption. This paper investigates the main NPU software design aspects that have dramatic performance impacts on any NPU-based implementations: memory space reduction, instruction selection, data allocation, task partitioning, and latency hiding. We experiment with an architecture-aware design principle to guarantee the high performance of the classification algorithm on an NPU implementation. The experimental results show that the Bitmap-RFC algorithm achieves 10 Gbps speed or higher and has a good scalability on Intel IXP2800 NPU.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "ACM Transactions on Embedded Computing Systems", keywords = "architecture; embedded system design; multithreading; network processor; packet classification; thread-level parallelism", } @Article{Montesinos:2008:DRD, author = "Pablo Montesinos and Luis Ceze and Josep Torrellas", title = "{DeLorean}: Recording and Deterministically Replaying Shared-Memory Multiprocessor Execution Efficiently", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "289--300", month = jun, year = "2008", CODEN = "CANED2", DOI = "http://dx.doi.org/10.1109/ISCA.2008.36", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Support for deterministic replay of multithreaded execution can greatly help in finding concurrency bugs. 
For highest effectiveness, replay schemes should (i) record at production-run speed, (ii) keep their logging requirements minute, and (iii) replay at a speed similar to that of the initial execution. In this paper, we propose a new substrate for deterministic replay that provides substantial advances along these axes. In our proposal, processors execute blocks of instructions atomically, as in transactional memory or speculative multithreading, and the system only needs to record the commit order of these blocks. We call our scheme DeLorean. Our results show that DeLorean records execution at a speed similar to that of Release Consistency (RC) execution and replays at about 82\% of its speed. In contrast, most current schemes only record at the speed of Sequential Consistency (SC) execution. Moreover, DeLorean only needs 7.5\% of the log size needed by a state-of-the-art scheme. Finally, DeLorean can be configured to need only 0.6\% of the log size of the state-of-the-art scheme at the cost of recording at 86\% of RC's execution speed --- still faster than SC. In this configuration, the log of an 8-processor 5-GHz machine is estimated to be only about 20GB per day.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Musuvathi:2008:FSM, author = "Madanlal Musuvathi and Shaz Qadeer", title = "Fair stateless model checking", journal = j-SIGPLAN, volume = "43", number = "6", pages = "362--371", month = jun, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1379022.1375625", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:53 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Stateless model checking is a useful state-space exploration technique for systematically testing complex real-world software. 
Existing stateless model checkers are limited to the verification of safety properties on terminating programs. However, realistic concurrent programs are nonterminating, a property that significantly reduces the efficacy of stateless model checking in testing them. Moreover, existing stateless model checkers are unable to verify that a nonterminating program satisfies the important liveness property of livelock-freedom, a property that requires the program to make continuous progress for any input.\par To address these shortcomings, this paper argues for incorporating a fair scheduler in stateless exploration. The key contribution of this paper is an explicit scheduler that is (strongly) fair and at the same time sufficiently nondeterministic to guarantee full coverage of safety properties. We have implemented the fair scheduler in the CHESS model checker. We show through theoretical arguments and empirical evaluation that our algorithm satisfies two important properties: 1) it visits all states of a finite-state program achieving state coverage at a faster rate than existing techniques, and 2) it finds all livelocks in a finite-state program. Before this work, nonterminating programs had to be manually modified in order to apply CHESS to them. The addition of fairness has allowed CHESS to be effectively applied to real-world nonterminating programs without any modification. For example, we have successfully booted the Singularity operating system under the control of CHESS.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "concurrency; fairness; liveness; model checking; multi-threading; shared-memory programs; software testing", } @Article{Neamtiu:2008:CEV, author = "Iulian Neamtiu and Michael Hicks and Jeffrey S. 
Foster and Polyvios Pratikakis", title = "Contextual effects for version-consistent dynamic software updating and safe concurrent programming", journal = j-SIGPLAN, volume = "43", number = "1", pages = "37--49", month = jan, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1328897.1328447", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:02:13 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper presents a generalization of standard effect systems that we call contextual effects. A traditional effect system computes the effect of an expression $e$. Our system additionally computes the effects of the computational context in which $e$ occurs. More specifically, we compute the effect of the computation that has already occurred (the prior effect) and the effect of the computation yet to take place (the future effect).\par Contextual effects are useful when the past or future computation of the program is relevant at various program points. We present two substantial examples. First, we show how prior and future effects can be used to enforce transactional version consistency (TVC), a novel correctness property for dynamic software updates. TVC ensures that programmer-designated transactional code blocks appear to execute entirely at the same code version, even if a dynamic update occurs in the middle of the block. Second, we show how future effects can be used in the analysis of multi-threaded programs to find thread-shared locations. This is an essential step in applications such as data race detection.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "computation effects; contextual effects; data race detection; dynamic software updating; type and effect systems; version consistency", } @Article{Ottoni:2008:COGa, author = "Guilherme Ottoni and David I. 
August", title = "Communication optimizations for global multi-threaded instruction scheduling", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "222--232", month = mar, year = "2008", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1353535.1346310", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The recent shift in the industry towards chip multiprocessor (CMP) designs has brought the need for multi-threaded applications to mainstream computing. As observed in several limit studies, most of the parallelization opportunities require looking for parallelism beyond local regions of code. To exploit these opportunities, especially for sequential applications, researchers have recently proposed global multi-threaded instruction scheduling techniques, including DSWP and GREMIO. These techniques simultaneously schedule instructions from large regions of code, such as arbitrary loop nests or whole procedures, and have been shown to be effective at extracting threads for many applications. A key enabler of these global instruction scheduling techniques is the Multi-Threaded Code Generation (MTCG) algorithm proposed in [16], which generates multi-threaded code for any partition of the instructions into threads. This algorithm inserts communication and synchronization instructions in order to satisfy all inter-thread dependences.\par In this paper, we present a general compiler framework, COCO, to optimize the communication and synchronization instructions inserted by the MTCG algorithm. This framework, based on thread-aware data-flow analyses and graph min-cut algorithms, appropriately models and optimizes all kinds of inter-thread dependences, including register, memory, and control dependences. 
Our experiments, using a fully automatic compiler implementation of these techniques, demonstrate significant reductions (about 30\% on average) in the number of dynamic communication instructions in code parallelized with DSWP and GREMIO. This reduction in communication translates to performance gains of up to 40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "communication; data-flow analysis; graph min-cut; instruction scheduling; multi-threading; synchronization", } @Article{Ottoni:2008:COGb, author = "Guilherme Ottoni and David I. August", title = "Communication optimizations for global multi-threaded instruction scheduling", journal = j-OPER-SYS-REV, volume = "42", number = "2", pages = "222--232", month = mar, year = "2008", CODEN = "OSRED8", DOI = "http://doi.acm.org/10.1145/1353535.1346310", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:20:12 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The recent shift in the industry towards chip multiprocessor (CMP) designs has brought the need for multi-threaded applications to mainstream computing. As observed in several limit studies, most of the parallelization opportunities require looking for parallelism beyond local regions of code. To exploit these opportunities, especially for sequential applications, researchers have recently proposed global multi-threaded instruction scheduling techniques, including DSWP and GREMIO. These techniques simultaneously schedule instructions from large regions of code, such as arbitrary loop nests or whole procedures, and have been shown to be effective at extracting threads for many applications. A key enabler of these global instruction scheduling techniques is the Multi-Threaded Code Generation (MTCG) algorithm proposed in [16], which generates multi-threaded code for any partition of the instructions into threads. 
This algorithm inserts communication and synchronization instructions in order to satisfy all inter-thread dependences.\par In this paper, we present a general compiler framework, COCO, to optimize the communication and synchronization instructions inserted by the MTCG algorithm. This framework, based on thread-aware data-flow analyses and graph min-cut algorithms, appropriately models and optimizes all kinds of inter-thread dependences, including register, memory, and control dependences. Our experiments, using a fully automatic compiler implementation of these techniques, demonstrate significant reductions (about 30\% on average) in the number of dynamic communication instructions in code parallelized with DSWP and GREMIO. This reduction in communication translates to performance gains of up to 40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "communication; data-flow analysis; graph min-cut; instruction scheduling; multi-threading; synchronization", } @Article{Ottoni:2008:COGc, author = "Guilherme Ottoni and David I. August", title = "Communication optimizations for global multi-threaded instruction scheduling", journal = j-SIGPLAN, volume = "43", number = "3", pages = "222--232", month = mar, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1353535.1346310", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:03:40 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The recent shift in the industry towards chip multiprocessor (CMP) designs has brought the need for multi-threaded applications to mainstream computing. As observed in several limit studies, most of the parallelization opportunities require looking for parallelism beyond local regions of code. 
To exploit these opportunities, especially for sequential applications, researchers have recently proposed global multi-threaded instruction scheduling techniques, including DSWP and GREMIO. These techniques simultaneously schedule instructions from large regions of code, such as arbitrary loop nests or whole procedures, and have been shown to be effective at extracting threads for many applications. A key enabler of these global instruction scheduling techniques is the Multi-Threaded Code Generation (MTCG) algorithm proposed in [16], which generates multi-threaded code for any partition of the instructions into threads. This algorithm inserts communication and synchronization instructions in order to satisfy all inter-thread dependences.\par In this paper, we present a general compiler framework, COCO, to optimize the communication and synchronization instructions inserted by the MTCG algorithm. This framework, based on thread-aware data-flow analyses and graph min-cut algorithms, appropriately models and optimizes all kinds of inter-thread dependences, including register, memory, and control dependences. Our experiments, using a fully automatic compiler implementation of these techniques, demonstrate significant reductions (about 30\% on average) in the number of dynamic communication instructions in code parallelized with DSWP and GREMIO. This reduction in communication translates to performance gains of up to 40\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "communication; data-flow analysis; graph min-cut; instruction scheduling; multi-threading; synchronization", } @Article{Rangan:2008:PSD, author = "Ram Rangan and Neil Vachharajani and Guilherme Ottoni and David I. 
August", title = "Performance scalability of decoupled software pipelining", journal = j-TACO, volume = "5", number = "2", pages = "8:1--8:??", month = aug, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1400112.1400113", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Any successful solution to using multicore processors to scale general-purpose program performance will have to contend with rising intercore communication costs while exposing coarse-grained parallelism. Recently proposed pipelined multithreading (PMT) techniques have been demonstrated to have general-purpose applicability and are also able to effectively tolerate inter-core latencies through pipelined interthread communication. These desirable properties make PMT techniques strong candidates for program parallelization on current and future multicore processors and understanding their performance characteristics is critical to their deployment. To that end, this paper evaluates the performance scalability of a general-purpose PMT technique called decoupled software pipelining (DSWP) and presents a thorough analysis of the communication bottlenecks that must be overcome for optimal DSWP scalability.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", keywords = "decoupled software pipelining; performance analysis", } @Article{Rounce:2008:DIS, author = "Peter A. Rounce and Alberto F. 
De Souza", title = "Dynamic Instruction Scheduling in a Trace-based Multi-threaded Architecture", journal = j-INT-J-PARALLEL-PROG, volume = "36", number = "2", pages = "184--205", month = apr, year = "2008", CODEN = "IJPPE5", DOI = "http://dx.doi.org/10.1007/s10766-007-0062-1", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:07:03 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=184", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", keywords = "Dynamic instruction scheduling; Simultaneous multi-threading; VLIW; Wide issue architectures", } @Article{Sen:2008:RDR, author = "Koushik Sen", title = "Race directed random testing of concurrent programs", journal = j-SIGPLAN, volume = "43", number = "6", pages = "11--21", month = jun, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1379022.1375584", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:04:53 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Bugs in multi-threaded programs often arise due to data races. Numerous static and dynamic program analysis techniques have been proposed to detect data races. We propose a novel randomized dynamic analysis technique that utilizes potential data race information obtained from an existing analysis tool to separate real races from false races without any need for manual inspection. 
Specifically, we use potential data race information obtained from an existing dynamic analysis technique to control a random scheduler of threads so that real race conditions get created with very high probability and those races get resolved randomly at runtime. Our approach has several advantages over existing dynamic analysis tools. First, we can create a real race condition and resolve the race randomly to see if an error can occur due to the race. Second, we can replay a race revealing execution efficiently by simply using the same seed for random number generation--we do not need to record the execution. Third, our approach has very low overhead compared to other precise dynamic race detection techniques because we only track all synchronization operations and a single pair of memory access statements that are reported to be in a potential race by an existing analysis. We have implemented the technique in a prototype tool for Java and have experimented on a number of large multi-threaded Java programs. We report a number of previously known and unknown bugs and real races in these Java programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "concurrency; dynamic analysis; race detection; random testing", } @Article{Sharkey:2008:RRP, author = "Joseph J. Sharkey and Jason Loew and Dmitry V. 
Ponomarev", title = "Reducing register pressure in {SMT} processors through {L2}-miss-driven early register release", journal = j-TACO, volume = "5", number = "3", pages = "13:1--13:??", month = nov, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1455650.1455652", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Mon Dec 8 14:28:18 MST 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The register file is one of the most critical datapath components limiting the number of threads that can be supported on a simultaneous multithreading (SMT) processor. To allow the use of smaller register files without degrading performance, techniques that maximize the efficiency of using registers through aggressive register allocation/deallocation can be considered. In this article, we propose a novel technique to early deallocate physical registers allocated to threads which experience L2 cache misses. This is accomplished by speculatively committing the load-independent instructions and deallocating the registers corresponding to the previous mappings of their destinations, without waiting for the cache miss request to be serviced. The early deallocated registers are then made immediately available for allocation to instructions within the same thread as well as within other threads, thus improving the overall processor throughput. On the average across the simulated mixes of multiprogrammed SPEC 2000 workloads, our technique results in 33\% improvement in throughput and 25\% improvement in terms of harmonic mean of weighted IPCs over the baseline SMT with the state-of-the-art DCRA policy. 
This is achieved without creating checkpoints, maintaining per-register counters of pending consumers, performing tag rebroadcasts, register remappings, and/or additional associative searches.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", keywords = "register file; Simultaneous multithreading", } @Article{Suleman:2008:FDTa, author = "M. Aater Suleman and Moinuddin K. Qureshi and Yale N. Patt", title = "Feedback-driven threading: power-efficient and high-performance execution of multi-threaded workloads on {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "277--286", month = mar, year = "2008", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1346281.1346317", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Extracting high-performance from the emerging Chip Multiprocessors (CMPs) requires that the application be divided into multiple threads. Each thread executes on a separate core thereby increasing concurrency and improving performance. As the number of cores on a CMP continues to increase, the performance of some multi-threaded applications will benefit from the increased number of threads, whereas, the performance of other multi-threaded applications will become limited by data-synchronization and off-chip bandwidth. For applications that get limited by data-synchronization, increasing the number of threads significantly degrades performance and increases on-chip power. Similarly, for applications that get limited by off-chip bandwidth, increasing the number of threads increases on-chip power without providing any performance improvement. 
Furthermore, whether an application gets limited by data-synchronization, or bandwidth, or neither depends not only on the application but also on the input set and the machine configuration. Therefore, controlling the number of threads based on the run-time behavior of the application can significantly improve performance and reduce power.\par This paper proposes Feedback-Driven Threading (FDT), a framework to dynamically control the number of threads using run-time information. FDT can be used to implement Synchronization-Aware Threading (SAT), which predicts the optimal number of threads depending on the amount of data-synchronization. Our evaluation shows that SAT can reduce both execution time and power by up to 66\% and 78\% respectively. Similarly, FDT can be used to implement Bandwidth-Aware Threading (BAT), which predicts the minimum number of threads required to saturate the off-chip bus. Our evaluation shows that BAT reduces on-chip power by up to 78\%. When SAT and BAT are combined, the average execution time reduces by 17\% and power reduces by 59\%. The proposed techniques leverage existing performance counters and require minimal support from the threading library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "bandwidth; CMP; multi-threaded; synchronization", } @Article{Suleman:2008:FDTb, author = "M. Aater Suleman and Moinuddin K. Qureshi and Yale N. 
Patt", title = "Feedback-driven threading: power-efficient and high-performance execution of multi-threaded workloads on {CMPs}", journal = j-OPER-SYS-REV, volume = "42", number = "2", pages = "277--286", month = mar, year = "2008", CODEN = "OSRED8", DOI = "http://doi.acm.org/10.1145/1346281.1346317", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:20:12 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Extracting high-performance from the emerging Chip Multiprocessors (CMPs) requires that the application be divided into multiple threads. Each thread executes on a separate core thereby increasing concurrency and improving performance. As the number of cores on a CMP continues to increase, the performance of some multi-threaded applications will benefit from the increased number of threads, whereas, the performance of other multi-threaded applications will become limited by data-synchronization and off-chip bandwidth. For applications that get limited by data-synchronization, increasing the number of threads significantly degrades performance and increases on-chip power. Similarly, for applications that get limited by off-chip bandwidth, increasing the number of threads increases on-chip power without providing any performance improvement. Furthermore, whether an application gets limited by data-synchronization, or bandwidth, or neither depends not only on the application but also on the input set and the machine configuration. Therefore, controlling the number of threads based on the run-time behavior of the application can significantly improve performance and reduce power.\par This paper proposes Feedback-Driven Threading (FDT), a framework to dynamically control the number of threads using run-time information. FDT can be used to implement Synchronization-Aware Threading (SAT), which predicts the optimal number of threads depending on the amount of data-synchronization. 
Our evaluation shows that SAT can reduce both execution time and power by up to 66\% and 78\% respectively. Similarly, FDT can be used to implement Bandwidth-Aware Threading (BAT), which predicts the minimum number of threads required to saturate the off-chip bus. Our evaluation shows that BAT reduces on-chip power by up to 78\%. When SAT and BAT are combined, the average execution time reduces by 17\% and power reduces by 59\%. The proposed techniques leverage existing performance counters and require minimal support from the threading library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "bandwidth; CMP; multi-threaded; synchronization", } @Article{Suleman:2008:FDTc, author = "M. Aater Suleman and Moinuddin K. Qureshi and Yale N. Patt", title = "Feedback-driven threading: power-efficient and high-performance execution of multi-threaded workloads on {CMPs}", journal = j-SIGPLAN, volume = "43", number = "3", pages = "277--286", month = mar, year = "2008", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1346281.1346317", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jun 18 11:03:40 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Extracting high-performance from the emerging Chip Multiprocessors (CMPs) requires that the application be divided into multiple threads. Each thread executes on a separate core thereby increasing concurrency and improving performance. As the number of cores on a CMP continues to increase, the performance of some multi-threaded applications will benefit from the increased number of threads, whereas, the performance of other multi-threaded applications will become limited by data-synchronization and off-chip bandwidth. 
For applications that get limited by data-synchronization, increasing the number of threads significantly degrades performance and increases on-chip power. Similarly, for applications that get limited by off-chip bandwidth, increasing the number of threads increases on-chip power without providing any performance improvement. Furthermore, whether an application gets limited by data-synchronization, or bandwidth, or neither depends not only on the application but also on the input set and the machine configuration. Therefore, controlling the number of threads based on the run-time behavior of the application can significantly improve performance and reduce power.\par This paper proposes Feedback-Driven Threading (FDT), a framework to dynamically control the number of threads using run-time information. FDT can be used to implement Synchronization-Aware Threading (SAT), which predicts the optimal number of threads depending on the amount of data-synchronization. Our evaluation shows that SAT can reduce both execution time and power by up to 66\% and 78\% respectively. Similarly, FDT can be used to implement Bandwidth-Aware Threading (BAT), which predicts the minimum number of threads required to saturate the off-chip bus. Our evaluation shows that BAT reduces on-chip power by up to 78\%. When SAT and BAT are combined, the average execution time reduces by 17\% and power reduces by 59\%. The proposed techniques leverage existing performance counters and require minimal support from the threading library.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "bandwidth; CMP; multi-threaded; synchronization", } @Article{Thoziyoor:2008:CMM, author = "Shyamkumar Thoziyoor and Jung Ho Ahn and Matteo Monchiero and Jay B. Brockman and Norman P. 
Jouppi", title = "A Comprehensive Memory Modeling Tool and Its Application to the Design and Analysis of Future Memory Hierarchies", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "51--62", month = jun, year = "2008", CODEN = "CANED2", DOI = "http://dx.doi.org/10.1109/ISCA.2008.16", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper we introduce CACTI-D, a significant enhancement of CACTI 5.0. CACTI-D adds support for modeling of commodity DRAM technology and support for main memory DRAM chip organization. CACTI-D enables modeling of the complete memory hierarchy with consistent models all the way from SRAM based L1 caches through main memory DRAMs on DIMMs. We illustrate the potential applicability of CACTI-D in the design and analysis of future memory hierarchies by carrying out a last level cache study for a multicore multithreaded architecture at the 32nm technology node. In this study we use CACTI-D to model all components of the memory hierarchy including L1, L2, last level SRAM, logic process based DRAM or commodity DRAM L3 caches, and main memory DRAM chips. We carry out architectural simulation using benchmarks with large data sets and present results of their execution time, breakdown of power in the memory hierarchy, and system energy-delay product for the different system configurations. We find that commodity DRAM technology is most attractive for stacked last level caches, with significantly lower energy-delay products.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "cache; CACTI; commodity DRAM; LLC; logic-process based DRAM; SRAM", } @Article{Vantrease:2008:CSI, author = "Dana Vantrease and Robert Schreiber and Matteo Monchiero and Moray McLaren and Norman P. 
Jouppi and Marco Fiorentino and Al Davis and Nathan Binkert and Raymond G. Beausoleil and Jung Ho Ahn", title = "{Corona}: System Implications of Emerging Nanophotonic Technology", journal = j-COMP-ARCH-NEWS, volume = "36", number = "3", pages = "153--164", month = jun, year = "2008", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1394608.1382135", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Aug 6 08:35:03 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We expect that many-core microprocessors will push performance per chip from the 10 gigaflop to the 10 teraflop range in the coming decade. To support this increased performance, memory and inter-core bandwidths will also have to scale by orders of magnitude. Pin limitations, the energy cost of electrical signaling, and the non-scalability of chip-length global wires are significant bandwidth impediments. Recent developments in silicon nanophotonic technology have the potential to meet these off- and on-stack bandwidth requirements at acceptable power levels. Corona is a 3D many-core architecture that uses nanophotonic communication for both inter-core communication and off-stack communication to memory or I/O devices. Its peak floating-point performance is 10 teraflops. Dense wavelength division multiplexed optically connected memory modules provide 10 terabyte per second memory bandwidth. A photonic crossbar fully interconnects its 256 low-power multithreaded cores at 20 terabyte per second bandwidth. We have simulated a 1024 thread Corona system running synthetic benchmarks and scaled versions of the SPLASH-2 benchmark suite. 
We believe that in comparison with an electrically-connected many-core alternative that uses the same on-stack interconnect power, Corona can provide 2 to 6 times more performance on many memory intensive workloads, while simultaneously reducing power.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "3D stacking; many-core CMP; nanophotonics; on-chip Networks", } @TechReport{Volkov:2008:LQC, author = "Vasily Volkov and James W. Demmel", title = "{$LU$}, {$QR$} and {Cholesky} Factorizations using Vector Capabilities of {GPUs}", type = "LAPACK Working Note", number = "202", institution = inst-UCB-EECS, address = inst-UCB-EECS:adr, month = may, year = "2008", bibdate = "Fri Apr 24 12:25:43 2009", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.netlib.org/lapack/lawnspdf/lawn202.pdf", abstract = "We present performance results for dense linear algebra using the 8-series NVIDIA GPUs. Our matrix-matrix multiply routine (GEMM) runs 60\% faster than the vendor implementation in CUBLAS 1.1 and approaches the peak of hardware capabilities. Our LU, QR and Cholesky factorizations achieve up to 80--90\% of the peak GEMM rate. Our parallel LU running on two GPUs achieves up to $\approx$300 Gflop/s. These results are accomplished by challenging the accepted view of the GPU architecture and programming guidelines. We argue that modern GPUs should be viewed as multithreaded multicore vector units. We exploit blocking similarly to vector computers and heterogeneity of the system by computing both on GPU and CPU. This study includes detailed benchmarking of the GPU memory system that reveals sizes and latencies of caches and TLB. 
We present a couple of algorithmic optimizations aimed at increasing parallelism and regularity in the problem that provide us with slightly higher performance.", acknowledgement = ack-nhfb, ucbnumber = "UCB/EECS-2008-49", } @Article{Wang:2008:PIM, author = "Kun Wang and Yu Zhang and Huayong Wang and Xiaowei Shen", title = "Parallelization of {IBM Mambo} system simulator in functional modes", journal = j-OPER-SYS-REV, volume = "42", number = "1", pages = "71--76", month = jan, year = "2008", CODEN = "OSRED8", DOI = "http://doi.acm.org/10.1145/1341312.1341325", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:19:29 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Mambo [4] is IBM's full-system simulator which models PowerPC systems, and provides a complete set of simulation tools to help IBM and its partners in pre-hardware development and performance evaluation for future systems. Currently Mambo simulates target systems on a single host thread. When the number of cores increases in a target system, Mambo's simulation performance for each core goes down. As the so-called `multi-core era' approaches, both target and host systems will have more and more cores. It is very important for Mambo to efficiently simulate a multi-core target system on a multi-core host system. Parallelization is a natural method to speed up Mambo under this situation.\par Parallel Mambo (P-Mambo) is a multi-threaded implementation of Mambo. Mambo's simulation engine is implemented as a user-level thread-scheduler. We propose a multi-scheduler method to adapt Mambo's simulation engine to multi-threaded execution. Based on this method a core-based module partition is proposed to achieve both high inter-scheduler parallelism and low inter-scheduler dependency. Protection of shared resources is crucial to both correctness and performance of P-Mambo.
Since there are two tiers of threads in P-Mambo, protecting shared resources by only OS-level locks possibly introduces deadlocks due to user-level context switch. We propose a new lock mechanism to handle this problem. Since Mambo is an on-going project with many modules currently under development, co-existence with new modules is also important to P-Mambo. We propose a global-lock-based method to guarantee compatibility of P-Mambo with future Mambo modules.\par We have implemented the first version of P-Mambo in functional modes. The performance of P-Mambo has been evaluated on the OpenMP implementation of NAS Parallel Benchmark (NPB) 3.2 [12]. Preliminary experimental results show that P-Mambo achieves an average speedup of 3.4 on a 4-core host machine.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", keywords = "architectural simulation; dynamic binary translation; parallel simulation", } @Article{Warg:2008:DTS, author = "Fredrik Warg and Per Stenstr{\"o}m", title = "Dual-thread Speculation: {A} Simple Approach to Uncover Thread-level Parallelism on a Simultaneous Multithreaded Processor", journal = j-INT-J-PARALLEL-PROG, volume = "36", number = "2", pages = "166--183", month = apr, year = "2008", CODEN = "IJPPE5", DOI = "http://dx.doi.org/10.1007/s10766-007-0064-z", ISSN = "0885-7458 (print), 1573-7640 (electronic)", ISSN-L = "0885-7458", bibdate = "Wed Jul 9 16:07:03 MDT 2008", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0885-7458&volume=36&issue=2; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0885-7458&volume=36&issue=2&spage=166", acknowledgement = ack-nhfb, fjournal = "International Journal of Parallel Programming", keywords = "Chip multiprocessors; Computer architecture; Simultaneous multithreading; Thread-level parallelism; Thread-level speculation", } @Article{Wegiel:2008:MCVa, author = "Michal Wegiel and Chandra
Krintz", title = "The {Mapping Collector}: virtual memory support for generational, parallel, and concurrent compaction", journal = j-COMP-ARCH-NEWS, volume = "36", number = "1", pages = "91--102", month = mar, year = "2008", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1353535.1346294", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jun 17 11:51:35 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Parallel and concurrent garbage collectors are increasingly employed by managed runtime environments (MREs) to maintain scalability, as multi-core architectures and multi-threaded applications become pervasive. Moreover, state-of-the-art MREs commonly implement compaction to eliminate heap fragmentation and enable fast linear object allocation.\par Our empirical analysis of object demographics reveals that unreachable objects in the heap tend to form clusters large enough to be effectively managed at the granularity of virtual memory pages. Even though processes can manipulate the mapping of the virtual address space through the standard operating system (OS) interface on most platforms, extant parallel/concurrent compactors do not do so to exploit this clustering behavior and instead achieve compaction by performing, relatively expensive, object moving and pointer adjustment.\par We introduce the Mapping Collector (MC), which leverages virtual memory operations to reclaim and consolidate free space without moving objects and updating pointers. MC is a nearly-single-phase compactor that is simpler and more efficient than previously reported compactors that comprise two to four phases.
Through effective MRE-OS coordination, MC maintains the simplicity of a non-moving collector while providing efficient parallel and concurrent compaction.\par We implement both stop-the-world and concurrent MC in a generational garbage collection framework within the open-source HotSpot Java Virtual Machine. Our experimental evaluation using a multiprocessor indicates that MC significantly increases throughput and scalability as well as reduces pause times, relative to state-of-the-art, parallel and concurrent compactors.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "compaction; concurrent; parallel; virtual memory", } @Article{Wegiel:2008:MCVb, author = "Michal Wegiel and Chandra Krintz", title = "The {Mapping Collector}: virtual memory support for generational, parallel, and concurrent compaction", journal = j-OPER-SYS-REV, volume = "42", number = "2", pages = "91--102", month = mar, year = "2008", CODEN = "OSRED8", DOI = "http://doi.acm.org/10.1145/1353535.1346294", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Fri Jun 20 17:20:12 MDT 2008", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Parallel and concurrent garbage collectors are increasingly employed by managed runtime environments (MREs) to maintain scalability, as multi-core architectures and multi-threaded applications become pervasive. Moreover, state-of-the-art MREs commonly implement compaction to eliminate heap fragmentation and enable fast linear object allocation.\par Our empirical analysis of object demographics reveals that unreachable objects in the heap tend to form clusters large enough to be effectively managed at the granularity of virtual memory pages. 
Even though processes can manipulate the mapping of the virtual address space through the standard operating system (OS) interface on most platforms, extant parallel/concurrent compactors do not do so to exploit this clustering behavior and instead achieve compaction by performing, relatively expensive, object moving and pointer adjustment.\par We introduce the Mapping Collector (MC), which leverages virtual memory operations to reclaim and consolidate free space without moving objects and updating pointers. MC is a nearly-single-phase compactor that is simpler and more efficient than previously reported compactors that comprise two to four phases. Through effective MRE-OS coordination, MC maintains the simplicity of a non-moving collector while providing efficient parallel and concurrent compaction.\par We implement both stop-the-world and concurrent MC in a generational garbage collection framework within the open-source HotSpot Java Virtual Machine. Our experimental evaluation using a multiprocessor indicates that MC significantly increases throughput and scalability as well as reduces pause times, relative to state-of-the-art, parallel and concurrent compactors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "compaction; concurrent; parallel; virtual memory",
}

%%% Same paper as Wegiel:2008:MCVb (identical authors, pages, and DOI),
%%% recorded for a different journal; the title protects the system name
%%% Mapping Collector with braces exactly as that entry does.
@Article{Wegiel:2008:MCVc,
  author =       "Michal Wegiel and Chandra Krintz",
  title =        "The {Mapping Collector}: virtual memory support for generational, parallel, and concurrent compaction",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "3",
  pages =        "91--102",
  month =        mar,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1353535.1346294",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Jun 18 11:03:40 MDT 2008",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Parallel and concurrent garbage collectors are increasingly employed by
managed runtime environments (MREs) to maintain scalability, as multi-core architectures and multi-threaded applications become pervasive. Moreover, state-of-the-art MREs commonly implement compaction to eliminate heap fragmentation and enable fast linear object allocation.\par Our empirical analysis of object demographics reveals that unreachable objects in the heap tend to form clusters large enough to be effectively managed at the granularity of virtual memory pages. Even though processes can manipulate the mapping of the virtual address space through the standard operating system (OS) interface on most platforms, extant parallel/concurrent compactors do not do so to exploit this clustering behavior and instead achieve compaction by performing, relatively expensive, object moving and pointer adjustment.\par We introduce the Mapping Collector (MC), which leverages virtual memory operations to reclaim and consolidate free space without moving objects and updating pointers. MC is a nearly-single-phase compactor that is simpler and more efficient than previously reported compactors that comprise two to four phases. Through effective MRE-OS coordination, MC maintains the simplicity of a non-moving collector while providing efficient parallel and concurrent compaction.\par We implement both stop-the-world and concurrent MC in a generational garbage collection framework within the open-source HotSpot Java Virtual Machine. Our experimental evaluation using a multiprocessor indicates that MC significantly increases throughput and scalability as well as reduces pause times, relative to state-of-the-art, parallel and concurrent compactors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "compaction; concurrent; parallel; virtual memory",
}

@Article{Winter:2008:ATN,
  author =       "Jonathan A. Winter and David H.
Albonesi",
  title =        "Addressing thermal nonuniformity in {SMT} workloads",
  journal =      j-TACO,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2008",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1369396.1369400",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Mon Jun 16 11:41:51 MDT 2008",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We explore DTM techniques within the context of uniform and nonuniform SMT workloads. While DVS is suitable for addressing workloads with uniformly high temperatures, for nonuniform workloads, performance loss occurs because of the slowdown of the cooler thread. To address this, we propose and evaluate DTM mechanisms that exploit the steering-based thread management mechanisms inherent in a clustered SMT architecture. We show that in contrast to DVS, which operates globally, our techniques are more effective at controlling temperature for nonuniform workloads.
Furthermore, we devise a DTM technique that combines steering and DVS to achieve consistently good performance across all workloads.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Architecture and Code Optimization (TACO)",
  keywords =     "adaptive microarchitectures; clustered microarchitectures; dynamic thermal management; dynamic voltage scaling; simultaneous multithreading",
}

@Article{Wong:2008:TAF,
  author =       "Chee Siang Wong and Ian Tan and Rosalind Deena Kumari and Fun Wey",
  title =        "Towards achieving fairness in the {Linux} scheduler",
  journal =      j-OPER-SYS-REV,
  volume =       "42",
  number =       "5",
  pages =        "34--43",
  month =        jul,
  year =         "2008",
  CODEN =        "OSRED8",
  DOI =          "http://doi.acm.org/10.1145/1400097.1400102",
  ISSN =         "0163-5980",
  ISSN-L =       "0163-5980",
  bibdate =      "Wed Aug 6 16:54:12 MDT 2008",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The Operating System scheduler is designed to allocate the CPU resources appropriately to all processes. The Linux Completely Fair Scheduler (CFS) design ensures fairness among tasks using the thread fair scheduling algorithm. This algorithm ensures allocation of resources based on the number of threads in the system and not within executing programs. This can lead to fairness issue in a multi-threaded environment as the Linux scheduler tends to favor programs with higher number of threads. We illustrate the issue of fairness through experimental evaluation thus exposing the weakness of the current allocation scheme where software developers could take advantage by spawning many additional threads in order to obtain more CPU resources. A novel algorithm is proposed as a solution towards achieving better fairness in the Linux scheduler. The algorithm is based on weight readjustment of the threads created in the same process to significantly reduce the unfair allocation of CPU resources in multi-threaded environments.
The algorithm was implemented and evaluated. It demonstrated promising results towards solving the raised fairness issue. We conclude this paper highlighting the limitations of the proposed approach and the future work in the stated direction.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "completely fair scheduler; fairness; Linux; process scheduling",
}

@Article{Xian:2008:CAS,
  author =       "Feng Xian and Witawas Srisa-an and Hong Jiang",
  title =        "Contention-aware scheduler: unlocking execution parallelism in multithreaded {Java} programs",
  journal =      j-SIGPLAN,
  volume =       "43",
  number =       "10",
  pages =        "163--180",
  month =        sep,
  year =         "2008",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1449955.1449778",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Wed Oct 22 09:57:37 MDT 2008",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In multithreaded programming, locks are frequently used as a mechanism for synchronization. Because today's operating systems do not consider lock usage as a scheduling criterion, scheduling decisions can be unfavorable to multithreaded applications, leading to performance issues such as convoying and heavy lock contention in systems with multiple processors. Previous efforts to address these issues (e.g., transactional memory, lock-free data structure) often treat scheduling decisions as 'a fact of life,' and therefore these solutions try to cope with the consequences of undesirable scheduling instead of dealing with the problem directly.\par In this paper, we introduce {\em Contention-Aware Scheduler (CA-Scheduler)}, which is designed to support efficient execution of large multithreaded Java applications in multiprocessor systems. Our proposed scheduler employs a scheduling policy that reduces lock contention.
As will be shown in this paper, our prototype implementation of the CA-Scheduler in Linux and Sun HotSpot virtual machine only incurs 3.5\% runtime overhead, while the overall performance differences, when compared with a system with no contention awareness, range from a degradation of 3\% in a small multithreaded benchmark to an improvement of 15\% in a large Java application server benchmark.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "Java; operating systems; scheduling",
}

@Article{Aleen:2009:CAS,
  author =       "Farhana Aleen and Nathan Clark",
  title =        "Commutativity analysis for software parallelization: letting program transformations see the big picture",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "3",
  pages =        "241--252",
  month =        mar,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1508284.1508273",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:39:26 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Extracting performance from many-core architectures requires software engineers to create multi-threaded applications, which significantly complicates the already daunting task of software development. One solution to this problem is automatic compile-time parallelization, which can ease the burden on software developers in many situations. Clearly, automatic parallelization in its present form is not suitable for many application domains and new compiler analyses are needed address its shortcomings.\par In this paper, we present one such analysis: a new approach for detecting commutative functions. Commutative functions are sections of code that can be executed in any order without affecting the outcome of the application, e.g., inserting elements into a set.
Previous research on this topic had one significant limitation, in that the results of a commutative functions must produce identical memory layouts. This prevented previous techniques from detecting functions like malloc, which may return different pointers depending on the order in which it is called, but these differing results do not affect the overall output of the application. Our new commutativity analysis correctly identify these situations to better facilitate automatic parallelization. We demonstrate that this analysis can automatically extract significant amounts of parallelism from many applications, and where it is ineffective it can provide software developers a useful list of functions that may be commutative provided semantic program changes that are not automatable.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "automatic software parallelization; commutative functions; random interpretation",
}

@Article{Amamiya:2009:CBN,
  author =       "Satoshi Amamiya and Makoto Amamiya and Ryuzo Hasegawa and Hiroshi Fujita",
  title =        "A continuation-based noninterruptible multithreading processor architecture",
  journal =      j-J-SUPERCOMPUTING,
  volume =       "47",
  number =       "2",
  pages =        "228--252",
  month =        feb,
  year =         "2009",
  CODEN =        "JOSUED",
  ISSN =         "0920-8542 (print), 1573-0484 (electronic)",
  ISSN-L =       "0920-8542",
  bibdate =      "Wed Aug 25 08:38:29 MDT 2010",
  bibsource =    "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=47&issue=2; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=47&issue=2&spage=228",
  acknowledgement = ack-nhfb,
  fjournal =     "The Journal of Supercomputing",
}

@Article{Anderson:2009:LAC,
  author =       "Zachary R.
Anderson and David Gay and Mayur Naik",
  title =        "Lightweight annotations for controlling sharing in concurrent data structures",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "6",
  pages =        "98--109",
  month =        jun,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1542476.1542488",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:41:16 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "SharC is a recently developed system for checking data-sharing in multithreaded programs. Programmers specify sharing rules (read-only, protected by a lock, etc.) for individual objects, and the SharC compiler enforces these rules using static and dynamic checks. Violations of these rules indicate unintended data sharing, which is the underlying cause of harmful data-races. Additionally, SharC allows programmers to change the sharing rules for a specific object using a {\em sharing cast}, to capture the fact that sharing rules for an object often change during the object's lifetime. SharC was successfully applied to a number of multi-threaded C programs.\par However, many programs are not readily checkable using SharC because their sharing rules, and changes to sharing rules, effectively apply to whole data structures rather than to individual objects. We have developed a system called {\em Shoal\/} to address this shortcoming. In addition to the sharing rules and sharing cast of SharC, our system includes a new concept that we call {\em groups}. A group is a collection of objects all having the same sharing mode. Each group has a distinguished member called the {\em group leader}. When the sharing mode of the group leader changes by way of a sharing cast, the sharing mode of all members of the group also changes.
This operation is made sound by maintaining the invariant that at the point of a sharing cast, the only external pointer into the group is the pointer to the group leader. The addition of groups allows checking safe concurrency at the level of data structures rather than at the level of individual objects.\par We demonstrate the necessity and practicality of groups by applying Shoal to a wide range of concurrent C programs (the largest approaching a million lines of code). In all benchmarks groups entail low annotation burden and no significant additional performance overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "concurrent programming; data races; multithreaded programming",
}

@Article{Antonopoulos:2009:ASH,
  author =       "Christos D. Antonopoulos and Filip Blagojevic and Andrey N. Chernikov and Nikos P. Chrisochoides and Dimitrios S. Nikolopoulos",
  title =        "Algorithm, software, and hardware optimizations for {Delaunay} mesh generation on simultaneous multithreaded architectures",
  journal =      j-J-PAR-DIST-COMP,
  volume =       "69",
  number =       "7",
  pages =        "601--612",
  month =        jul,
  year =         "2009",
  CODEN =        "JPDCER",
  ISSN =         "0743-7315 (print), 1096-0848 (electronic)",
  ISSN-L =       "0743-7315",
  bibdate =      "Wed Sep 1 16:27:25 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315",
  acknowledgement = ack-nhfb,
  fjournal =     "Journal of Parallel and Distributed Computing",
}

@Article{Azizi:2009:AEC,
  author =       "Omid Azizi and Aqeel Mahesri and Sanjay J.
Patel and Mark Horowitz",
  title =        "Area-efficiency in {CMP} core design: co-optimization of microarchitecture and physical design",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "56--65",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "http://doi.acm.org/10.1145/1577129.1577138",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this paper, we examine the area-performance design space of a processing core for a chip multiprocessor (CMP), considering both the architectural design space and the tradeoffs of the physical design on which the architecture relies. We first propose a methodology for performing an integrated optimization of both the micro-architecture and the physical circuit design of a microprocessor. In our approach, we use statistical and convex fitting methods to capture a large micro-architectural design space. We then characterize the area-delay tradeoffs of the underlying circuits through RTL synthesis. Finally, we establish the relationship between the architecture and the circuits in an integrative model, which we use to optimize the processor. As a case study, we apply this methodology to explore the performance-area tradeoffs in a highly parallel accelerator architecture for visual computing applications. Based on some early circuit tradeoff data, our results indicate that two separate designs are performance/area optimal for our set of benchmarks: a simpler single-issue, 2-way multithreaded core running at high-frequency, and a more aggressively tuned dual-issue 4-way multithreaded design running at a lower frequency.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Article{Barkstrom:2009:UAS,
  author =       "Bruce R.
Barkstrom",
  title =        "On using {Ada} to solve problems in computational economics and related disciplines with concurrent, multiagent algorithms",
  journal =      j-SIGADA-LETTERS,
  volume =       "29",
  number =       "3",
  pages =        "61--72",
  month =        dec,
  year =         "2009",
  CODEN =        "AALEE5",
  DOI =          "http://doi.acm.org/10.1145/1647420.1647437",
  ISSN =         "0736-721X",
  ISSN-L =       "0736-721X",
  bibdate =      "Mon Jun 21 14:04:37 MDT 2010",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multiagent algorithms are widely used in computational economics and other social sciences to solve theoretical and practical problems. Because such algorithms are inherently concurrent and multithreaded, Ada's constructs for handling communications between concurrent processes and avoiding interference between them make the language very well suited to solving these problems, particularly given developments in multi-core CPU chip-making. This paper provides a concrete example of how Ada assists in solving problems in computational economics and related disciplines that work with multiagent systems. Solving a simple problem illustrates visualizing the agents as Ada tasks, using UML use cases and synchronization diagrams to design the communications patterns between agents, and applying protected objects and functions to avoid computational indeterminacy.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGAda Ada Letters",
  keywords =     "computational and mathematical organization theory; computational economics; concurrent programming; multiagent systems; multithreaded programming",
}

@Article{Berger:2009:GSM,
  author =       "Emery D.
Berger and Ting Yang and Tongping Liu and Gene Novark",
  title =        "{Grace}: safe multithreaded programming for {C\slash C++}",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "10",
  pages =        "81--96",
  month =        oct,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1640089.1640096",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jun 21 18:01:56 MDT 2010",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The shift from single to multiple core architectures means that programmers must write concurrent, multithreaded programs in order to increase application performance. Unfortunately, multithreaded applications are susceptible to numerous errors, including deadlocks, race conditions, atomicity violations, and order violations. These errors are notoriously difficult for programmers to debug.\par This paper presents Grace, a software-only runtime system that eliminates concurrency errors for a class of multithreaded programs: those based on fork-join parallelism. By turning threads into processes, leveraging virtual memory protection, and imposing a sequential commit protocol, Grace provides programmers with the appearance of deterministic, sequential execution, while taking advantage of available processing cores to run code concurrently and efficiently. Experimental results demonstrate Grace's effectiveness: with modest code changes across a suite of computationally-intensive benchmarks (1-16 lines), Grace can achieve high scalability and performance while preventing concurrency errors.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "concurrency; determinism; deterministic concurrency; fork-join; sequential semantics",
}

@Article{Bocchino:2009:TES,
  author =       "Robert L. {Bocchino, Jr.} and Vikram S. Adve and Danny Dig and Sarita V.
Adve and Stephen Heumann and Rakesh Komuravelli and Jeffrey Overbey and Patrick Simmons and Hyojin Sung and Mohsen Vakilian",
  title =        "A type and effect system for deterministic parallel {Java}",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "10",
  pages =        "97--116",
  month =        oct,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1639949.1640097",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Mon Jun 21 18:01:56 MDT 2010",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Today's shared-memory parallel programming models are complex and error-prone. While many parallel programs are intended to be deterministic, unanticipated thread interleavings can lead to subtle bugs and nondeterministic semantics. In this paper, we demonstrate that a practical {\em type and effect system\/} can simplify parallel programming by {\em guaranteeing deterministic semantics\/} with modular, compile-time type checking even in a rich, concurrent object-oriented language such as Java. We describe an object-oriented type and effect system that provides several new capabilities over previous systems for expressing deterministic parallel algorithms. We also describe a language called Deterministic Parallel Java (DPJ) that incorporates the new type system features, and we show that a core subset of DPJ is sound.
We describe an experimental validation showing that DPJ can express a wide range of realistic parallel programs; that the new type system features are useful for such programs; and that the parallel programs exhibit good performance gains (coming close to or beating equivalent, nondeterministic multithreaded programs where those are available).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "commutativity; determinism; deterministic parallelism; effect systems; effects",
}

@Article{Bratanov:2009:VMW,
  author =       "Stanislav Bratanov and Roman Belenov and Nikita Manovich",
  title =        "Virtual machines: a whole new world for performance analysis",
  journal =      j-OPER-SYS-REV,
  volume =       "43",
  number =       "2",
  pages =        "46--55",
  month =        apr,
  year =         "2009",
  CODEN =        "OSRED8",
  DOI =          "http://doi.acm.org/10.1145/1531793.1531802",
  ISSN =         "0163-5980",
  ISSN-L =       "0163-5980",
  bibdate =      "Thu Apr 23 19:43:22 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This article addresses a problem of performance monitoring inside virtual machines (VMs). It advocates focused monitoring of particular virtualized programs, explains the need for and the importance of such an approach to performance monitoring in virtualized execution environments, and emphasizes its benefits for virtual machine manufacturers, virtual machine users (mostly, software developers) and hardware (processor) manufacturers. The article defines the problem of in-VM performance monitoring as the ability to employ modern methods and hardware performance monitoring capabilities inside virtual machines to an extent comparable with what is being done in real environments. Unfortunately, there are numerous reasons preventing us from achieving such an ambitious goal, one of those reasons being the lack of support from virtualization engines; that is why a novel method of 'cooperative' performance data collection is disclosed.
The method implies collection of performance data at physical hardware and simultaneous tracking of software states inside a virtual machine. Each statistically visible execution point of the virtualized software may then be associated with information on real hardware events. The method effectively enables time-based sampling of virtualized workloads combined with hardware event counting, is applicable to unmodified, commercially available virtual machines, and has competitive precision and overhead. The practical significance and value of the method are further illustrated by studying a parallel workload and uncovering virtualization-specific performance issues of multithreaded programs.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "hardware performance event counters; virtual machines",
}

%%% The four people named below wrote this textbook, so they are recorded
%%% in the author field; an editor field would make styles label them as
%%% editors.
@Book{Cormen:2009:IA,
  author =       "Thomas H. Cormen and Charles Eric Leiserson and Ronald L. Rivest and Clifford Stein",
  title =        "Introduction to algorithms",
  publisher =    pub-MIT,
  address =      pub-MIT:adr,
  edition =      "Third",
  pages =        "xix + 1292",
  year =         "2009",
  ISBN =         "0-262-03384-4 (hardcover), 0-262-53305-7 (paperback)",
  ISBN-13 =      "978-0-262-03384-8 (hardcover), 978-0-262-53305-8 (paperback)",
  LCCN =         "QA76.6 .C662 2009",
  bibdate =      "Thu Sep 9 14:42:33 MDT 2010",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; z3950.loc.gov:7090/Voyager",
  abstract =     "Some books on algorithms are rigorous but incomplete; others cover masses of material but lack rigor. Introduction to Algorithms uniquely combines rigor and comprehensiveness. The book covers a broad range of algorithms in depth, yet makes their design and analysis accessible to all levels of readers. Each chapter is relatively self-contained and can be used as a unit of study. The algorithms are described in English and in a pseudocode designed to be readable by anyone who has done a little programming.
The explanations have been kept elementary without sacrificing depth of coverage or mathematical rigor. The first edition became a widely used text in universities worldwide as well as the standard reference for professionals. The second edition featured new chapters on the role of algorithms, probabilistic analysis and randomized algorithms, and linear programming. The third edition has been revised and updated throughout. It includes two completely new chapters, on van Emde Boas trees and multithreaded algorithms, and substantial additions to the chapter on recurrences (now called ``Divide-and-Conquer''). It features improved treatment of dynamic programming and greedy algorithms and a new notion of edge-based flow in the material on flow networks. Many new exercises and problems have been added for this edition.",
  acknowledgement = ack-nhfb,
  libnote =      "Not in my library.",
  subject =      "Computer programming; Computer algorithms",
}

@Article{deBoer:2009:SVC,
  author =       "F. S. de Boer",
  title =        "A shared-variable concurrency analysis of multi-threaded object-oriented programs",
  journal =      j-THEOR-COMP-SCI,
  volume =       "410",
  number =       "2--3",
  pages =        "128--141",
  day =          "6",
  month =        feb,
  year =         "2009",
  CODEN =        "TCSCDI",
  ISSN =         "0304-3975 (print), 1879-2294 (electronic)",
  ISSN-L =       "0304-3975",
  bibdate =      "Mon Mar 28 21:21:46 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/03043975",
  acknowledgement = ack-nhfb,
  fjournal =     "Theoretical Computer Science",
}

@Article{Devietti:2009:DDS,
  author =       "Joseph Devietti and Brandon Lucia and Luis Ceze and Mark Oskin",
  title =        "{DMP}: deterministic shared memory multiprocessing",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "3",
  pages =        "85--96",
  month =        mar,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1508244.1508255",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:39:26
MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Current shared memory multicore and multiprocessor systems are nondeterministic. Each time these systems execute a multithreaded application, even if supplied with the same input, they can produce a different output. This frustrates debugging and limits the ability to properly test multithreaded code, becoming a major stumbling block to the much-needed widespread adoption of parallel programming.\par In this paper we make the case for fully deterministic shared memory multiprocessing (DMP). The behavior of an arbitrary multithreaded program on a DMP system is only a function of its inputs. The core idea is to make inter-thread communication fully deterministic. Previous approaches to coping with nondeterminism in multithreaded programs have focused on replay, a technique useful only for debugging. In contrast, while DMP systems are directly useful for debugging by offering repeatability by default, we argue that parallel programs should execute deterministically in the field as well. This has the potential to make testing more assuring and increase the reliability of deployed multithreaded software. We propose a range of approaches to enforcing determinism and discuss their implementation trade-offs. 
We show that determinism can be provided with little performance cost using our architecture proposals on future hardware, and that software-only approaches can be utilized on existing systems.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "debugging; determinism; multicores; parallel programming", } @Article{Eyerman:2009:MLP, author = "Stijn Eyerman and Lieven Eeckhout", title = "Memory-level parallelism aware fetch policies for simultaneous multithreading processors", journal = j-TACO, volume = "6", number = "1", pages = "3:1--3:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1509864.1509867", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Thu May 7 14:55:25 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "A thread executing on a simultaneous multithreading (SMT) processor that experiences a long-latency load will eventually stall while holding execution resources. Existing long-latency load aware SMT fetch policies limit the amount of resources allocated by a stalled thread by identifying long-latency loads and preventing the thread from fetching more instructions --- and in some implementations, instructions beyond the long-latency load are flushed to release allocated resources.\par This article proposes an SMT fetch policy that takes into account the available memory-level parallelism (MLP) in a thread. The key idea proposed in this article is that in case of an isolated long-latency load (i.e., there is no MLP), the thread should be prevented from allocating additional resources. However, in case multiple independent long-latency loads overlap (i.e., there is MLP), the thread should allocate as many resources as needed in order to fully expose the available MLP. 
MLP-aware fetch policies achieve better performance for MLP-intensive threads on SMT processors, leading to higher overall system throughput and shorter average turnaround time than previously proposed fetch policies.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Architecture and Code Optimization (TACO)",
  keywords =     "Fetch Policy; Memory-Level Parallelism (MLP); Simultaneous Multithreading (SMT)",
}

@Article{Eyerman:2009:PTC,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Per-thread cycle accounting in {SMT} processors",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "3",
  pages =        "133--144",
  month =        mar,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1508284.1508260",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:39:26 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper proposes a cycle accounting architecture for Simultaneous Multithreading (SMT) processors that estimates the execution times for each of the threads had they been executed alone, while they are running simultaneously on the SMT processor. This is done by accounting each cycle to either a base, miss event or waiting cycle component during multi-threaded execution. Single-threaded alone execution time is then estimated as the sum of the base and miss event components; the waiting cycle component represents the lost cycle count due to SMT execution. The cycle accounting architecture incurs reasonable hardware cost (around 1KB of storage) and estimates single-threaded performance with average prediction errors around 7.2\% for two-program workloads and 11.7\% for four-program workloads.\par The cycle accounting architecture has several important applications to system software and its interaction with SMT hardware.
For one, the estimated single-thread alone execution time provides an accurate picture to system software of the actually consumed processor cycles per thread. The alone execution time instead of the total execution time (timeslice) may make system software scheduling policies more effective. Second, a new class of thread-progress aware SMT fetch policies based on per-thread progress indicators enable system software level priorities to be enforced at the hardware level.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "cycle accounting; simultaneous multithreading (SMT); thread-progress aware fetch policy",
}

@Article{Fung:2009:DWF,
  author =       "Wilson W. L. Fung and Ivan Sham and George Yuan and Tor M. Aamodt",
  title =        "Dynamic warp formation: {Efficient MIMD} control flow on {SIMD} graphics hardware",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1543753.1543756",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Recent advances in graphics processing units (GPUs) have resulted in massively parallel hardware that is easily programmable and widely available in today's desktop and notebook computer systems. GPUs typically use single-instruction, multiple-data (SIMD) pipelines to achieve high performance with minimal overhead for control hardware. Scalar threads running the same computing kernel are grouped together into SIMD batches, sometimes referred to as warps. While SIMD is ideally suited for simple programs, recent GPUs include control flow instructions in the GPU instruction set architecture and programs using these instructions may experience reduced performance due to the way branch execution is supported in hardware.
One solution is to add a stack to allow different SIMD processing elements to execute distinct program paths after a branch instruction. The occurrence of diverging branch outcomes for different processing elements significantly degrades performance using this approach. In this article, we propose dynamic warp formation and scheduling, a mechanism for more efficient SIMD branch execution on GPUs. It dynamically regroups threads into new warps on the fly following the occurrence of diverging branch outcomes. We show that a realistic hardware implementation of this mechanism improves performance by 13\%, on average, with 256 threads per core, 24\% with 512 threads, and 47\% with 768 threads for an estimated area increase of 8\%.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Architecture and Code Optimization (TACO)",
  keywords =     "control flow; fine-grained multithreading; GPU; SIMD",
}

@Article{Gabor:2009:SLA,
  author =       "Ron Gabor and Avi Mendelson and Shlomo Weiss",
  title =        "Service level agreement for multithreaded processors",
  journal =      j-TACO,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1543753.1543755",
  ISSN =         "1544-3566 (print), 1544-3973 (electronic)",
  ISSN-L =       "1544-3566",
  bibdate =      "Thu Jul 2 12:32:04 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multithreading is widely used to increase processor throughput. As the number of shared resources increase, managing them while guaranteeing predicted performance becomes a major problem. Attempts have been made in previous work to ease this via different fairness mechanisms.
In this article, we present a new approach to control the resource allocation and sharing via a service level agreement (SLA)-based mechanism; that is, via an agreement in which multithreaded processors guarantee a minimal level of service to the running threads. We introduce a new metric, {\em C\/}$_{SLA}$, for conformance to SLA in multithreaded processors and show that controlling resources using with SLA allows for higher gains than are achievable by previously suggested fairness techniques. It also permits improving one metric (e.g., power) while maintaining SLA in another (e.g., performance). We compare SLA enforcement to schemes based on other fairness metrics, which are mostly targeted at equalizing execution parameters. We show that using SLA rather than fairness based algorithms provides a range of acceptable execution points from which we can select the point that best fits our optimization target, such as maximizing the weighted speedup (sum of the speedups of the individual threads) or reducing power. We demonstrate the effectiveness of the new SLA approach using switch-on-event (coarse-grained) multithreading. Our weighted speedup improvement scheme successfully enforces SLA while improving the weighted speedup by an average of 10\% for unbalanced threads. This result is significant when compared with performance losses that may be incurred by fairness enforcement methods. When optimizing for power reduction in unbalanced threads SLA enforcement reduces the power by an average of 15\%. SLA may be complemented by other power reduction methods to achieve further power savings {\em and\/} maintain the same service level for the threads.
We also demonstrate differentiated SLA, where weighted speedup is maximized while each thread may have a different throughput constraint.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Architecture and Code Optimization (TACO)",
  keywords =     "fairness; performance; power; Service level agreement; throughput",
}

@Article{Ganty:2009:VLA,
  author =       "Pierre Ganty and Rupak Majumdar and Andrey Rybalchenko",
  title =        "Verifying liveness for asynchronous programs",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "1",
  pages =        "102--113",
  month =        jan,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1594834.1480895",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:38 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Asynchronous or `event-driven' programming is a popular technique to efficiently and flexibly manage concurrent interactions. In these programs, the programmer can post tasks that get stored in a task buffer and get executed atomically by a non-preemptive scheduler at a future point. We give a decision procedure for the fair termination property of asynchronous programs. The fair termination problem asks, given an asynchronous program and a fairness condition on its executions, does the program always terminate on fair executions? The fairness assumptions rule out certain undesired bad behaviors, such as where the scheduler ignores a set of posted tasks forever, or where a non-deterministic branch is always chosen in one direction. Since every liveness property reduces to a fair termination property, our decision procedure extends to liveness properties of asynchronous programs. Our decision procedure for the fair termination of asynchronous programs assumes all variables are finite-state.
Even though variables are finite-state, asynchronous programs can have an unbounded stack from recursive calls made by tasks, as well as an unbounded task buffer of pending tasks. We show a reduction from the fair termination problem for asynchronous programs to fair termination problems on Petri Nets, and our main technical result is a reduction of the latter problem to Presburger satisfiability. Our decidability result is in contrast to multithreaded recursive programs, for which liveness properties are undecidable. While we focus on fair termination, we show our reduction to Petri Nets can be used to prove related properties such as fair nonstarvation (every posted task is eventually executed) and safety properties such as boundedness (find a bound on the maximum number of posted tasks that can be in the task buffer at any point).",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "asynchronous (event-driven) programming; fair termination; liveness; Petri nets",
}

@TechReport{Granat:2009:NPQ,
  author =       "Robert Granat and Bo K{\aa}gstr{\"o}m and Daniel Kressner",
  title =        "A novel parallel {$QR$} algorithm for hybrid distributed memory {HPC} systems",
  type =         "LAPACK Working Note",
  number =       "216",
  institution =  "Department of Computing Science and HPC2N, Ume{\aa} University",
  address =      "S-901 Ume{\aa}, Sweden",
  month =        apr,
  year =         "2009",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn216.pdf",
  abstract =     "A novel variant of the parallel QR algorithm for solving dense nonsymmetric eigenvalue problems on hybrid distributed high performance computing (HPC) systems is presented. For this purpose, we introduce the concept of multi-window bulge chain chasing and parallelize aggressive early deflation.
The multi-window approach ensures that most computations when chasing chains of bulges are performed in level 3 BLAS operations, while the aim of aggressive early deflation is to speed up the convergence of the QR algorithm. Mixed MPI-OpenMP coding techniques are utilized for porting the codes to distributed memory platforms with multithreaded nodes, such as multicore processors. Numerous numerical experiments confirm the superior performance of our parallel QR algorithm in comparison with the existing ScaLAPACK code, leading to an implementation that is one to two orders of magnitude faster for sufficiently large problems, including a number of examples from applications.",
  acknowledgement = ack-nhfb,
  keywords =     "aggressive early deflation; bulge chasing; Eigenvalue problem; hybrid distributed memory systems; level 3 performance; multishift; nonsymmetric QR algorithm; parallel algorithms; parallel computations",
  utknumber =    "UMINF-09.06",
}

@Article{Grant:2009:IEE,
  author =       "Ryan E. Grant and Ahmad Afsahi",
  title =        "Improving energy efficiency of asymmetric chip multithreaded multiprocessors through reduced {OS} noise scheduling",
  journal =      j-CCPE,
  volume =       "21",
  number =       "18",
  pages =        "2355--2376",
  day =          "25",
  month =        dec,
  year =         "2009",
  CODEN =        "CCPEBO",
  DOI =          "http://dx.doi.org/10.1002/cpe.1454",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  ISSN-L =       "1532-0626",
  bibdate =      "Mon Dec 5 10:08:40 MST 2011",
  bibsource =    "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/ccpe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "Concurrency and Computation: Prac\-tice and Experience",
  onlinedate =   "8 Jul 2009",
}

@Article{Hoffman:2009:SAT,
  author =       "Kevin J.
Hoffman and Patrick Eugster and Suresh Jagannathan",
  title =        "Semantics-aware trace analysis",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "6",
  pages =        "453--464",
  month =        jun,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1542476.1542527",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:41:16 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "As computer systems continue to become more powerful and complex, so do programs. High-level abstractions introduced to deal with complexity in large programs, while simplifying human reasoning, can often obfuscate salient program properties gleaned from automated source-level analysis through subtle (often non-local) interactions. Consequently, understanding the effects of program changes and whether these changes violate intended protocols become difficult to infer. Refactorings, and feature additions, modifications, or removals can introduce hard-to-catch bugs that often go undetected until many revisions later.\par To address these issues, this paper presents a novel dynamic program analysis that builds a {\em semantic view\/} of program executions. These views reflect program abstractions and aspects; however, views are not simply projections of execution traces, but are linked to each other to capture semantic interactions among abstractions at different levels of granularity in a scalable manner.\par We describe our approach in the context of Java and demonstrate its utility to improve {\em regression analysis}. We first formalize a subset of Java and a grammar for traces generated at program execution. We then introduce several types of views used to analyze regression bugs along with a novel, scalable technique for semantic differencing of traces from different versions of the same program.
Benchmark results on large open-source Java programs demonstrate that semantic-aware trace differencing can identify precise and useful details about the underlying cause for a regression, even in programs that use reflection, multithreading, or dynamic code generation, features that typically confound other analysis techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "automated debugging; debugging aids; semantic tracing; testing tools; trace views; tracing",
}

@Article{Joshi:2009:RDP,
  author =       "Pallavi Joshi and Chang-Seo Park and Koushik Sen and Mayur Naik",
  title =        "A randomized dynamic program analysis technique for detecting real deadlocks",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "6",
  pages =        "110--120",
  month =        jun,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1543135.1542489",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:41:16 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We present a novel dynamic analysis technique that finds real deadlocks in multi-threaded programs. Our technique runs in two stages. In the first stage, we use an imprecise dynamic analysis technique to find potential deadlocks in a multi-threaded program by observing an execution of the program. In the second stage, we control a random thread scheduler to create the potential deadlocks with high probability. Unlike other dynamic analysis techniques, our approach has the advantage that it does not give any false warnings. We have implemented the technique in a prototype tool for Java, and have experimented on a number of large multi-threaded Java programs.
We report a number of previously known and unknown real deadlocks that were found in these benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "active testing; concurrency; deadlock detection; dynamic program analysis",
}

@Article{Kejariwal:2009:ELL,
  author =       "Arun Kejariwal and Alexander V. Veidenbaum and Alexandru Nicolau and Milind Girkar and Xinmin Tian and Hideki Saito",
  title =        "On the exploitation of loop-level parallelism in embedded applications",
  journal =      j-TECS,
  volume =       "8",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "http://doi.acm.org/10.1145/1457255.1457257",
  ISSN =         "1539-9087",
  ISSN-L =       "1539-9087",
  bibdate =      "Thu Feb 5 19:15:05 MST 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Advances in the silicon technology have enabled increasing support for hardware parallelism in embedded processors. Vector units, multiple processors/cores, multithreading, special-purpose accelerators such as DSPs or cryptographic engines, or a combination of the above have appeared in a number of processors. They serve to address the increasing performance requirements of modern embedded applications. To what extent the available hardware parallelism can be exploited is directly dependent on the amount of parallelism inherent in the given application and the congruence between the granularity of hardware and application parallelism. This paper discusses how loop-level parallelism in embedded applications can be exploited in hardware and software. Specifically, it evaluates the efficacy of automatic loop parallelization and the performance potential of different types of parallelism, viz., true thread-level parallelism (TLP), speculative thread-level parallelism and vector parallelism, when executing loops. Additionally, it discusses the interaction between parallelization and vectorization.
Applications from both the industry-standard EEMBC{\reg},$^1$ 1.1, EEMBC 2.0 and the academic MiBench embedded benchmark suites are analyzed using the Intel{\reg}$^2$ C compiler. The results show the performance that can be achieved today on real hardware and using a production compiler, provide upper bounds on the performance potential of the different types of thread-level parallelism, and point out a number of issues that need to be addressed to improve performance. The latter include parallelization of libraries such as libc and design of parallel algorithms to allow maximal exploitation of parallelism. The results also point to the need for developing new benchmark suites more suitable to parallel compilation and execution.\par $^1$ Other names and brands may be claimed as the property of others.\par $^2$ Intel is a trademark of Intel Corporation or its subsidiaries in the United States and other countries.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Embedded Computing Systems",
  keywords =     "libraries; Multi-cores; multithreading; parallel loops; programming models; system-on-chip (SoC); thread-level speculation; vectorization",
}

@Article{Kejariwal:2009:PSA,
  author =       "Arun Kejariwal and Calin Cas{\c{c}}aval",
  title =        "Parallelization spectroscopy: analysis of thread-level parallelism in {HPC} programs",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "4",
  pages =        "293--294",
  month =        apr,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1594835.1504221",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Oct 9 08:40:49 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "In this paper, we present a method --- parallelization spectroscopy --- for analyzing the thread-level parallelism available in production High Performance Computing (HPC) codes.
We survey a number of techniques that are commonly used for parallelization and classify all the loops in the case study presented using a sensitivity metric: how likely is a particular technique is successful in parallelizing the loop.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "loop transformations; multithreading; parallelism",
}

@Article{Kunal:2009:HDS,
  author =       "K. Kunal and K. George and M. Gautam and V. Kamakoti",
  title =        "{HTM} design spaces: complete decoupling from caches and achieving highly concurrent transactions",
  journal =      j-OPER-SYS-REV,
  volume =       "43",
  number =       "2",
  pages =        "98--99",
  month =        apr,
  year =         "2009",
  CODEN =        "OSRED8",
  DOI =          "http://doi.acm.org/10.1145/1531793.1531809",
  ISSN =         "0163-5980",
  ISSN-L =       "0163-5980",
  bibdate =      "Thu Apr 23 19:43:22 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper proposes a Hardware Transactional Memory (HTM) design for multi-core environments. Using a novel technique to keep track of transactional read-write entries, the design provides a holistic and scalable solution to Transactional Memory (TM) implementation issues of context switching, process migration and overflow handling. Another aspect of the design is that it allows transactions to run in a highly concurrent manner by using special techniques to handle conflict resolution, conflict detection and overflows. The feasibility and validity of the proposed design are demonstrated by developing a synthesizable Hardware Description Language (HDL) model of the design and also experimenting on the same with standard benchmarks.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGOPS Operating Systems Review",
  keywords =     "context switching; hardware transactional memory; multi-threaded cores; operating systems; overflow handling; process migration",
}

@TechReport{Kurzak:2009:SLA,
  author =       "Jakub Kurzak and Hatem Ltaief and Jack Dongarra and Rosa M.
Badia",
  title =        "Scheduling Linear Algebra Operations on Multicore Processors",
  type =         "LAPACK Working Note",
  number =       "213",
  institution =  inst-UT-CS,
  address =      inst-UT-CS:adr,
  month =        feb,
  year =         "2009",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "ftp://ftp.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn213.pdf",
  abstract =     "We present performance results for dense linear algebra using the 8-series NVIDIA GPUs. Our matrix-matrix multiply routine (GEMM) runs 60\% faster than the vendor implementation in CUBLAS 1.1 and approaches the peak of hardware capabilities. Our LU, QR and Cholesky factorizations achieve up to 80--90\% of the peak GEMM rate. Our parallel LU running on two GPUs achieves up to $\approx$300 Gflop/s. These results are accomplished by challenging the accepted view of the GPU architecture and programming guidelines. We argue that modern GPUs should be viewed as multithreaded multicore vector units. We exploit blocking similarly to vector computers and heterogeneity of the system by computing both on GPU and CPU. This study includes detailed benchmarking of the GPU memory system that reveals sizes and latencies of caches and TLB. We present a couple of algorithmic optimizations aimed at increasing parallelism and regularity in the problem that provide us with slightly higher performance.",
  acknowledgement = ack-nhfb,
  keywords =     "Cholesky; factorization; linear algebra; LU; multicore; QR; scheduling; task graph",
  utknumber =    "UT-CS-09-636",
}

%%% NOTE(review): the abstract of the Kurzak:2009:SLA entry above describes
%%% dense linear algebra on NVIDIA GPUs (GEMM, CUBLAS), which matches neither
%%% the entry's title nor its keywords (scheduling; task graph; multicore).
%%% It looks like it was copied from a different LAPACK Working Note ---
%%% TODO confirm against the lawn213.pdf URL and replace.

@Article{Lenharth:2009:RDO,
  author =       "Andrew Lenharth and Vikram S. Adve and Samuel T.
King",
  title =        "Recovery domains: an organizing principle for recoverable operating systems",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "3",
  pages =        "49--60",
  month =        mar,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1508284.1508251",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:39:26 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "We describe a strategy for enabling existing commodity operating systems to recover from unexpected run-time errors in nearly any part of the kernel, including core kernel components. Our approach is dynamic and request-oriented; it isolates the effects of a fault to the requests that caused the fault rather than to static kernel components. This approach is based on a notion of `recovery domains,' an organizing principle to enable rollback of state affected by a request in a multithreaded system with minimal impact on other requests or threads. We have applied this approach on v2.4.22 and v2.6.27 of the Linux kernel and it required 132 lines of changed or new code: the other changes are all performed by a simple instrumentation pass of a compiler. Our experiments show that the approach is able to recover from otherwise fatal faults with minimal collateral impact during a recovery event.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "akeso; automatic fault recovery; recovery domains",
}

@Article{Madriles:2009:BST,
  author =       "Carlos Madriles and Pedro L{\'o}pez and Josep M.
Codina and Enric Gibert and Fernando Latorre and Alejandro Mart{\'\i}nez and Ra{\'u}l Mart{\'\i}nez and Antonio Gonz{\'a}lez",
  title =        "Boosting single-thread performance in multi-core systems through fine-grain multi-threading",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "3",
  pages =        "474--483",
  month =        jun,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "http://doi.acm.org/10.1145/1555754.1555813",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:55 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Industry has shifted towards multi-core designs as we have hit the memory and power walls. However, single thread performance remains of paramount importance since some applications have limited thread-level parallelism (TLP), and even a small part with limited TLP impose important constraints to the global performance, as explained by Amdahl's law.\par In this paper we propose a novel approach for leveraging multiple cores to improve single-thread performance in a multi-core design. The proposed technique features a set of novel hardware mechanisms that support the execution of threads generated at compile time. These threads result from a fine-grain speculative decomposition of the original application and they are executed under a modified multi-core system that includes: (1) mechanisms to support multiple versions; (2) mechanisms to detect violations among threads; (3) mechanisms to reconstruct the original sequential order; and (4) mechanisms to checkpoint the architectural state and recovery to handle misspeculations.\par The proposed scheme outperforms previous hardware-only schemes to implement the idea of combining cores for executing single-thread applications in a multi-core design by more than 10\% on average on Spec2006 for all configurations.
Moreover, single-thread performance is improved by 41\% on average when the proposed scheme is used on a Tiny Core, and up to 2.6x for some selected applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
  keywords =     "automatic parallelization; core-fusion; multicore; single-thread performance; speculative multithreading; thread-level parallelism",
}

@Article{Marino:2009:LES,
  author =       "Daniel Marino and Madanlal Musuvathi and Satish Narayanasamy",
  title =        "{LiteRace}: effective sampling for lightweight data-race detection",
  journal =      j-SIGPLAN,
  volume =       "44",
  number =       "6",
  pages =        "134--143",
  month =        jun,
  year =         "2009",
  CODEN =        "SINODQ",
  DOI =          "http://doi.acm.org/10.1145/1542476.1542491",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Tue Jun 16 14:41:16 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Data races are one of the most common and subtle causes of pernicious concurrency bugs. Static techniques for preventing data races are overly conservative and do not scale well to large programs. Past research has produced several dynamic data race detectors that can be applied to large programs. They are precise in the sense that they only report actual data races. However, dynamic data race detectors incur a high performance overhead, slowing down a program's execution by an order of magnitude.\par In this paper we present LiteRace, a very lightweight data race detector that samples and analyzes only selected portions of a program's execution. We show that it is possible to sample a multithreaded program at a low frequency, and yet, find infrequently occurring data races. We implemented LiteRace using Microsoft's Phoenix compiler.
Our experiments with several Microsoft programs, Apache, and Firefox show that LiteRace is able to find more than 70\% of data races by sampling less than 2\% of memory accesses in a given program execution.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGPLAN Notices",
  keywords =     "concurrency bugs; dynamic data race detection; sampling",
}

@Article{Monchiero:2009:HSC,
  author =       "Matteo Monchiero and Jung Ho Ahn and Ayose Falc{\'o}n and Daniel Ortega and Paolo Faraboschi",
  title =        "How to simulate 1000 cores",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "2",
  pages =        "10--19",
  month =        may,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "http://doi.acm.org/10.1145/1577129.1577133",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Tue Aug 11 18:12:39 MDT 2009",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper proposes a novel methodology to efficiently simulate shared-memory multiprocessors composed of hundreds of cores. The basic idea is to use thread-level parallelism in the software system and translate it into core-level parallelism in the simulated world. To achieve this, we first augment an existing full-system simulator to identify and separate the instruction streams belonging to the different software threads. Then, the simulator dynamically maps each instruction flow to the corresponding core of the target multi-core architecture, taking into account the inherent thread synchronization of the running applications. Our simulator allows a user to execute any multithreaded application in a conventional full-system simulator and evaluate the performance of the application on a many-core hardware. We carried out extensive simulations on the SPLASH-2 benchmark suite and demonstrated the scalability up to 1024 cores with limited simulation speed degradation vs. the single-core case on a fixed workload.
The results also show that the proposed technique captures the intrinsic behavior of the SPLASH-2 suite, even when we scale up the number of shared-memory cores beyond the thousand-core limit.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM SIGARCH Computer Architecture News",
}

@Article{Musoll:2009:LSO,
  author =       "Enric Musoll",
  title =        "Leakage-saving opportunities in mesh-based massive multi-core architectures",
  journal =      j-COMP-ARCH-NEWS,
  volume =       "37",
  number =       "5",
  pages =        "1--7",
  month =        dec,
  year =         "2009",
  CODEN =        "CANED2",
  DOI =          "http://doi.acm.org/10.1145/1755235.1755237",
  ISSN =         "0163-5964 (print), 1943-5851 (electronic)",
  ISSN-L =       "0163-5964",
  bibdate =      "Thu Apr 8 18:42:25 MDT 2010",
  bibsource =    "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "When processing multi-threaded workloads requiring significant inter-thread communication, opportunities to reduce power consumption arise due to the large latencies in obtaining data from the threads running on remote cores and the lack of architectural resources implemented in the simple cores to cover these latencies.\par In this work we propose to use the drowsy mode technique to save leakage power on the cores and leverage the mesh-based communication fabric to hide the wake-up latency of the core blocks.
We have observed a potential for reducing the overall power of around 70\% in a generic homogeneous 256-core tile-based multi-core architecture.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Neamtiu:2009:STU, author = "Iulian Neamtiu and Michael Hicks", title = "Safe and timely updates to multi-threaded programs", journal = j-SIGPLAN, volume = "44", number = "6", pages = "13--24", month = jun, year = "2009", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1543135.1542479", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:41:16 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Many dynamic updating systems have been developed that enable a program to be patched while it runs, to fix bugs or add new features. This paper explores techniques for supporting dynamic updates to multi-threaded programs, focusing on the problem of applying an update in a timely fashion while still producing correct behavior. Past work has shown that this tension of {\em safety\/} versus timeliness can be balanced for single-threaded programs. For multi-threaded programs, the task is more difficult because myriad thread interactions complicate understanding the possible program states to which a patch could be applied. Our approach allows the programmer to specify a few program points (e.g., one per thread) at which a patch may be applied, which simplifies reasoning about safety. To improve timeliness, a combination of static analysis and run-time support automatically expands these few points to many more that produce behavior equivalent to the originals. 
Experiments with thirteen realistic updates to three multi-threaded servers show that we can safely perform a dynamic update within milliseconds when more straightforward alternatives would delay some updates indefinitely.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "dynamic software updating; multi-threading; update safety; update timeliness", } @Article{Nicolau:2009:TEP, author = "Alexandru Nicolau and Guangqiang Li and Arun Kejariwal", title = "Techniques for efficient placement of synchronization primitives", journal = j-SIGPLAN, volume = "44", number = "4", pages = "199--208", month = apr, year = "2009", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1504176.1504207", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:49 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Harnessing the hardware parallelism of the emerging multi-cores systems necessitates concurrent software. Unfortunately, most of the existing mainstream software is sequential in nature. Although one could auto-parallelize a given program, the efficacy of this is largely limited to floating-point codes. One of the ways to alleviate the above limitation is to parallelize programs, which cannot be auto-parallelized, via explicit synchronization. In this regard, efficient placement of the synchronization primitives --- say, post, wait --- plays a key role in achieving high degree of thread-level parallelism ({\em TLP\/}). In this paper, we propose novel compiler techniques for the above. Specifically, given a control flow graph ({\em CFG\/}), the proposed techniques place a post as early as possible and place a wait as late as possible in the CFG, subject to dependences. 
We demonstrate the efficacy of our techniques, on a real machine, using real codes, specifically, from the industry-standard SPEC CPU benchmarks, the Linux kernel and other widely used open source codes. Our results show that the proposed techniques yield significantly higher levels of TLP than the state-of-the-art.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "compilers; multithreading; parallelization; performance", } @Article{Olszewski:2009:KED, author = "Marek Olszewski and Jason Ansel and Saman Amarasinghe", title = "{Kendo}: efficient deterministic multithreading in software", journal = j-SIGPLAN, volume = "44", number = "3", pages = "97--108", month = mar, year = "2009", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1508244.1508256", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:39:26 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Although chip-multiprocessors have become the industry standard, developing parallel applications that target them remains a daunting task. Non-determinism, inherent in threaded applications, causes significant challenges for parallel programmers by hindering their ability to create parallel applications with repeatable results. As a consequence, parallel applications are significantly harder to debug, test, and maintain than sequential programs.\par This paper introduces Kendo: a new software-only system that provides deterministic multithreading of parallel applications. Kendo enforces a deterministic interleaving of lock acquisitions and specially declared non-protected reads through a novel dynamically load-balanced deterministic scheduling algorithm. 
The algorithm tracks the progress of each thread using performance counters to construct a deterministic logical time that is used to compute an interleaving of shared data accesses that is both deterministic and provides good load balancing. Kendo can run on today's commodity hardware while incurring only a modest performance cost. Experimental results on the SPLASH-2 applications yield a geometric mean overhead of only 16\% when running on 4 processors. This low overhead makes it possible to benefit from Kendo even after an application is deployed. Programmers can start using Kendo today to program parallel applications that are easier to develop, debug, and test.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "debugging; determinism; deterministic multithreading; multicore; parallel programming", } @Article{Pichel:2009:IDR, author = "J. C. Pichel and D. B. Heras and J. C. Cabaleiro and F. F. Rivera", title = "Increasing data reuse of sparse algebra codes on simultaneous multithreading architectures", journal = j-CCPE, volume = "21", number = "15", pages = "1838--1856", month = oct, year = "2009", CODEN = "CCPEBO", DOI = "http://dx.doi.org/10.1002/cpe.1404", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:38 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/ccpe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", onlinedate = "11 Feb 2009", } @Article{Quintana-Orti:2009:PMA, author = "Gregorio Quintana-Ort{\'\i} and Enrique S. Quintana-Ort{\'\i} and Robert A. {Van De Geijn} and Field G. 
{Van Zee} and Ernie Chan", title = "Programming matrix algorithms-by-blocks for thread-level parallelism", journal = j-TOMS, volume = "36", number = "3", pages = "14:1--14:26", month = jul, year = "2009", CODEN = "ACMSCU", DOI = "http://doi.acm.org/10.1145/1527286.1527288", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Tue Jul 21 14:09:07 MDT 2009", bibsource = "http://www.acm.org/pubs/contents/journals/toms/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the emergence of thread-level parallelism as the primary means for continued performance improvement, the programmability issue has reemerged as an obstacle to the use of architectural advances. We argue that evolving legacy libraries for dense and banded linear algebra is not a viable solution due to constraints imposed by early design decisions. We propose a philosophy of abstraction and separation of concerns that provides a promising solution in this problem domain. The first abstraction, FLASH, allows algorithms to express computation with matrices consisting of contiguous blocks, facilitating algorithms-by-blocks. Operand descriptions are registered for a particular operation a priori by the library implementor. A runtime system, SuperMatrix, uses this information to identify data dependencies between suboperations, allowing them to be scheduled to threads out-of-order and executed in parallel. But not all classical algorithms in linear algebra lend themselves to conversion to algorithms-by-blocks. We show how our recently proposed LU factorization with incremental pivoting and a closely related algorithm-by-blocks for the QR factorization, both originally designed for out-of-core computation, overcome this difficulty. 
Anecdotal evidence regarding the development of routines with a core functionality demonstrates how the methodology supports high productivity while experimental results suggest that high performance is abundantly achievable.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Mathematical Software", keywords = "high-performance; libraries; Linear algebra; multithreaded architectures", } @Article{Ratanaworabhan:2009:DTA, author = "Paruj Ratanaworabhan and Martin Burtscher and Darko Kirovski and Benjamin Zorn and Rahul Nagpal and Karthik Pattabiraman", title = "Detecting and tolerating asymmetric races", journal = j-SIGPLAN, volume = "44", number = "4", pages = "173--184", month = apr, year = "2009", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1504176.1504202", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:49 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper introduces ToleRace, a runtime system that allows programs to detect and even tolerate asymmetric data races. Asymmetric races are race conditions where one thread correctly acquires and releases a lock for a shared variable while another thread improperly accesses the same variable. ToleRace provides approximate isolation in the critical sections of lock-based parallel programs by creating a local copy of each shared variable when entering a critical section, operating on the local copies, and propagating the appropriate copies upon leaving the critical section. We start by characterizing all possible interleavings that can cause races and precisely describe the effect of ToleRace in each case. Then, we study the theoretical aspects of an oracle that knows exactly what type of interleaving has occurred. 
Finally, we present two software implementations of ToleRace and evaluate them on multithreaded applications from the SPLASH2 and PARSEC suites. Our implementation on top of a dynamic instrumentation tool, which works directly on executables and requires no source code modifications, incurs an overhead of a factor of two on average. Manually adding ToleRace to the source code of these applications results in an average overhead of 6.4 percent.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "dynamic instrumentation; race detection and toleration; runtime support", } @Article{Riccobene:2009:SCB, author = "Elvinia Riccobene and Patrizia Scandurra and Sara Bocchio and Alberto Rosti and Luigi Lavazza and Luigi Mantellini", title = "{SystemC\slash C-based} model-driven design for embedded systems", journal = j-TECS, volume = "8", number = "4", pages = "30:1--30:??", month = jul, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1550987.1550993", ISSN = "1539-9087", ISSN-L = "1539-9087", bibdate = "Thu Jul 23 12:32:49 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This article summarizes our effort, since 2004 up to the present time, for improving the current industrial Systems-on-Chip and Embedded Systems design by joining the capabilities of the unified modeling language (UML) and SystemC/C programming languages to operate at system-level. The proposed approach exploits the OMG model-driven architecture --- a framework for Model-driven Engineering --- capabilities of reducing abstract, coarse-grained and platform-independent system models to fine-grained and platform-specific models. We first defined a design methodology and a development flow for the hardware, based on a SystemC UML profile and encompassing different levels of abstraction. We then included a multithread C UML profile for modelling software applications. 
Both SystemC/C profiles are consistent sets of modelling constructs designed to lift the programming features (both structural and behavioral) of the two coding languages to the UML modeling level. The new codesign flow is supported by an environment, which allows system modeling at higher abstraction levels (from a functional executable level to a register transfer level) and supports automatic code-generation/back-annotation from/to UML models.", acknowledgement = ack-nhfb, articleno = "30", fjournal = "ACM Transactions on Embedded Computing Systems", keywords = "C; ES; MDE; SoC; SystemC; UML", } @Article{Roy:2009:LPF, author = "Indrajit Roy and Donald E. Porter and Michael D. Bond and Kathryn S. McKinley and Emmett Witchel", title = "{Laminar}: practical fine-grained decentralized information flow control", journal = j-SIGPLAN, volume = "44", number = "6", pages = "63--74", month = jun, year = "2009", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1543135.1542484", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:41:16 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Decentralized information flow control (DIFC) is a promising model for writing programs with powerful, end-to-end security guarantees. Current DIFC systems that run on commodity hardware can be broadly categorized into two types: language-level and operating system-level DIFC. Language level solutions provide no guarantees against security violations on system resources, like files and sockets. Operating system solutions can mediate accesses to system resources, but are inefficient at monitoring the flow of information through fine-grained program data structures.\par This paper describes Laminar, the first system to implement decentralized information flow control using a single set of abstractions for OS resources and heap-allocated objects. 
Programmers express security policies by labeling data with secrecy and integrity labels, and then access the labeled data in lexically scoped security regions. Laminar enforces the security policies specified by the labels at runtime. Laminar is implemented using a modified Java virtual machine and a new Linux security module. This paper shows that security regions ease incremental deployment and limit dynamic security checks, allowing us to retrofit DIFC policies on four application case studies. Replacing the applications' ad-hoc security policies changes less than 10\% of the code, and incurs performance overheads from 1\% to 56\%. Whereas prior DIFC systems only support limited types of multithreaded programs, Laminar supports a more general class of multithreaded DIFC programs that can access heterogeneously labeled data.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "information flow control; java virtual machine; operating systems; security region", } @Article{Sidiroglou:2009:AAS, author = "Stelios Sidiroglou and Oren Laadan and Carlos Perez and Nicolas Viennot and Jason Nieh and Angelos D. Keromytis", title = "{ASSURE}: automatic software self-healing using rescue points", journal = j-SIGPLAN, volume = "44", number = "3", pages = "37--48", month = mar, year = "2009", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1508284.1508250", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:39:26 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Software failures in server applications are a significant problem for preserving system availability. We present ASSURE, a system that introduces rescue points that recover software from unknown faults while maintaining both system integrity and availability, by mimicking system behavior under known error conditions. 
Rescue points are locations in existing application code for handling a given set of programmer-anticipated failures, which are automatically repurposed and tested for safely enabling fault recovery from a larger class of (unanticipated) faults. When a fault occurs at an arbitrary location in the program, ASSURE restores execution to an appropriate rescue point and induces the program to recover execution by virtualizing the program's existing error-handling facilities. Rescue points are identified using fuzzing, implemented using a fast coordinated checkpoint-restart mechanism that handles multi-process and multi-threaded applications, and, after testing, are injected into production code using binary patching. We have implemented an ASSURE Linux prototype that operates without application source code and without base operating system kernel changes. Our experimental results on a set of real-world server applications and bugs show that ASSURE enabled recovery for all of the bugs tested with fast recovery times, has modest performance overhead, and provides automatic self-healing orders of magnitude faster than current human-driven patch deployment methods.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "binary patching; checkpoint restart; error recovery; reliable software; software self-healing", } @Article{Son:2009:CDD, author = "Seung Woo Son and Mahmut Kandemir and Mustafa Karakoy and Dhruva Chakrabarti", title = "A compiler-directed data prefetching scheme for chip multiprocessors", journal = j-SIGPLAN, volume = "44", number = "4", pages = "209--218", month = apr, year = "2009", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1504176.1504208", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:49 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Data prefetching has been widely used in the 
past as a technique for hiding memory access latencies. However, data prefetching in multi-threaded applications running on chip multiprocessors (CMPs) can be problematic when multiple cores compete for a shared on-chip cache (L2 or L3). In this paper, we (i) quantify the impact of conventional data prefetching on shared caches in CMPs. The experimental data collected using multi-threaded applications indicates that, while data prefetching improves performance in small number of cores, its benefits reduce significantly as the number of cores is increased, that is, it is not scalable; (ii) identify harmful prefetches as one of the main contributors for degraded performance with a large number of cores; and (iii) propose and evaluate a compiler-directed data prefetching scheme for shared on-chip cache based CMPs. The proposed scheme first identifies program phases using static compiler analysis, and then divides the threads into groups within each phase and assigns a customized prefetcher thread (helper thread) to each group of threads. This helps to reduce the total number of prefetches issued, prefetch overheads, and negative interactions on the shared cache space due to data prefetches, and more importantly, makes compiler-directed prefetching a scalable optimization for CMPs. Our experiments with the applications from the SPEC OMP benchmark suite indicate that the proposed scheme improves overall parallel execution latency by 18.3\% over the no-prefetch case and 6.4\% over the conventional data prefetching scheme (where each core prefetches its data independently), on average, when 12 cores are used. The corresponding average performance improvements with 24 cores are 16.4\% (over the no-prefetch case) and 11.7\% (over the conventional prefetching case). 
We also demonstrate that the proposed scheme is robust under a wide range of values of our major simulation parameters, and the improvements it achieves come very close to those that can be achieved using an optimal scheme.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "chip multiprocessors; compiler; helper thread; prefetching", } @Article{Suleman:2009:ACS, author = "M. Aater Suleman and Onur Mutlu and Moinuddin K. Qureshi and Yale N. Patt", title = "Accelerating critical section execution with asymmetric multi-core architectures", journal = j-SIGPLAN, volume = "44", number = "3", pages = "253--264", month = mar, year = "2009", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1508244.1508274", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Jun 16 14:39:26 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "To improve the performance of a single application on Chip Multiprocessors (CMPs), the application must be split into threads which execute concurrently on multiple cores. In multi-threaded applications, critical sections are used to ensure that only one thread accesses shared data at any given time. Critical sections can serialize the execution of threads, which significantly reduces performance and scalability.\par This paper proposes Accelerated Critical Sections (ACS), a technique that leverages the high-performance core(s) of an Asymmetric Chip Multiprocessor (ACMP) to accelerate the execution of critical sections. In ACS, selected critical sections are executed by a high-performance core, which can execute the critical section faster than the other, smaller cores. As a result, ACS reduces serialization: it lowers the likelihood of threads waiting for a critical section to finish. 
Our evaluation on a set of 12 critical-section-intensive workloads shows that ACS reduces the average execution time by 34\% compared to an equal-area 32T-core symmetric CMP and by 23\% compared to an equal-area ACMP. Moreover, for 7 out of the 12 workloads, ACS improves scalability by increasing the number of threads at which performance saturates.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "cmp; critical sections; heterogeneous cores; locks; multi-core; parallel programming", } @Book{Swinnen:2009:APA, author = "G{\'e}rard Swinnen", title = "Apprendre {\`a} programmer avec {Python}: objet, multithreading, {\'e}v{\'e}nements, bases de donn{\'e}es, programmation web, programmation r{\'e}seau, {Unicode}", publisher = pub-EYROLLES, address = pub-EYROLLES:adr, pages = "xviii + 341", year = "2009", LCCN = "????", bibdate = "Thu Apr 16 12:00:29 MDT 2009", bibsource = "carmin.sudoc.abes.fr:210/ABES-Z39-PUBLIC; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, language = "French", } @Article{Tallent:2009:EPM, author = "Nathan R. Tallent and John M. Mellor-Crummey", title = "Effective performance measurement and analysis of multithreaded applications", journal = j-SIGPLAN, volume = "44", number = "4", pages = "229--240", month = apr, year = "2009", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1504176.1504210", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:49 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Understanding why the performance of a multithreaded program does not improve linearly with the number of cores in a shared-memory node populated with one or more multicore processors is a problem of growing practical importance. This paper makes three contributions to performance analysis of multithreaded programs. 
First, we describe how to measure and attribute {\em parallel idleness}, namely, where threads are stalled and unable to work. This technique applies broadly to programming models ranging from explicit threading ({\em e.g.}, Pthreads) to higher-level models such as Cilk and OpenMP. Second, we describe how to measure and attribute {\em parallel overhead\/} --- when a thread is performing miscellaneous work other than executing the user's computation. By employing a combination of compiler support and post-mortem analysis, we incur no measurement cost beyond normal profiling to glean this information. Using {\em idleness\/} and {\em overhead\/} metrics enables one to pinpoint areas of an application where concurrency should be increased (to reduce idleness), decreased (to reduce overhead), or where the present parallelization is hopeless (where idleness and overhead are both high). Third, we describe how to measure and attribute arbitrary performance metrics for high-level multithreaded programming models, such as Cilk. This requires bridging the gap between the expression of logical concurrency in programs and its realization at run-time as it is adaptively partitioned and scheduled onto a pool of threads. We have prototyped these ideas in the context of Rice University's HPCToolkit performance tools. 
We describe our approach, implementation, and experiences applying this approach to measure and attribute work, idleness, and overhead in executions of Cilk programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "call path profiling; hpctoolkit; multithreaded programming models; performance analysis", } @Article{Thakur:2009:TSE, author = "Rajeev Thakur and William Gropp", title = "Test suite for evaluating performance of multithreaded {MPI} communication", journal = j-PARALLEL-COMPUTING, volume = "35", number = "12", pages = "608--617", month = dec, year = "2009", CODEN = "PACOEJ", ISSN = "0167-8191", ISSN-L = "0167-8191", bibdate = "Thu Sep 2 17:51:11 MDT 2010", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/01678191", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", } @Article{Vera:2009:SRL, author = "Xavier Vera and Jaume Abella and Javier Carretero and Antonio Gonz{\'a}lez", title = "Selective replication: {A} lightweight technique for soft errors", journal = j-TOCS, volume = "27", number = "4", pages = "8:1--8:30", month = dec, year = "2009", CODEN = "ACSYEC", DOI = "http://doi.acm.org/10.1145/1658357.1658359", ISSN = "0734-2071", ISSN-L = "0734-2071", bibdate = "Mon Mar 15 09:06:46 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/tocs/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Soft errors are an important challenge in contemporary microprocessors. Modern processors have caches and large memory arrays protected by parity or error detection and correction codes. However, today's failure rate is dominated by flip flops, latches, and the increasing sensitivity of combinational logic to particle strikes. 
Moreover, as Chip Multi-Processors (CMPs) become ubiquitous, meeting the FIT budget for new designs is becoming a major challenge.\par Solutions based on replicating threads have been explored deeply; however, their high cost in performance and energy make them unsuitable for current designs. Moreover, our studies based on a typical configuration for a modern processor show that focusing on the top 5 most vulnerable structures can provide up to 70\% reduction in FIT rate. Therefore, full replication may overprotect the chip by reducing the FIT much below budget.\par We propose {\em Selective Replication}, a lightweight-reconfigurable mechanism that achieves a high FIT reduction by protecting the most vulnerable instructions with minimal performance and energy impact. Low performance degradation is achieved by not requiring additional issue slots and reissuing instructions only during the time window between when they are retirable and they actually retire. Coverage can be reconfigured online by replicating only a subset of the instructions (the most vulnerable ones). Instructions' vulnerability is estimated based on the area they occupy and the time they spend in the issue queue. 
By changing the vulnerability threshold, we can adjust the trade-off between coverage and performance loss.\par Results for an out-of-order processor configured similarly to Intel{\reg} Core\TM{} Micro-Architecture show that our scheme can achieve over 65\% FIT reduction with less than 4\% performance degradation with small area and complexity overhead.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Computer Systems", keywords = "AVF prediction; FIT reduction; redundant multithreading; Soft errors", } @Article{Wang:2009:TDA, author = "Yin Wang and St{\'e}phane Lafortune and Terence Kelly and Manjunath Kudlur and Scott Mahlke", title = "The theory of deadlock avoidance via discrete control", journal = j-SIGPLAN, volume = "44", number = "1", pages = "252--263", month = jan, year = "2009", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1480881.1480913", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 9 08:40:38 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Deadlock in multithreaded programs is an increasingly important problem as ubiquitous multicore architectures force parallelization upon an ever wider range of software. This paper presents a theoretical foundation for dynamic deadlock avoidance in concurrent programs that employ conventional mutual exclusion and synchronization primitives (e.g., multithreaded C/Pthreads programs). Beginning with control flow graphs extracted from program source code, we construct a formal model of the program and then apply Discrete Control Theory to automatically synthesize deadlock-avoidance control logic that is implemented by program instrumentation. At run time, the control logic avoids deadlocks by postponing lock acquisitions. Discrete Control Theory guarantees that the program instrumented with our synthesized control logic cannot deadlock. 
Our method furthermore guarantees that the control logic is maximally permissive: it postpones lock acquisitions only when necessary to prevent deadlocks, and therefore permits maximal runtime concurrency. Our prototype for C/Pthreads scales to real software including Apache, OpenLDAP, and two kinds of benchmarks, automatically avoiding both injected and naturally occurring deadlocks while imposing modest runtime overheads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "concurrent programming; discrete control theory; dynamic deadlock avoidance; multicore processors; multithreaded programming; parallel programming", } @Article{Yu:2009:CIC, author = "Jie Yu and Satish Narayanasamy", title = "A case for an interleaving constrained shared-memory multi-processor", journal = j-COMP-ARCH-NEWS, volume = "37", number = "3", pages = "325--336", month = jun, year = "2009", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1555815.1555796", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Aug 11 18:12:55 MDT 2009", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Shared-memory multi-threaded programming is inherently more difficult than single-threaded programming. The main source of complexity is that, the threads of an application can interleave in so many different ways. To ensure correctness, a programmer has to test all possible thread interleavings, which, however, is impractical.\par Many rare thread interleavings remain untested in production systems, and they are the root cause for a majority of concurrency bugs. We propose a shared-memory multi-processor design that avoids untested interleavings to improve the correctness of a multi-threaded program. 
Since untested interleavings tend to occur infrequently at runtime, the performance cost of avoiding them is not high.\par We propose to encode the set of tested correct interleavings in a program's binary executable using {\em Predecessor Set (PSet)\/} constraints. These constraints are efficiently enforced at runtime using processor support, which ensures that the runtime follows a tested interleaving. We analyze several bugs in open source applications such as MySQL, Apache, Mozilla, etc., and show that, by enforcing PSet constraints, we can avoid not only data races and atomicity violations, but also other forms of concurrency bugs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "concurrency bugs; multiprocessors; parallel programming; software reliability", } @Article{Ziarek:2009:SWB, author = "Lukasz Ziarek and Suresh Jagannathan and Matthew Fluet and Umut A. Acar", title = "Speculative {$N$}-way barriers (abstract only)", journal = j-SIGPLAN, volume = "44", number = "5", pages = "8--8", month = may, year = "2009", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1629635.1629637", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Mon Jun 21 18:01:41 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Speculative execution is an important technique that has historically been used to extract concurrency from sequential programs. 
While techniques to support speculation work well when computations perform relatively simple actions (e.g., reads and writes to known locations), understanding speculation for multi-threaded programs in which threads may communicate and synchronize through multiple shared references is significantly more challenging, and is the focus of this paper.\par We use as our reference point a simple higher-order concurrent language extended with an n-way barrier and a fork/join execution model. Our technique permits the expression guarded by the barrier to speculatively proceed before the barrier has been satisfied (i.e., before all threads that synchronize on that barrier have done so) and to have participating threads that would normally block on the barrier to speculatively proceed as well. Our solution formulates safety properties under which speculation is correct in a fork/join model, and per-synchronization basis.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Agarwal:2010:DDP, author = "R. Agarwal and S. Bensalem and E. Farchi and K. Havelund and Y. Nir-Buchbinder and S. Stoller and S. Ur and L. Wang", title = "Detection of deadlock potentials in multithreaded programs", journal = j-IBM-JRD, volume = "54", number = "5", pages = "3:1--3:15", month = "????", year = "2010", CODEN = "IBMJAE", DOI = "http://dx.doi.org/10.1147/JRD.2010.2060276", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Sun Feb 20 14:29:19 MST 2011", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", acknowledgement = ack-nhfb, fjournal = "IBM Journal of Research and Development", } @Article{Agrawal:2010:HLF, author = "Kunal Agrawal and Charles E. 
Leiserson and Jim Sukha", title = "Helper locks for fork-join parallel programming", journal = j-SIGPLAN, volume = "45", number = "5", pages = "245--256", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1693453.1693487", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Helper locks allow programs with large parallel critical sections, called parallel regions, to execute more efficiently by enlisting processors that might otherwise be waiting on the helper lock to aid in the execution of the parallel region. Suppose that a processor {\em p\/} is executing a parallel region {\em A\/} after having acquired the lock {\em L\/} protecting {\em A}. If another processor {\em p\/}$'$ tries to acquire {\em L}, then instead of blocking and waiting for {\em p\/} to complete {\em A}, processor {\em p\/}$'$ joins {\em p\/} to help it complete {\em A}. Additional processors not blocked on {\em L\/} may also help to execute {\em A}.\par The HELPER runtime system can execute fork-join computations augmented with helper locks and parallel regions. HELPER supports the unbounded nesting of parallel regions. We provide theoretical completion-time and space-usage bounds for a design of HELPER based on work stealing. Specifically, let {\em V\/} be the number of parallel regions in a computation, let {\em T\/}$_1$ be its work, and let {\em T\/}$_\infty$ be its `aggregate span' --- the sum of the spans (critical-path lengths) of all its parallel regions. We prove that HELPER completes the computation in expected time {\em O\/}({\em T\/}$_1$/{\em P\/} + {\em T\/}$_\infty$ + {\em PV\/}) on {\em P\/} processors. This bound indicates that programs with a small number of highly parallel critical sections can attain linear speedup.
For the space bound, we prove that HELPER completes a program using only {\em O\/}({\em P\/}{\em S\/}$_1$) stack space, where {\em S\/}$_1$ is the sum, over all regions, of the stack space used by each region in a serial execution. Finally, we describe a prototype of HELPER implemented by modifying the Cilk multithreaded runtime system. We used this prototype to implement a concurrent hash table with a resize operation protected by a helper lock.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "Cilk; dynamic multithreading; helper lock; nested parallelism; parallel region; scheduling; work stealing", } @Article{Barthe:2010:SMP, author = "Gilles Barthe and Tamara Rezk and Alejandro Russo and Andrei Sabelfeld", title = "Security of multithreaded programs by compilation", journal = j-TISSEC, volume = "13", number = "3", pages = "21:1--21:??", month = jul, year = "2010", CODEN = "ATISBQ", DOI = "http://doi.acm.org/10.1145/1805974.1805977", ISSN = "1094-9224 (print), 1557-7406 (electronic)", ISSN-L = "1094-9224", bibdate = "Wed Jul 28 14:57:15 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "End-to-End security of mobile code requires that the code neither intentionally nor accidentally propagates sensitive information to an adversary. Although mobile code is commonly multithreaded low-level code, there lack enforcement mechanisms that ensure information security for such programs. The modularity is three-fold: we give modular extensions of sequential semantics, sequential security typing, and sequential security-type preserving compilation that allow us enforcing security for multithreaded programs.
Thanks to the modularity, there are no more restrictions on multithreaded source programs than on sequential ones, and yet we guarantee that their compilations are provably secure for a wide class of schedulers.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "ACM Transactions on Information and System Security", keywords = "compilers; Noninterference; schedulers; type systems", } @Article{Bergan:2010:CCRa, author = "Tom Bergan and Owen Anderson and Joseph Devietti and Luis Ceze and Dan Grossman", title = "{CoreDet}: a compiler and runtime system for deterministic multithreaded execution", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "53--64", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Bergan:2010:CCRb, author = "Tom Bergan and Owen Anderson and Joseph Devietti and Luis Ceze and Dan Grossman", title = "{CoreDet}: a compiler and runtime system for deterministic multithreaded execution", journal = j-SIGPLAN, volume = "45", number = "3", pages = "53--64", month = mar, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1736020.1736029", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The behavior of a multithreaded program does not depend only on its inputs. Scheduling, memory reordering, timing, and low-level hardware effects all introduce nondeterminism in the execution of multithreaded programs. This severely complicates many tasks, including debugging, testing, and automatic replication. 
In this work, we avoid these complications by eliminating their root cause: we develop a compiler and runtime system that runs arbitrary multithreaded C/C++ POSIX Threads programs deterministically.\par A trivial nonperformant approach to providing determinism is simply deterministically serializing execution. Instead, we present a compiler and runtime infrastructure that ensures determinism but resorts to serialization rarely, for handling interthread communication and synchronization. We develop two basic approaches, both of which are largely dynamic with performance improved by some static compiler optimizations. First, an ownership-based approach detects interthread communication via an evolving table that tracks ownership of memory regions by threads. Second, a buffering approach uses versioned memory and employs a deterministic commit protocol to make changes visible to other threads. While buffering has larger single-threaded overhead than ownership, it tends to scale better (serializing less often). A hybrid system sometimes performs and scales better than either approach individually.\par Our implementation is based on the LLVM compiler infrastructure. It needs neither programmer annotations nor special hardware. 
Our empirical evaluation uses the PARSEC and SPLASH2 benchmarks and shows that our approach scales comparably to nondeterministic execution.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "compilers; determinism; multicore; multithreading", } @Article{Bokhari:2010:EPM, author = "Shahid Bokhari and Joel Saltz", title = "Exploring the performance of massively multithreaded architectures", journal = j-CCPE, volume = "22", number = "5", pages = "588--616", day = "10", month = apr, year = "2010", CODEN = "CCPEBO", DOI = "http://dx.doi.org/10.1002/cpe.1484", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:42 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/ccpe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", onlinedate = "1 Sep 2009", } @Article{Bronson:2010:PCB, author = "Nathan G. Bronson and Jared Casper and Hassan Chafi and Kunle Olukotun", title = "A practical concurrent binary search tree", journal = j-SIGPLAN, volume = "45", number = "5", pages = "257--268", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1693453.1693488", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We propose a concurrent relaxed balance AVL tree algorithm that is fast, scales well, and tolerates contention. It is based on optimistic techniques adapted from software transactional memory, but takes advantage of specific knowledge of the algorithm to reduce overheads and avoid unnecessary retries. We extend our algorithm with a fast linearizable clone operation, which can be used for consistent iteration of the tree. 
Experimental evidence shows that our algorithm outperforms a highly tuned concurrent skip list for many access patterns, with an average of 39\% higher single-threaded throughput and 32\% higher multi-threaded throughput over a range of contention levels and operation mixes.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "optimistic concurrency; snapshot isolation", } @Article{Burnim:2010:ACD, author = "Jacob Burnim and Koushik Sen", title = "Asserting and checking determinism for multithreaded programs", journal = j-CACM, volume = "53", number = "6", pages = "97--105", month = jun, year = "2010", CODEN = "CACMA2", DOI = "http://doi.acm.org/10.1145/1743546.1743572", ISSN = "0001-0782 (print), 1557-7317 (electronic)", ISSN-L = "0001-0782", bibdate = "Mon Jun 21 12:34:55 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/cacm/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Communications of the ACM", } @Article{Chen:2010:CCM, author = "Changbo Chen and Marc {Moreno Maza} and Yuzhen Xie", title = "Cache complexity and multicore implementation for univariate real root isolation", journal = j-ACM-COMM-COMP-ALGEBRA, volume = "44", number = "3", pages = "97--98", month = sep, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1940475.1940483", ISSN = "1932-2232 (print), 1932-2240 (electronic)", ISSN-L = "1932-2232", bibdate = "Thu Mar 31 10:24:16 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Isolating the real roots of a univariate polynomial is a driving subject in computer algebra. This problem has been studied under various angles from algebraic algorithms [1, 2, 7] to implementation techniques [3, 5]. Today, multicores are the most popular parallel hardware architectures. Besides, understanding the implications of hierarchical memory on performance software engineering has become essential.
These observations motivate our study. We analyze the cache complexity of the core routine of many real root isolation algorithms namely, the Taylor shift. Then, we present efficient multithreaded implementation on multicores.", acknowledgement = ack-nhfb, fjournal = "ACM Communications in Computer Algebra", issue = "173", } @Article{Chetlur:2010:SWM, author = "M. Chetlur and U. Devi and P. Dutta and P. Gupta and L. Chen and Z. Zhu and S. Kalyanaraman and Y. Lin", title = "A software {WiMAX} medium access control layer using massively multithreaded processors", journal = j-IBM-JRD, volume = "54", number = "1", pages = "??--??", month = "????", year = "2010", CODEN = "IBMJAE", ISSN = "0018-8646 (print), 2151-8556 (electronic)", ISSN-L = "0018-8646", bibdate = "Sat May 1 17:44:14 MDT 2010", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.research.ibm.com/journal/", URL = "http://www.research.ibm.com/journal/abstracts/rd/541/chetlur-dutta.html", acknowledgement = ack-nhfb, articleno = "9", fjournal = "IBM Journal of Research and Development", } @Article{Choi:2010:MDA, author = "Jee W. Choi and Amik Singh and Richard W. Vuduc", title = "Model-driven autotuning of sparse matrix-vector multiply on {GPUs}", journal = j-SIGPLAN, volume = "45", number = "5", pages = "115--126", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1693453.1693471", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present a performance model-driven framework for automated performance tuning (autotuning) of sparse matrix-vector multiply (SpMV) on systems accelerated by graphics processing units (GPU). 
Our study consists of two parts.\par First, we describe several carefully hand-tuned SpMV implementations for GPUs, identifying key GPU-specific performance limitations, enhancements, and tuning opportunities. These implementations, which include variants on classical blocked compressed sparse row (BCSR) and blocked ELLPACK (BELLPACK) storage formats, match or exceed state-of-the-art implementations. For instance, our best BELLPACK implementation achieves up to 29.0 Gflop/s in single-precision and 15.7 Gflop/s in double-precision on the NVIDIA T10P multiprocessor (C1060), enhancing prior state-of-the-art unblocked implementations (Bell and Garland, 2009) by up to $1.8\times$ and $1.5\times$ for single- and double-precision respectively.\par However, achieving this level of performance requires input matrix-dependent parameter tuning. Thus, in the second part of this study, we develop a performance model that can guide tuning. Like prior autotuning models for CPUs (e.g., Im, Yelick, and Vuduc, 2004), this model requires offline measurements and run-time estimation, but more directly models the structure of multithreaded vector processors like GPUs. We show that our model can identify the implementations that achieve within 15\% of those found through exhaustive search.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "GPU; performance modeling; sparse matrix-vector multiplication", } @Article{Coons:2010:GEU, author = "Katherine E.
Coons and Sebastian Burckhardt and Madanlal Musuvathi", title = "{GAMBIT}: effective unit testing for concurrency libraries", journal = j-SIGPLAN, volume = "45", number = "5", pages = "15--24", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1837853.1693458", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "As concurrent programming becomes prevalent, software providers are investing in concurrency libraries to improve programmer productivity. Concurrency libraries improve productivity by hiding error-prone, low-level synchronization from programmers and providing higher-level concurrent abstractions. Testing such libraries is difficult, however, because concurrency failures often manifest only under particular scheduling circumstances. Current best testing practices are often inadequate: heuristic-guided fuzzing is not systematic, systematic schedule enumeration does not find bugs quickly, and stress testing is neither systematic nor fast.\par To address these shortcomings, we propose a prioritized search technique called GAMBIT that combines the speed benefits of heuristic-guided fuzzing with the soundness, progress, and reproducibility guarantees of stateless model checking. GAMBIT combines known techniques such as partial-order reduction and preemption-bounding with a generalized best-first search framework that prioritizes schedules likely to expose bugs. We evaluate GAMBIT's effectiveness on newly released concurrency libraries for Microsoft's .NET framework.
Our experiments show that GAMBIT finds bugs more quickly than prior stateless model checking techniques without compromising coverage guarantees or reproducibility.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "concurrency; model checking; multithreading; partial-order reduction; preemption bound; software testing", } @Article{Ding:2010:PCM, author = "Jason Jianxun Ding and Abdul Waheed and Jingnan Yao and Laxmi N. Bhuyan", title = "Performance characterization of multi-thread and multi-core processors based {XML} application oriented networking systems", journal = j-J-PAR-DIST-COMP, volume = "70", number = "5", pages = "584--597", month = may, year = "2010", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Wed Sep 1 16:27:28 MDT 2010", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", } @Article{Dohi:2010:IPE, author = "Keisuke Dohi and Yuichiro Shibata and Tsuyoshi Hamada and Tomonari Masada and Kiyoshi Oguri and Duncan A. Buell", title = "Implementation of a programming environment with a multithread model for reconfigurable systems", journal = j-COMP-ARCH-NEWS, volume = "38", number = "4", pages = "40--45", month = sep, year = "2010", CODEN = "CANED2", DOI = "http://dx.doi.org/10.1145/1926367.1926375", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Jan 20 14:27:03 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Reconfigurable systems are known to be able to achieve higher performance than traditional microprocessor architecture for many application fields. 
However, in order to extract a full potential of the reconfigurable systems, programmers often have to design and describe the best suited code for their target architecture with specialized knowledge. The aim of this paper is to assist the users of reconfigurable systems by implementing a translator with a multithread model. The experimental results show our translator automatically generates efficient performance-aware code segments including DMA transfer and shift registers for memory access optimization.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Eggers:2010:AL, author = "Susan Eggers", title = "{2010 Athena} lecture", journal = j-SIGPLAN, volume = "45", number = "6", pages = "98--98", month = jun, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1809028.1806608", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Susan Eggers, a Professor of Computer Science and Engineering at the University of Washington, joined her department in 1989. She received a B.A. in 1965 from Connecticut College and a Ph. D. in 1989 from the University of California, Berkeley. Her research interests are in computer architecture and back-end compiler optimization, with an emphasis on experimental performance analysis. With her colleague Hank Levy and their students, she developed the first commercially viable multithreaded architecture, Simultaneous Multithreading, adopted by Intel (as Hyperthreading), IBM, Sun and others. Her current research is in the areas of distributed dataflow machines, FPGAs and chip multiprocessors. 
In 1989 Professor Eggers was awarded an IBM Faculty Development Award, in 1990 an NSF Presidential Young Investigator Award, in 1994 the Microsoft Professorship in Computer Science and Engineering, and in 2009 the ACM-W Athena Lecturer. She is a Fellow of the ACM and IEEE, a Fellow of the AAAS, and a member of the National Academy of Engineering.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "invited talk", } @Article{Eyerman:2010:PJS, author = "Stijn Eyerman and Lieven Eeckhout", title = "Probabilistic job symbiosis modeling for {SMT} processor scheduling", journal = j-SIGPLAN, volume = "45", number = "3", pages = "91--102", month = mar, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1736020.1736033", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Symbiotic job scheduling boosts simultaneous multithreading (SMT) processor performance by co-scheduling jobs that have `compatible' demands on the processor's shared resources. Existing approaches however require a sampling phase, evaluate a limited number of possible co-schedules, use heuristics to gauge symbiosis, are rigid in their optimization target, and do not preserve system-level priorities/shares.\par This paper proposes probabilistic job symbiosis modeling, which predicts whether jobs will create positive or negative symbiosis when co-scheduled without requiring the co-schedule to be evaluated. The model, which uses per-thread cycle stacks computed through a previously proposed cycle accounting architecture, is simple enough to be used in system software. 
Probabilistic job symbiosis modeling provides six key innovations over prior work in symbiotic job scheduling: (i) it does not require a sampling phase, (ii) it readjusts the job co-schedule continuously, (iii) it evaluates a large number of possible co-schedules at very low overhead, (iv) it is not driven by heuristics, (v) it can optimize a performance target of interest (e.g., system throughput or job turnaround time), and (vi) it preserves system-level priorities/shares. These innovations make symbiotic job scheduling both practical and effective.\par Our experimental evaluation, which assumes a realistic scenario in which jobs come and go, reports an average 16\% (and up to 35\%) reduction in job turnaround time compared to the previously proposed SOS (sample, optimize, symbios) approach for a two-thread SMT processor, and an average 19\% (and up to 45\%) reduction in job turnaround time for a four-thread SMT processor.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "performance modeling; simultaneous multi-threading (SMT); symbiotic job scheduling", } @Article{Gibson:2010:FSC, author = "Dan Gibson and David A. Wood", title = "{Forwardflow}: a scalable core for power-constrained {CMPs}", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "14--25", month = jun, year = "2010", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1816038.1815966", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Chip Multiprocessors (CMPs) are now commodity hardware, but commoditization of parallel software remains elusive. In the near term, the current trend of increased core-per-socket count will continue, despite a lack of parallel software to exercise the hardware. 
Future CMPs must deliver thread-level parallelism when software provides threads to run, but must also continue to deliver performance gains for single threads by exploiting instruction-level parallelism and memory-level parallelism. However, power limitations will prevent conventional cores from exploiting both simultaneously.\par This work presents the Forwardflow Architecture, which can scale its execution logic up to run single threads, or down to run multiple threads in a CMP. Forwardflow dynamically builds an explicit internal dataflow representation from a conventional instruction set architecture, using forward dependence pointers to guide instruction wakeup, selection, and issue. Forwardflow's backend is organized into discrete units that can be individually (de-)activated, allowing each core's performance to be scaled by system software at the architectural level.\par On single threads, Forwardflow core scaling yields a mean runtime reduction of 21\% for a 37\% increase in power consumption. 
For multithreaded workloads, a Forwardflow-based CMP allows system software to select the performance point that best matches available power.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "chip multiprocessor (cmp); power; scalable core", } @Article{Illikkal:2010:PQP, author = "Ramesh Illikkal and Vineet Chadha and Andrew Herdrich and Ravi Iyer and Donald Newell", title = "{PIRATE}: {QoS} and performance management in {CMP} architectures", journal = j-SIGMETRICS, volume = "37", number = "4", pages = "3--10", month = mar, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1773394.1773396", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Wed Aug 25 07:35:13 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "As new multi-threaded usage models such as virtualization and consolidation take advantage of multiple cores in CMP architectures, the impact of shared resource contention between VMs and user-level applications introduces Quality of Service (QoS) concerns and challenges. QoS-aware management of these shared platform resources is therefore becoming increasingly important. Various QoS schemes for resource management have been recently proposed, but most of these prior efforts have been focused on controlling individual resource allocation based on priority information passed down from the OS or Hypervisor to system resources. The complexity of this approach increases when multiple levels of resources are associated with an application's performance and power consumption. In this paper we employ simpler rate-based QoS mechanisms which control the execution rate of competing applications. To enable differentiation between simultaneously running applications' performance and power consumption, these rate mechanisms need to dynamically adjust the execution of application.
Our proposed PI-RATE architecture introduces a control-theoretic approach to dynamically adjust the execution rate of each application based on the QoS target and monitored resource utilization. We evaluate three modes of PI-RATE architecture --- cache QoS targets, performance QoS targets and power QoS targets --- to show that the PI-RATE architecture is flexible and effective at enabling QoS in a CMP platform.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", keywords = "clock modulation; frequency scaling; integral controller; proportional", } @Article{Jang:2010:DTE, author = "Byunghyun Jang and Perhaad Mistry and Dana Schaa and Rodrigo Dominguez and David Kaeli", title = "Data transformations enabling loop vectorization on multithreaded data parallel architectures", journal = j-SIGPLAN, volume = "45", number = "5", pages = "353--354", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1837853.1693510", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Loop vectorization, a key feature exploited to obtain high performance on Single Instruction Multiple Data (SIMD) vector architectures, is significantly hindered by irregular memory access patterns in the data stream. This paper describes data transformations that allow us to vectorize loops targeting massively multithreaded data parallel architectures. We present a mathematical model that captures loop-based memory access patterns and computes the most appropriate data transformations in order to enable vectorization. Our experimental results show that the proposed data transformations can significantly increase the number of loops that can be vectorized and enhance the data-level parallelism of applications. 
Our results also show that the overhead associated with our data transformations can be easily amortized as the size of the input data set increases. For the set of high performance benchmark kernels studied, we achieve consistent and significant performance improvements (up to 11.4X) by applying vectorization using our data transformation approach.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "data transformation; GPGPU; loop vectorization", } @Article{Laadan:2010:TLA, author = "Oren Laadan and Nicolas Viennot and Jason Nieh", title = "Transparent, lightweight application execution replay on commodity multiprocessor operating systems", journal = j-SIGMETRICS, volume = "38", number = "1", pages = "155--166", month = jun, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1811039.1811057", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Wed Aug 25 07:35:52 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present Scribe, the first system to provide transparent, low-overhead application record-replay and the ability to go live from replayed execution. Scribe introduces new lightweight operating system mechanisms, rendezvous and sync points, to efficiently record nondeterministic interactions such as related system calls, signals, and shared memory accesses. Rendezvous points make a partial ordering of execution based on system call dependencies sufficient for replay, avoiding the recording overhead of maintaining an exact execution ordering. Sync points convert asynchronous interactions that can occur at arbitrary times into synchronous events that are much easier to record and replay.\par We have implemented Scribe without changing, relinking, or recompiling applications, libraries, or operating system kernels, and without any specialized hardware support such as hardware performance counters. 
It works on commodity Linux operating systems, and commodity multi-core and multiprocessor hardware. Our results show for the first time that an operating system mechanism can correctly and transparently record and replay multi-process and multi-threaded applications on commodity multiprocessors. Scribe recording overhead is less than 2.5\% for server applications including Apache and MySQL, and less than 15\% for desktop applications including Firefox, Acrobat, OpenOffice, parallel kernel compilation, and movie playback.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", keywords = "debugging; fault-tolerance; record-replay; virtualization", } @Article{Lee:2010:REO, author = "Dongyoon Lee and Benjamin Wester and Kaushik Veeraraghavan and Satish Narayanasamy and Peter M. Chen and Jason Flinn", title = "{Respec}: efficient online multiprocessor replay via speculation and external determinism", journal = j-SIGPLAN, volume = "45", number = "3", pages = "77--90", month = mar, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1736020.1736031", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Deterministic replay systems record and reproduce the execution of a hardware or software system. While it is well known how to replay uniprocessor systems, replaying shared memory multiprocessor systems at low overhead on commodity hardware is still an open problem. This paper presents Respec, a new way to support deterministic replay of shared memory multithreaded programs on commodity multiprocessor hardware. 
Respec targets online replay in which the recorded and replayed processes execute concurrently.\par Respec uses two strategies to reduce overhead while still ensuring correctness: speculative logging and externally deterministic replay. Speculative logging optimistically logs less information about shared memory dependencies than is needed to guarantee deterministic replay, then recovers and retries if the replayed process diverges from the recorded process. Externally deterministic replay relaxes the degree to which the two executions must match by requiring only their system output and final program states match. We show that the combination of these two techniques results in low recording and replay overhead for the common case of data-race-free execution intervals and still ensures correct replay for execution intervals that have data races.\par We modified the Linux kernel to implement our techniques. Our software system adds on average about 18\% overhead to the execution time for recording and replaying programs with two threads and 55\% overhead for programs with four threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "external determinism; replay; speculative execution", } @Article{Lin:2010:TAC, author = "Yi-Neng Lin and Ying-Dar Lin and Yuan-Cheng Lai", title = "Thread allocation in {CMP}-based multithreaded network processors", journal = j-PARALLEL-COMPUTING, volume = "36", number = "2--3", pages = "104--116", month = feb # "\slash " # mar, year = "2010", CODEN = "PACOEJ", ISSN = "0167-8191", ISSN-L = "0167-8191", bibdate = "Thu Sep 2 17:51:12 MDT 2010", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/01678191", acknowledgement = ack-nhfb, fjournal = "Parallel Computing", } @Article{Mannarswamy:2010:CAS, author = "Sandya Mannarswamy and Dhruva R. 
Chakrabarti and Kaushik Rajan and Sujoy Saraswati", title = "Compiler aided selective lock assignment for improving the performance of software transactional memory", journal = j-SIGPLAN, volume = "45", number = "5", pages = "37--46", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1693453.1693460", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Atomic sections have been recently introduced as a language construct to improve the programmability of concurrent software. They simplify programming by not requiring the explicit specification of locks for shared data. Typically atomic sections are supported in software either through the use of optimistic concurrency by using transactional memory or through the use of pessimistic concurrency using compiler-assigned locks. As a software transactional memory (STM) system does not take advantage of the specific memory access patterns of an application it often suffers from false conflicts and high validation overheads. On the other hand, the compiler usually ends up assigning coarse grain locks as it relies on whole program points-to analysis which is conservative by nature. This adversely affects performance by limiting concurrency. In order to mitigate the disadvantages associated with STM's lock assignment scheme, we propose a hybrid approach which combines STM's lock assignment with a compiler aided selective lock assignment scheme (referred to as SCLA-STM). 
SCLA-STM overcomes the inefficiencies associated with a purely compile-time lock assignment approach by (i) using the underlying STM for shared variables where only a conservative analysis is possible by the compiler (e.g., in the presence of may-alias points to information) and (ii) being selective about the shared data chosen for the compiler-aided lock assignment. We describe our prototype SCLA-STM scheme implemented in the HP-UX IA-64 C/C++ compiler, using TL2 as our STM implementation. We show that SCLA-STM improves application performance for certain STAMP benchmarks from 1.68\% to 37.13\%.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "compilers; multithreading; parallelization; performance", } @Article{Marino:2010:DSE, author = "Daniel Marino and Abhayendra Singh and Todd Millstein and Madanlal Musuvathi and Satish Narayanasamy", title = "{DRFX}: a simple and efficient memory model for concurrent programming languages", journal = j-SIGPLAN, volume = "45", number = "6", pages = "351--362", month = jun, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1806596.1806636", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The most intuitive memory model for shared-memory multithreaded programming is {\em sequential consistency\/} (SC), but it disallows the use of many compiler and hardware optimizations thereby impacting performance. Data-race-free (DRF) models, such as the proposed C++0x memory model, guarantee SC execution for datarace-free programs. But these models provide no guarantee at all for racy programs, compromising the safety and debuggability of such programs. To address the safety issue, the Java memory model, which is also based on the DRF model, provides a weak semantics for racy executions. 
However, this semantics is subtle and complex, making it difficult for programmers to reason about their programs and for compiler writers to ensure the correctness of compiler optimizations.\par We present the DRFx memory model, which is simple for programmers to understand and use while still supporting many common optimizations. We introduce a {\em memory model (MM) exception\/} which can be signaled to halt execution. If a program executes without throwing this exception, then DRFx guarantees that the execution is SC. If a program throws an MM exception during an execution, then DRFx guarantees that the program has a data race. We observe that SC violations can be detected in hardware through a lightweight form of conflict detection. Furthermore, our model safely allows aggressive compiler and hardware optimizations within compiler-designated program regions. We formalize our memory model, prove several properties about this model, describe a compiler and hardware design suitable for DRFx, and evaluate the performance overhead due to our compiler and hardware requirements.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "data races; memory model exception; memory models; sequential consistency; soft fences", } @Article{McKenney:2010:WGM, author = "Paul E. McKenney and Maged M. Michael and Josh Triplett and Jonathan Walpole", title = "Why the grass may not be greener on the other side: a comparison of locking vs. 
transactional memory", journal = j-OPER-SYS-REV, volume = "44", number = "3", pages = "93--101", month = jul, year = "2010", CODEN = "OSRED8", DOI = "http://doi.acm.org/10.1145/1842733.1842749", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Thu Aug 19 14:21:54 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The advent of multi-core and multi-threaded processor architectures highlights the need to address the well-known shortcomings of the ubiquitous lock-based synchronization mechanisms. To this end, transactional memory has been viewed by many as a promising alternative to locking. This paper therefore presents a constructive critique of locking and transactional memory: their strengths, weaknesses, and opportunities for improvement.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Meng:2010:AOS, author = "Lingchuan Meng and Jeremy Johnson and Franz Franchetti and Yevgen Voronenko and Marc Moreno Maza and Yuzhen Xie", title = "Abstract only: {SPIRAL}-generated modular {FFTs}", journal = j-ACM-COMM-COMP-ALGEBRA, volume = "44", number = "2", pages = "25--26", month = jun, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1838599.1838616", ISSN = "1932-2232 (print), 1932-2240 (electronic)", ISSN-L = "1932-2232", bibdate = "Mon Aug 2 13:47:24 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this poster we present the use of the SPIRAL system (www.spiral.net) to generate code for modular Fast Fourier Transforms (FFTs). SPIRAL is a library generation system that automatically generates platform-tuned implementations of digital signal processing algorithms with an emphasis on fast transforms. 
Currently, SPIRAL can generate highly optimized fixed point and floating-point FFTs for a variety of platforms including vectorization, multi-threaded and distributed memory parallelization. The code produced is competitive with the best available code for these platforms and SPIRAL is used by Intel for its IPP (Intel Performance Primitives) and MKL (Math Kernel Library) libraries.\par The SPIRAL system uses a mathematical framework for representing and deriving algorithms. Algorithms are derived using rewrite rules and additional rules are used to symbolically manipulate algorithms into forms that take advantage of the underlying hardware. A search engine with a feedback loop is used to tune implementations to particular platforms. New transforms are added by introducing new symbols and their definition and new algorithms can be generated by adding new rules.\par We extended SPIRAL to generate algorithms for FFT computation over finite fields. This addition required adding a new data type, several new rules and a new transform (ModDFT) definition. In addition, the unparser (where code is generated) was extended so that it can generate scalar and vectorized code for modular arithmetic. With these enhancements, the SPIRAL machinery can be applied to modular transforms that are of interest to the computer algebra community. This provides a framework for systematically optimizing these transforms, utilizing vector and parallel computation, and for automatically tuning them to different platforms. In this poster we present preliminary results from this exploration.
We show that the code generated by SPIRAL, with improved cache locality and vectorization, is approximately ten times faster than the modular FFT code in the modpn library.", acknowledgement = ack-nhfb, fjournal = "ACM Communications in Computer Algebra", issue = "172", } @Article{Meng:2010:DWS, author = "Jiayuan Meng and David Tarjan and Kevin Skadron", title = "Dynamic warp subdivision for integrated branch and memory divergence tolerance", journal = j-COMP-ARCH-NEWS, volume = "38", number = "3", pages = "235--246", month = jun, year = "2010", CODEN = "CANED2", DOI = "http://doi.acm.org/10.1145/1815961.1815992", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Tue Jul 6 14:11:46 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "SIMD organizations amortize the area and power of fetch, decode, and issue logic across multiple processing units in order to maximize throughput for a given area and power budget. However, throughput is reduced when a set of threads operating in lockstep (a warp) are stalled due to long latency memory accesses. The resulting idle cycles are extremely costly. Multi-threading can hide latencies by interleaving the execution of multiple warps, but deep multi-threading using many warps dramatically increases the cost of the register files (multi-threading depth $\times$ SIMD width), and cache contention can make performance worse. Instead, intra-warp latency hiding should first be exploited. This allows threads that are ready but stalled by SIMD restrictions to use these idle cycles and reduces the need for multi-threading among warps. This paper introduces {\em dynamic warp subdivision\/} (DWS), which allows a single warp to occupy more than one slot in the scheduler without requiring extra register file space. 
Independent scheduling entities allow divergent branch paths to interleave their execution, and allow threads that hit to run ahead. The result is improved latency hiding and memory level parallelism (MLP). We evaluate the technique on a coherent cache hierarchy with private L1 caches and a shared L2 cache. With an area overhead of less than 1\%, experiments with eight data-parallel benchmarks show our technique improves performance on average by 1.7$\times$.", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", keywords = "branch divergence; cache; latency hiding; memory divergence; SIMD; warp", } @Article{Muralidhara:2010:IAS, author = "Sai Prashanth Muralidhara and Mahmut Kandemir and Padma Raghavan", title = "Intra-application shared cache partitioning for multithreaded applications", journal = j-SIGPLAN, volume = "45", number = "5", pages = "329--330", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1837853.1693498", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper, we address the problem of partitioning a shared cache when the executing threads belong to the same application.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "cache; multicore; parallel applications", } @Article{Nakaike:2010:LER, author = "Takuya Nakaike and Maged M. 
Michael", title = "Lock elision for read-only critical sections in {Java}", journal = j-SIGPLAN, volume = "45", number = "6", pages = "269--278", month = jun, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1806596.1806627", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "It is not uncommon in parallel workloads to encounter shared data structures with read-mostly access patterns, where operations that update data are infrequent and most operations are read-only. Typically, data consistency is guaranteed using mutual exclusion or read-write locks. The cost of atomic update of lock variables result in high overheads and high cache coherence traffic under active sharing, thus slowing down single thread performance and limiting scalability.\par In this paper, we present {\em SOLERO (Software Optimistic Lock Elision for Read-Only critical sections)}, a new lock implementation called for optimizing read-only critical sections in Java based on sequential locks. SOLERO is compatible with the conventional lock implementation of Java. However, unlike the conventional implementation, only critical sections that may write data or have side effects need to update lock variables, while read-only critical sections need only read lock variables without writing them. Each writing critical section changes the lock value to a new value. Hence, a read-only critical section is guaranteed to be consistent if the lock is free and its value does not change from the beginning to the end of the read-only critical section.\par Using Java workloads including SPECjbb2005 and the HashMap and TreeMap Java classes, we evaluate the performance impact of applying SOLERO to read-mostly locks. 
Our experimental results show performance improvements across the board, often substantial, in both single thread speed and scalability over the conventional lock implementation (mutual exclusion) and read-write locks. SOLERO improves the performance of SPECjbb2005 by 3-5\% on single and multiple threads. The results using the HashMap and TreeMap benchmarks show that SOLERO outperforms the conventional lock implementation and read-write locks by substantial multiples on multi-threads.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "java; just-in-time compiler; lock; lock elision; monitor; optimization; synchronization", } @Article{Park:2010:ISP, author = "Jung-Wook Park and Hoon-Mo Yang and Gi-Ho Park and Shin-Dug Kim and Charles C. Weems", title = "An instruction-systolic programmable shader architecture for multi-threaded {$3$D} graphics processing", journal = j-J-PAR-DIST-COMP, volume = "70", number = "11", pages = "1110--1118", month = nov, year = "2010", CODEN = "JPDCER", ISSN = "0743-7315 (print), 1096-0848 (electronic)", ISSN-L = "0743-7315", bibdate = "Wed Sep 1 16:27:29 MDT 2010", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/07437315", acknowledgement = ack-nhfb, fjournal = "Journal of Parallel and Distributed Computing", } @Article{Radojkovic:2010:TSB, author = "Petar Radojkovi{\'c} and Vladimir {\v{C}}akarevi{\'c} and Javier Verd{\'u} and Alex Pajuelo and Francisco J. 
Cazorla and Mario Nemirovsky and Mateo Valero", title = "Thread to strand binding of parallel network applications in massive multi-threaded systems", journal = j-SIGPLAN, volume = "45", number = "5", pages = "191--202", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1837853.1693480", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In processors with several levels of hardware resource sharing, like CMPs in which each core is an SMT, the scheduling process becomes more complex than in processors with a single level of resource sharing, such as pure-SMT or pure-CMP processors. Once the operating system selects the set of applications to simultaneously schedule on the processor (workload), each application/thread must be assigned to one of the hardware contexts (strands). We call this last scheduling step the Thread to Strand Binding or TSB. In this paper, we show that the TSB impact on the performance of processors with several levels of shared resources is high. We measure a variation of up to 59\% between different TSBs of real multithreaded network applications running on the UltraSPARC T2 processor which has three levels of resource sharing. In our view, this problem is going to be more acute in future multithreaded architectures comprising more cores, more contexts per core, and more levels of resource sharing.\par We propose a resource-sharing aware TSB algorithm (TSBSched) that significantly facilitates the problem of thread to strand binding for software-pipelined applications, representative of multithreaded network applications. Our systematic approach encapsulates both, the characteristics of multithreaded processors under the study and the structure of the software pipelined applications.
Once calibrated for a given processor architecture, our proposal does not require hardware knowledge on the side of the programmer, nor extensive profiling of the application. We validate our algorithm on the UltraSPARC T2 processor running a set of real multithreaded network applications on which we report improvements of up to 46\% compared to the current state-of-the-art dynamic schedulers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "CMT; process scheduling; simultaneous multithreading; UltraSPARC T2", } @Article{Rakvic:2010:TMT, author = "R. Rakvic and Q. Cai and J. Gonz{\'a}lez and G. Magklis and P. Chaparro and A. Gonz{\'a}lez", title = "Thread-management techniques to maximize efficiency in multicore and simultaneous multithreaded microprocessors", journal = j-TACO, volume = "7", number = "2", pages = "9:1--9:??", month = sep, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1839667.1839671", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 2 18:05:46 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We provide an analysis of thread-management techniques that increase performance or reduce energy in multicore and Simultaneous Multithreaded (SMT) cores. Thread delaying reduces energy consumption by running the core containing the critical thread at maximum frequency while scaling down the frequency and voltage of the cores containing noncritical threads. In this article, we provide an insightful breakdown of thread delaying on a simulated multi-core microprocessor. Thread balancing improves overall performance by giving higher priority to the critical thread in the issue queue of an SMT core. We provide a detailed breakdown of performance results for thread-balancing, identifying performance benefits and limitations. 
For those benchmarks where a performance benefit is not possible, we introduce a novel thread-balancing mechanism on an SMT core that can reduce energy consumption. We have performed a detailed study on an Intel microprocessor simulator running parallel applications. Thread delaying can reduce energy consumption by 4\% to 44\% with negligible performance loss. Thread balancing can increase performance by 20\% or can reduce energy consumption by 23\%.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", keywords = "critical threads; energy-aware; low-power; Meeting point thread characterization; microarchitecture; multi-threaded application; thread balancing; thread delaying", } @Article{Raman:2010:SPUa, author = "Arun Raman and Hanjun Kim and Thomas R. Mason and Thomas B. Jablin and David I. August", title = "Speculative parallelization using software multi-threaded transactions", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "65--76", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Raman:2010:SPUb, author = "Arun Raman and Hanjun Kim and Thomas R. Mason and Thomas B. Jablin and David I. 
August", title = "Speculative parallelization using software multi-threaded transactions", journal = j-SIGPLAN, volume = "45", number = "3", pages = "65--76", month = mar, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1736020.1736030", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the right techniques, multicore architectures may be able to continue the exponential performance trend that elevated the performance of applications of all types for decades. While many scientific programs can be parallelized without speculative techniques, speculative parallelism appears to be the key to continuing this trend for general-purpose applications. Recently-proposed code parallelization techniques, such as those by Bridges et al. and by Thies et al., demonstrate scalable performance on multiple cores by using speculation to divide code into atomic units (transactions) that span multiple threads in order to expose data parallelism. Unfortunately, most software and hardware Thread-Level Speculation (TLS) memory systems and transactional memories are not sufficient because they only support single-threaded atomic units. Multi-threaded Transactions (MTXs) address this problem, but they require expensive hardware support as currently proposed in the literature. This paper proposes a Software MTX (SMTX) system that captures the {\em applicability\/} and {\em performance\/} of hardware MTX, but on {\em existing multicore machines}. 
The SMTX system yields a harmonic mean speedup of 13.36x on native hardware with four 6-core processors (24 cores in total) running speculatively parallelized applications.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "automatic parallelization; loop-level parallelism; multi-threaded transactions; pipelined parallelism; software transactional memory; thread-level speculation", } @Article{Rashid:2010:AEP, author = "Layali Rashid and Wessam M. Hassanein and Moustafa A. Hammad", title = "Analyzing and enhancing the parallel sort operation on multithreaded architectures", journal = j-J-SUPERCOMPUTING, volume = "53", number = "2", pages = "293--312", month = aug, year = "2010", CODEN = "JOSUED", ISSN = "0920-8542 (print), 1573-0484 (electronic)", ISSN-L = "0920-8542", bibdate = "Wed Aug 25 08:39:00 MDT 2010", bibsource = "http://springerlink.metapress.com/openurl.asp?genre=issue&issn=0920-8542&volume=53&issue=2; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", URL = "http://www.springerlink.com/openurl.asp?genre=article&issn=0920-8542&volume=53&issue=2&spage=293", acknowledgement = ack-nhfb, fjournal = "The Journal of Supercomputing", } @Article{Sanchez:2010:ACI, author = "Daniel Sanchez and George Michelogiannakis and Christos Kozyrakis", title = "An analysis of on-chip interconnection networks for large-scale chip multiprocessors", journal = j-TACO, volume = "7", number = "1", pages = "4:1--4:??", month = apr, year = "2010", CODEN = "????", DOI = "http://dx.doi.org/10.1145/1756065.1736069", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Wed May 5 15:38:13 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the number of cores of chip multiprocessors (CMPs) rapidly growing as technology scales down, connecting the different components of a CMP in a scalable and efficient way becomes increasingly challenging. 
In this article, we explore the architectural-level implications of interconnection network design for CMPs with up to 128 fine-grain multithreaded cores. We evaluate and compare different network topologies using accurate simulation of the full chip, including the memory hierarchy and interconnect, and using a diverse set of scientific and engineering workloads.\par We find that the interconnect has a large impact on performance, as it is responsible for 60\% to 75\% of the miss latency. Latency, and not bandwidth, is the primary performance constraint, since, even with many threads per core and workloads with high miss rates, networks with enough bandwidth can be efficiently implemented for the system scales we consider. From the topologies we study, the flattened butterfly consistently outperforms the mesh and fat tree on all workloads, leading to performance advantages of up to 22\%. We also show that considering interconnect and memory hierarchy together when designing large-scale CMPs is crucial, and neglecting either of the two can lead to incorrect conclusions. 
Finally, the effect of the interconnect on overall performance becomes more important as the number of cores increases, making interconnection choices especially critical when scaling up.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", keywords = "chip multiprocessors; hierarchical networks; Networks-on-chip", } @Article{Soundararajan:2010:CSE, author = "Niranjan Soundararajan and Anand Sivasubramaniam and Vijay Narayanan", title = "Characterizing the soft error vulnerability of multicores running multithreaded applications", journal = j-SIGMETRICS, volume = "38", number = "1", pages = "379--380", month = jun, year = "2010", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1811099.1811096", ISSN = "0163-5999 (print), 1557-9484 (electronic)", ISSN-L = "0163-5999", bibdate = "Wed Aug 25 07:35:52 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Multicores have become the platform of choice across all market segments. Cost-effective protection against soft errors is important in these environments, due to the need to move to lower technology generations and the exploding number of transistors on a chip. While multicores offer the flexibility of varying the number of application threads and the number of cores on which they run, the reliability impact of choosing one configuration over another is unclear. Our study reveals that the reliability costs vary dramatically between configurations and being unaware could lead to a sub-optimal choice.", acknowledgement = ack-nhfb, fjournal = "ACM SIGMETRICS Performance Evaluation Review", keywords = "fit rate; multicore; soft errors", } @Article{Sutherland:2010:CTC, author = "Dean F. Sutherland and William L. 
Scherlis", title = "Composable thread coloring", journal = j-SIGPLAN, volume = "45", number = "5", pages = "233--244", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1693453.1693485", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "This paper introduces the language-independent concept of ``thread usage policy.'' Many multi-threaded software systems contain policies that regulate associations among threads, executable code, and potentially shared state. A system, for example, may constrain which threads are permitted to execute particular code segments, usually as a means to constrain those threads from accessing or writing particular elements of state. These policies ensure properties such as state confinement or reader/writer constraints, often without recourse to locking or transaction discipline.\par Our approach allows developers to concisely document their thread usage policies in a manner that enables the use of sound scalable analysis to assess consistency of policy and as-written code. This paper identifies the key semantic concepts of our thread coloring language and illustrates how to use its succinct source-level annotations to express models of thread usage policies, following established annotation conventions for Java.\par We have built a prototype static analysis tool, implemented as an integrated development environment plug-in (for the Eclipse IDE), that notifies developers of discrepancies between policy annotations and as-written code. Our analysis technique uses several underlying algorithms based on abstract interpretation, call-graphs, and type inference. The resulting overall analysis is both sound and composable. 
We have used this prototype analysis tool in case studies to model and analyze more than a million lines of code.\par Our validation process included field trials on a wide variety of complex large-scale production code selected by the host organizations. Our in-field experience led us to focus on potential adoptability by real-world developers. We have developed techniques that can reduce annotation density to less than one line per thousand lines of code (KLOC). In addition, the prototype analysis tool supports an incremental and iterative approach to modeling and analysis. This approach enabled field trial partners to directly target areas of greatest concern and to achieve useful results within a few hours.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "annotation; Java; multicore; race conditions; state confinement; state consistency; thread policy", } @Article{Tallent:2010:ALC, author = "Nathan R. Tallent and John M. Mellor-Crummey and Allan Porterfield", title = "Analyzing lock contention in multithreaded applications", journal = j-SIGPLAN, volume = "45", number = "5", pages = "269--280", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1693453.1693489", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Many programs exploit shared-memory parallelism using multithreading. Threaded codes typically use locks to coordinate access to shared data. In many cases, contention for locks reduces parallel efficiency and hurts scalability. Being able to quantify and attribute lock contention is important for understanding where a multithreaded program needs improvement.\par This paper proposes and evaluates three strategies for gaining insight into performance losses due to lock contention.
First, we consider using a straightforward strategy based on call stack profiling to attribute idle time and show that it fails to yield insight into lock contention. Second, we consider an approach that builds on a strategy previously used for analyzing idleness in work-stealing computations; we show that this strategy does not yield insight into lock contention. Finally, we propose a new technique for measurement and analysis of lock contention that uses data associated with locks to blame lock holders for the idleness of spinning threads. Our approach incurs $\leq$ 5\% overhead on a quantum chemistry application that makes extensive use of locking (65M distinct locks, a maximum of 340K live locks, and an average of 30K lock acquisitions per second per thread) and attributes lock contention to its full static and dynamic calling contexts. Our strategy, implemented in HPCToolkit, is fully distributed and should scale well to systems with large core counts.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "HPCToolkit; lock contention; multithreading; performance analysis", } @Article{Tentyukov:2010:MVF, author = "M. Tentyukov and J. A. M. 
Vermaseren", title = "The multithreaded version of {FORM}", journal = j-COMP-PHYS-COMM, volume = "181", number = "8", pages = "1419--1427", month = aug, year = "2010", CODEN = "CPHCBZ", ISSN = "0010-4655 (print), 1879-2944 (electronic)", ISSN-L = "0010-4655", bibdate = "Sat Feb 11 09:54:30 MST 2012", bibsource = "http://www.math.utah.edu/pub/tex/bib/compphyscomm2010.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.sciencedirect.com/science/journal/00104655", URL = "http://www.sciencedirect.com/science/article/pii/S0010465510001207", acknowledgement = ack-nhfb, fjournal = "Computer Physics Communications", } @Article{Tian:2010:SPU, author = "Chen Tian and Min Feng and Rajiv Gupta", title = "Speculative parallelization using state separation and multiple value prediction", journal = j-SIGPLAN, volume = "45", number = "8", pages = "63--72", month = aug, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1806651.1806663", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:55:48 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the availability of chip multiprocessor (CMP) and simultaneous multithreading (SMT) machines, extracting thread level parallelism from a sequential program has become crucial for improving performance. However, many sequential programs cannot be easily parallelized due to the presence of dependences. To solve this problem, different solutions have been proposed. Some of them make the optimistic assumption that such dependences rarely manifest themselves at runtime. However, when this assumption is violated, the recovery causes very large overhead. Other approaches incur large synchronization or computation overhead when resolving the dependences. 
Consequently, for a loop with frequently arising cross-iteration dependences, previous techniques are not able to speed up the execution. In this paper we propose a compiler technique which uses state separation and multiple value prediction to speculatively parallelize loops in sequential programs that contain frequently arising cross-iteration dependences. The key idea is to generate multiple versions of a loop iteration based on multiple predictions of values of variables involved in cross-iteration dependences (i.e., live-in variables). These speculative versions and the preceding loop iteration are executed in separate memory states simultaneously. After the execution, if one of these versions is correct (i.e., its predicted values are found to be correct), then we merge its state and the state of the preceding iteration because the dependence between the two iterations is correctly resolved. The memory states of other incorrect versions are completely discarded. Based on this idea, we further propose a runtime adaptive scheme that not only gives a good performance but also achieves better CPU utilization. We conducted experiments on 10 benchmark programs on a real machine. 
The results show that our technique can achieve 1.7x speedup on average across all used benchmarks.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "multicore processors; speculative parallelization", } @Article{Torlak:2010:MCA, author = "Emina Torlak and Mandana Vaziri and Julian Dolby", title = "{MemSAT}: checking axiomatic specifications of memory models", journal = j-SIGPLAN, volume = "45", number = "6", pages = "341--350", month = jun, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1806596.1806635", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Oct 8 17:53:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Memory models are hard to reason about due to their complexity, which stems from the need to strike a balance between ease-of-programming and allowing compiler and hardware optimizations. In this paper, we present an automated tool, MemSAT, that helps in debugging and reasoning about memory models. Given an axiomatic specification of a memory model and a multi-threaded test program containing assertions, MemSAT outputs a trace of the program in which both the assertions and the memory model axioms are satisfied, if one can be found. The tool is fully automatic and is based on a SAT solver. If it cannot find a trace, it outputs a minimal subset of the memory model and program constraints that are unsatisfiable. We used MemSAT to check several existing memory models against their published test cases, including the current Java Memory Model by Manson et al. and a revised version of it by Sevcik and Aspinall. 
We found subtle discrepancies between what was expected and the actual results of test programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "axiomatic specifications; bounded model checking; memory models; sat", } @Article{Vlachos:2010:PEAa, author = "Evangelos Vlachos and Michelle L. Goodstein and Michael A. Kozuch and Shimin Chen and Babak Falsafi and Phillip B. Gibbons and Todd C. Mowry", title = "{ParaLog}: enabling and accelerating online parallel monitoring of multithreaded applications", journal = j-COMP-ARCH-NEWS, volume = "38", number = "1", pages = "271--284", month = mar, year = "2010", CODEN = "CANED2", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Wed Mar 17 14:42:04 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Vlachos:2010:PEAb, author = "Evangelos Vlachos and Michelle L. Goodstein and Michael A. Kozuch and Shimin Chen and Babak Falsafi and Phillip B. Gibbons and Todd C. Mowry", title = "{ParaLog}: enabling and accelerating online parallel monitoring of multithreaded applications", journal = j-SIGPLAN, volume = "45", number = "3", pages = "271--284", month = mar, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1736020.1736051", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Mar 17 13:46:56 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "{\em Instruction-grain lifeguards\/} monitor the events of a running application at the level of individual instructions in order to identify and help mitigate application bugs and security exploits. 
Because such lifeguards impose a 10-100X slowdown on existing platforms, previous studies have proposed hardware designs to accelerate lifeguard processing. However, these accelerators are either tailored to a specific class of lifeguards or suitable only for monitoring single-threaded programs.\par We present ParaLog, the first design of a system enabling fast online parallel monitoring of multithreaded parallel applications. ParaLog supports a broad class of software-defined lifeguards. We show how three existing accelerators can be enhanced to support online multithreaded monitoring, dramatically reducing lifeguard overheads. We identify and solve several challenges in monitoring parallel applications and/or parallelizing these accelerators, including (i) enforcing inter-thread data dependences, (ii) dealing with inter-thread effects that are not reflected in coherence traffic, (iii) dealing with unmonitored operating system activity, and (iv) ensuring lifeguards can access shared metadata with negligible synchronization overheads. We present our system design for both Sequentially Consistent and Total Store Ordering processors. We implement and evaluate our design on a 16 core simulated CMP, using benchmarks from SPLASH-2 and PARSEC and two lifeguards: a data-flow tracking lifeguard and a memory-access checker lifeguard. Our results show that (i) our parallel accelerators improve performance by 2-9X and 1.13-3.4X for our two lifeguards, respectively, (ii) we are 5-126X faster than the time-slicing approach required by existing techniques, and (iii) our average overheads for applications with eight threads are 51\% and 28\% for the two lifeguards, respectively.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "hardware support for debugging; instruction-grain lifeguards; online parallel monitoring", } @Article{Welch:2010:SCF, author = "Peter H. Welch and Jan B. 
Pedersen", title = "{Santa Claus}: {Formal} analysis of a process-oriented solution", journal = j-TOPLAS, volume = "32", number = "4", pages = "14:1--14:37", month = apr, year = "2010", CODEN = "ATPSDT", DOI = "http://doi.acm.org/10.1145/1734206.1734211", ISSN = "0164-0925 (print), 1558-4593 (electronic)", ISSN-L = "0164-0925", bibdate = "Fri May 21 12:47:03 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/toplas/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "With the commercial development of multicore processors, the challenges of writing multithreaded programs to take advantage of these new hardware architectures are becoming more and more pertinent. Concurrent programming is necessary to achieve the performance that the hardware offers. Traditional approaches present concurrency as an {\em advanced\/} topic: they have proven difficult to use, reason about with confidence, and scale up to high levels of concurrency. This article reviews {\em process-oriented design}, based on Hoare's algebra of Communicating Sequential Processes (CSP), and proposes that this approach to concurrency leads to solutions that are manageable by novice programmers; that is, they are easy to design and maintain, that they are scalable for complexity, {\em obviously correct}, and relatively easy to verify using formal reasoning and/or model checkers. These solutions can be developed in conventional programming languages (through CSP libraries) or specialized ones (such as occam-$\pi$) in a manner that directly reflects their formal expression. Systems can be developed without needing specialist knowledge of the CSP formalism, since the supporting mathematics is burnt into the tools and languages supporting it. We illustrate these concepts with the {\em Santa Claus problem}, which has been used as a challenge for concurrency mechanisms since 1994.
We consider this problem as an example control system, producing external signals reporting changes of internal state (that model the external world). We claim our occam-$\pi$ solution is {\em correct-by-design}, but follow this up with formal verification (using the FDR model checker for CSP) that the system is free from deadlock and livelock, that the produced control signals obey crucial ordering constraints, and that the system has key liveness properties.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "ACM Transactions on Programming Languages and Systems", keywords = "concurrency; CSP; deadlock; event ordering; liveness; novice programmer; occam-pi; Process orientation; verification", } @Article{Wendykier:2010:PCH, author = "Piotr Wendykier and James G. Nagy", title = "{Parallel Colt}: {A} High-Performance {Java} Library for Scientific Computing and Image Processing", journal = j-TOMS, volume = "37", number = "3", pages = "31:1--31:22", month = sep, year = "2010", CODEN = "ACMSCU", DOI = "http://doi.acm.org/10.1145/1824801.1824809", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Mon Sep 27 10:15:50 MDT 2010", bibsource = "http://www.acm.org/pubs/contents/journals/toms/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Major breakthroughs in chip and software design have been observed for the last nine years. In October 2001, IBM released the world's first multicore processor: POWER4. Six years later, in February 2007, NVIDIA made a public release of CUDA SDK, a set of development tools to write algorithms for execution on Graphic Processing Units (GPUs). Although software vendors have started working on parallelizing their products, the vast majority of existing code is still sequential and does not effectively utilize modern multicore CPUs and manycore GPUs.\par This article describes Parallel Colt, a multithreaded Java library for scientific computing and image processing.
In addition to describing the design and functionality of Parallel Colt, a comparison to MATLAB is presented. Two ImageJ plugins for iterative image deblurring and motion correction of PET brain images are described as typical applications of this library. Performance comparisons with MATLAB, including GPU computations via AccelerEyes' Jacket toolbox are also given.", acknowledgement = ack-nhfb, articleno = "31", fjournal = "ACM Transactions on Mathematical Software", keywords = "Deconvolution; FFT; inverse problems; iterative methods; motion correction; multithreading; PET; regularization", } @Article{Wheeler:2010:VMM, author = "Kyle B. Wheeler and Douglas Thain", title = "Visualizing massively multithreaded applications with {ThreadScope}", journal = j-CCPE, volume = "22", number = "1", pages = "45--67", month = jan, year = "2010", CODEN = "CCPEBO", DOI = "http://dx.doi.org/10.1002/cpe.1469", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:40 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/ccpe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Prac\-tice and Experience", onlinedate = "13 Aug 2009", } @Article{Zhang:2010:DCS, author = "Eddy Z. Zhang and Yunlian Jiang and Xipeng Shen", title = "Does cache sharing on modern {CMP} matter to the performance of contemporary multithreaded programs?", journal = j-SIGPLAN, volume = "45", number = "5", pages = "203--212", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1693453.1693482", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Most modern Chip Multiprocessors (CMP) feature shared cache on chip. 
For multithreaded applications, the sharing reduces communication latency among co-running threads, but also results in cache contention.\par A number of studies have examined the influence of cache sharing on multithreaded applications, but most of them have concentrated on the design or management of shared cache, rather than a systematic measurement of the influence. Consequently, prior measurements have been constrained by the reliance on simulators, the use of out-of-date benchmarks, and the limited coverage of deciding factors. The influence of CMP cache sharing on contemporary multithreaded applications remains preliminarily understood.\par In this work, we conduct a systematic measurement of the influence on two kinds of commodity CMP machines, using a recently released CMP benchmark suite, PARSEC, with a number of potentially important factors on program, OS, and architecture levels considered. The measurement shows some surprising results. Contrary to commonly perceived importance of cache sharing, neither positive nor negative effects from the cache sharing are significant for most of the program executions, regardless of the types of parallelism, input datasets, architectures, numbers of threads, and assignments of threads to cores. After a detailed analysis, we find that the main reason is the mismatch of current development and compilation of multithreaded applications and CMP architectures. By transforming the programs in a cache-sharing-aware manner, we observe up to 36\% performance increase when the threads are placed on cores appropriately.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "chip multiprocessors; parallel program optimizations; shared cache; thread scheduling", } @Article{Zhang:2010:FTS, author = "Yao Zhang and Jonathan Cohen and John D. 
Owens", title = "Fast tridiagonal solvers on the {GPU}", journal = j-SIGPLAN, volume = "45", number = "5", pages = "127--136", month = may, year = "2010", CODEN = "SINODQ", DOI = "http://doi.acm.org/10.1145/1693453.1693472", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue Aug 31 22:39:18 MDT 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We study the performance of three parallel algorithms and their hybrid variants for solving tridiagonal linear systems on a GPU: cyclic reduction (CR), parallel cyclic reduction (PCR) and recursive doubling (RD). We develop an approach to measure, analyze, and optimize the performance of GPU programs in terms of memory access, computation, and control overhead. We find that CR enjoys linear algorithm complexity but suffers from more algorithmic steps and bank conflicts, while PCR and RD have fewer algorithmic steps but do more work each step. To combine the benefits of the basic algorithms, we propose hybrid CR+PCR and CR+RD algorithms, which improve the performance of PCR, RD and CR by 21\%, 31\% and 61\% respectively. Our GPU solvers achieve up to a 28x speedup over a sequential LAPACK solver, and a 12x speedup over a multi-threaded CPU solver.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "GPGPU; performance optimization; tridiagonal linear system", } @Article{Bajaj:2011:FFP, author = "Chandrajit L. 
Bajaj and Rezaul Chowdhury and Vinay Siddahanavalli", title = "{$F^2$Dock}: Fast {Fourier} Protein-Protein Docking", journal = j-TCBB, volume = "8", number = "1", pages = "45--58", month = jan, year = "2011", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2009.57", ISSN = "1545-5963 (print), 1557-9964 (electronic)", ISSN-L = "1545-5963", bibdate = "Mon Dec 20 18:39:04 MST 2010", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The functions of proteins are often realized through their mutual interactions. Determining a relative transformation for a pair of proteins and their conformations which form a stable complex, reproducible in nature, is known as docking. It is an important step in drug design, structure determination, and understanding function and structure relationships. In this paper, we extend our nonuniform fast Fourier transform-based docking algorithm to include an adaptive search phase (both translational and rotational) and thereby speed up its execution. We have also implemented a multithreaded version of the adaptive docking algorithm for even faster execution on multicore machines. We call this protein-protein docking code $F^2$Dock ($F^2$ = \underline{F}ast \underline{F}ourier).", acknowledgement = ack-nhfb, fjournal = "IEEE/ACM Transactions on Computational Biology and Bioinformatics", } @Article{Bientinesi:2011:CFS, author = "Paolo Bientinesi and Francisco D. Igual and Daniel Kressner and Matthias Petschow and Enrique S.
Quintana-Ort{\'\i}", title = "Condensed forms for the symmetric eigenvalue problem on multi-threaded architectures", journal = j-CCPE, volume = "23", number = "7", pages = "694--707", month = may, year = "2011", CODEN = "CCPEBO", DOI = "http://dx.doi.org/10.1002/cpe.1680", ISSN = "1532-0626 (print), 1532-0634 (electronic)", ISSN-L = "1532-0626", bibdate = "Mon Dec 5 10:08:55 MST 2011", bibsource = "http://www.interscience.wiley.com/jpages/1532-0626; http://www.math.utah.edu/pub/tex/bib/ccpe.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "Concurrency and Computation: Practice and Experience", onlinedate = "8 Nov 2010", } @Article{Burnim:2011:SCSa, author = "Jacob Burnim and George Necula and Koushik Sen", title = "Specifying and checking semantic atomicity for multithreaded programs", journal = j-COMP-ARCH-NEWS, volume = "39", number = "1", pages = "79--90", month = mar, year = "2011", CODEN = "CANED2", DOI = "http://dx.doi.org/10.1145/1961295.1950377", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Thu Aug 18 13:45:25 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Burnim:2011:SCSb, author = "Jacob Burnim and George Necula and Koushik Sen", title = "Specifying and checking semantic atomicity for multithreaded programs", journal = j-SIGPLAN, volume = "46", number = "3", pages = "79--90", month = mar, year = "2011", CODEN = "SINODQ", DOI = "http://dx.doi.org/10.1145/1961296.1950377", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Tue May 24 10:55:08 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", remark = "ASPLOS '11 conference proceedings", } 
@Article{Butler:2011:BAM, author = "Michael Butler and Leslie Barnes and Debjit Das Sarma and Bob Gelinas", title = "{Bulldozer}: An Approach to Multithreaded Compute Performance", journal = j-IEEE-MICRO, volume = "31", number = "2", pages = "6--15", month = mar # "\slash " # apr, year = "2011", CODEN = "IEMIDZ", DOI = "http://doi.ieeecomputersociety.org/10.1109/MM.2011.23", ISSN = "0272-1732 (print), 1937-4143 (electronic)", ISSN-L = "0272-1732", bibdate = "Tue Apr 26 13:50:28 2011", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "AMD's Bulldozer module represents a new direction in microarchitecture and includes a number of firsts for AMD, including AMD's multithreaded x86 processor, implementation of a shared Level 2 cache, and x86 processor to incorporate floating-point multiply-accumulate (FMAC). This article discusses the module's multithreading architecture, power-efficient microarchitecture, and subblocks, including the various microarchitectural latencies, bandwidths, and structure sizes.", acknowledgement = ack-nhfb, fjournal = "IEEE Micro", keywords = "Hot Chips 22 conference proceedings", } @Article{Chinya:2011:BDP, author = "Gautham N. Chinya and Jamison D. Collins and Perry H. Wang and Hong Jiang and Guei-Yuan Lueh and Thomas A. Piazza and Hong Wang", title = "{Bothnia}: a dual-personality extension to the {Intel} integrated graphics driver", journal = j-OPER-SYS-REV, volume = "45", number = "1", pages = "11--20", month = jan, year = "2011", CODEN = "OSRED8", DOI = "http://dx.doi.org/10.1145/1945023.1945027", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Fri Feb 25 16:43:23 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper, we introduce Bothnia, an extension to the Intel production graphics driver to support a shared virtual memory heterogeneous multithreading programming model. 
With Bothnia, the Intel graphics device driver can support both the traditional 3D graphics rendering software stack and a new class of heterogeneous multithreaded applications, which can use both IA (Intel Architecture) CPU cores and Intel integrated Graphics and Media Accelerator (GMA) cores in the same virtual address space. We describe the necessary architectural supports in both IA CPU and the GMA cores and present a reference Bothnia implementation.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Davis:2011:ASM, author = "Timothy A. Davis", title = "{Algorithm 915}, {SuiteSparseQR}: {Multifrontal} multithreaded rank-revealing sparse {QR} factorization", journal = j-TOMS, volume = "38", number = "1", pages = "8:1--8:??", month = nov, year = "2011", CODEN = "ACMSCU", DOI = "http://dx.doi.org/10.1145/2049662.2049670", ISSN = "0098-3500 (print), 1557-7295 (electronic)", ISSN-L = "0098-3500", bibdate = "Thu Dec 15 08:59:34 MST 2011", bibsource = "http://www.acm.org/pubs/contents/journals/toms/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/toms.bib", acknowledgement = ack-nhfb, articleno = "8", fjournal = "ACM Transactions on Mathematical Software (TOMS)", } @Article{Esparza:2011:CPB, author = "Javier Esparza and Pierre Ganty", title = "Complexity of pattern-based verification for multithreaded programs", journal = j-SIGPLAN, volume = "46", number = "1", pages = "499--510", month = jan, year = "2011", CODEN = "SINODQ", DOI = "http://dx.doi.org/10.1145/1925844.1926443", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @InProceedings{Ganesan:2011:MMP, author = "Karthik Ganesan and Lizy K. 
John", title = "{MAximum Multicore POwer (MAMPO)}: an automatic multithreaded synthetic power virus generation framework for multicore systems", crossref = "Lathrop:2011:SPI", pages = "53:1--53:12", year = "2011", DOI = "http://dx.doi.org/10.1145/2063384.2063455", bibdate = "Fri Dec 16 11:05:47 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib", acknowledgement = ack-nhfb, articleno = "53", } @Article{Gupta:2011:PAR, author = "Ashutosh Gupta and Corneliu Popeea and Andrey Rybalchenko", title = "Predicate abstraction and refinement for verifying multi-threaded programs", journal = j-SIGPLAN, volume = "46", number = "1", pages = "331--344", month = jan, year = "2011", CODEN = "SINODQ", DOI = "http://dx.doi.org/10.1145/1925844.1926424", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Hsu:2011:MSS, author = "Chia-Jui Hsu and Jos{\'e} Luis Pino and Shuvra S. Bhattacharyya", title = "Multithreaded Simulation for Synchronous Dataflow Graphs", journal = j-TODAES, volume = "16", number = "3", pages = "25:1--25:??", month = jun, year = "2011", CODEN = "ATASFO", DOI = "http://dx.doi.org/10.1145/1970353.1970358", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Tue Jun 14 11:55:50 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "For system simulation, Synchronous DataFlow (SDF) has been widely used as a core model of computation in design tools for digital communication and signal processing systems. 
The traditional approach for simulating SDF graphs is to compute and execute static schedules in single-processor desktop environments. Nowadays, however, multicore processors are increasingly popular desktop platforms for their potential performance improvements through thread-level parallelism. Without novel scheduling and simulation techniques that explicitly explore thread-level parallelism for executing SDF graphs, current design tools gain only minimal performance improvements on multicore platforms. In this article, we present a new multithreaded simulation scheduler, called MSS, to provide simulation runtime speedup for executing SDF graphs on multicore processors.", acknowledgement = ack-nhfb, articleno = "25", fjournal = "ACM Transactions on Design Automation of Electronic Systems", } @Article{Jeffrey:2011:IBM, author = "Dennis Jeffrey and Yan Wang and Chen Tian and Rajiv Gupta", title = "Isolating bugs in multithreaded programs using execution suppression", journal = j-SPE, volume = "41", number = "11", pages = "1259--1288", month = oct, year = "2011", CODEN = "SPEXBL", DOI = "http://dx.doi.org/10.1002/spe.1040", ISSN = "0038-0644 (print), 1097-024X (electronic)", ISSN-L = "0038-0644", bibdate = "Thu Sep 29 14:49:13 MDT 2011", bibsource = "http://onlinelibrary.wiley.com/journal/10.1002/(ISSN)1097-024X; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/spe.bib", acknowledgement = ack-nhfb, fjournal = "Software --- Practice and Experience", onlinedate = "18 Jan 2011", } @Article{Joisha:2011:TEA, author = "Pramod G. Joisha and Robert S. Schreiber and Prithviraj Banerjee and Hans J. Boehm and Dhruva R. 
Chakrabarti", title = "A technique for the effective and automatic reuse of classical compiler optimizations on multithreaded code", journal = j-SIGPLAN, volume = "46", number = "1", pages = "623--636", month = jan, year = "2011", CODEN = "SINODQ", DOI = "http://dx.doi.org/10.1145/1925844.1926457", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Wed Jan 26 15:06:39 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Liao:2011:AUB, author = "Xiongfei Liao and Thambipillai Srikanthan", title = "Accelerating {UNISIM}-Based Cycle-Level Microarchitectural Simulations on Multicore Platforms", journal = j-TODAES, volume = "16", number = "3", pages = "26:1--26:??", month = jun, year = "2011", CODEN = "ATASFO", DOI = "http://dx.doi.org/10.1145/1970353.1970359", ISSN = "1084-4309 (print), 1557-7309 (electronic)", ISSN-L = "1084-4309", bibdate = "Tue Jun 14 11:55:50 MDT 2011", bibsource = "http://www.acm.org/pubs/contents/journals/todaes/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "UNISIM has been shown to ease the development of simulators for multi-/many-core systems. However, UNISIM cycle-level simulations of large-scale multiprocessor systems could be very time consuming. In this article, we propose a systematic framework for accelerating UNISIM cycle-level simulations on multicore platforms. The proposed framework relies on exploiting the fine-grained parallelism within the simulated cycles using POSIX threads. A multithreaded simulation engine has been devised from the single-threaded UNISIM SystemC engine to facilitate the exploitation of inherent parallelism. An adaptive technique that manages the overall computation workload by adjusting the number of threads employed at any given time is proposed. 
In addition, we have introduced a technique to balance the workloads of multithreaded executions.", acknowledgement = ack-nhfb, articleno = "26", fjournal = "ACM Transactions on Design Automation of Electronic Systems", } @Article{Ma:2011:SPC, author = "Kai Ma and Xue Li and Ming Chen and Xiaorui Wang", title = "Scalable power control for many-core architectures running multi-threaded applications", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "449--460", month = jun, year = "2011", CODEN = "CANED2", DOI = "http://dx.doi.org/10.1145/2024723.2000117", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Mahafzah:2011:PMI, author = "Basel A. Mahafzah", title = "Parallel multithreaded {IDA*} heuristic search: algorithm design and performance evaluation", journal = j-INT-J-PAR-EMER-DIST-SYS, volume = "26", number = "1", pages = "61--82", year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1080/17445761003604521", ISSN = "1744-5760 (print), 1744-5779 (electronic)", ISSN-L = "1744-5760", bibdate = "Mon Sep 5 20:33:09 MDT 2011", bibsource = "http://www.informaworld.com/smpp/title~content=t713729127~link=cover; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, onlinedate = "6 Dec 2010", } @Article{Marino:2011:CSP, author = "Daniel Marino and Abhayendra Singh and Todd Millstein and Madanlal Musuvathi and Satish Narayanasamy", title = "A case for an {SC}-preserving compiler", journal = j-SIGPLAN, volume = "46", number = "6", pages = "199--210", month = jun, year = "2011", CODEN = "SINODQ", DOI = "http://dx.doi.org/10.1145/1993316.1993522", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = 
"0362-1340", bibdate = "Thu Jun 9 10:23:33 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The most intuitive memory consistency model for shared-memory multi-threaded programming is sequential consistency (SC). However, current concurrent programming languages support a relaxed model, as such relaxations are deemed necessary for enabling important optimizations. This paper demonstrates that an SC-preserving compiler, one that ensures that every SC behavior of a compiler-generated binary is an SC behavior of the source program, retains most of the performance benefits of an optimizing compiler. The key observation is that a large class of optimizations crucial for performance are either already SC-preserving or can be modified to preserve SC while retaining much of their effectiveness. An SC-preserving compiler, obtained by restricting the optimization phases in LLVM, a state-of-the-art C/C++ compiler, incurs an average slowdown of 3.8% and a maximum slowdown of 34% on a set of 30 programs from the SPLASH-2, PARSEC, and SPEC CINT2006 benchmark suites.\par While the performance overhead of preserving SC in the compiler is much less than previously assumed, it might still be unacceptable for certain applications. We believe there are several avenues for improving performance without giving up SC-preservation. In this vein, we observe that the overhead of our SC-preserving compiler arises mainly from its inability to aggressively perform a class of optimizations we identify as eager-load optimizations. This class includes common-subexpression elimination, constant propagation, global value numbering, and common cases of loop-invariant code motion. We propose a notion of interference checks in order to enable eager-load optimizations while preserving SC. 
Interference checks expose to the compiler a commonly used hardware speculation mechanism that can efficiently detect whether a particular variable has changed its value since last read.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", keywords = "LLVM compiler suite; sequential consistency (SC)", } @Article{Reddy:2011:BFH, author = "Dheeraj Reddy and David Koufaty and Paul Brett and Scott Hahn", title = "Bridging functional heterogeneity in multicore architectures", journal = j-OPER-SYS-REV, volume = "45", number = "1", pages = "21--33", month = jan, year = "2011", CODEN = "OSRED8", DOI = "http://dx.doi.org/10.1145/1945023.1945028", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Fri Feb 25 16:43:23 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "Heterogeneous processors that mix big high performance cores with small low power cores promise excellent single-threaded performance coupled with high multi-threaded throughput and higher performance-per-watt. A significant portion of the commercial multicore heterogeneous processors are likely to have a common instruction set architecture (ISA). However, due to limited design resources and goals, each core is likely to contain ISA extensions not yet implemented in the other core. Therefore, such heterogeneous processors will have inherent functional asymmetry at the ISA level and face significant software challenges.
This paper analyzes the software challenges to the operating system and the application layer software on a heterogeneous system with functional asymmetry, where the ISA of the small and big cores overlaps.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Vandierendonck:2011:MSR, author = "Hans Vandierendonck and Andr{\'e} Seznec", title = "Managing {SMT} resource usage through speculative instruction window weighting", journal = j-TACO, volume = "8", number = "3", pages = "12:1--12:??", month = oct, year = "2011", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2019608.2019611", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Oct 22 09:15:12 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Simultaneous multithreading processors dynamically share processor resources between multiple threads. In general, shared SMT resources may be managed explicitly, for instance, by dynamically setting queue occupation bounds for each thread as in the DCRA and Hill-Climbing policies. Alternatively, resources may be managed implicitly; that is, resource usage is controlled by placing the desired instruction mix in the resources. In this case, the main resource management tool is the instruction fetch policy which must predict the behavior of each thread (branch mispredictions, long-latency loads, etc.) as it fetches instructions. In this article, we present the use of Speculative Instruction Window Weighting (SIWW) to bridge the gap between implicit and explicit SMT fetch policies.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", } @Article{Yu:2011:SDH, author = "Wing-kei S. Yu and Ruirui Huang and Sarah Q. Xu and Sung-En Wang and Edwin Kan and G. 
Edward Suh", title = "{SRAM--DRAM} hybrid memory with applications to efficient register files in fine-grained multi-threading", journal = j-COMP-ARCH-NEWS, volume = "39", number = "3", pages = "247--258", month = jun, year = "2011", CODEN = "CANED2", DOI = "http://dx.doi.org/10.1145/2024723.2000094", ISSN = "0163-5964 (print), 1943-5851 (electronic)", ISSN-L = "0163-5964", bibdate = "Mon Sep 5 17:15:11 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/sigarch.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGARCH Computer Architecture News", } @Article{Zhao:2011:DCC, author = "Qin Zhao and David Koh and Syed Raza and Derek Bruening and Weng-Fai Wong and Saman Amarasinghe", title = "Dynamic cache contention detection in multi-threaded applications", journal = j-SIGPLAN, volume = "46", number = "7", pages = "27--38", month = jul, year = "2011", CODEN = "SINODQ", DOI = "http://dx.doi.org/10.1145/2007477.1952688", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Sep 16 10:02:34 MDT 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/sigplan2010.bib", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", } @Article{Zhu:2011:TPS, author = "David (Yu) Zhu and Jaeyeon Jung and Dawn Song and Tadayoshi Kohno and David Wetherall", title = "{TaintEraser}: protecting sensitive data leaks using application-level taint tracking", journal = j-OPER-SYS-REV, volume = "45", number = "1", pages = "142--154", month = jan, year = "2011", CODEN = "OSRED8", DOI = "http://dx.doi.org/10.1145/1945023.1945039", ISSN = "0163-5980", ISSN-L = "0163-5980", bibdate = "Fri Feb 25 16:43:23 MST 2011", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We present TaintEraser, a new tool that tracks the 
movement of sensitive user data as it flows through off-the-shelf applications. TaintEraser uses application-level dynamic taint analysis to let users run applications in their own environment while preventing unwanted information exposure. It is made possible by techniques we developed for accurate and efficient tainting: (1) Semantic-aware instruction-level tainting is critical to track taint accurately, without explosion or loss. (2) Function summaries provide an interface to handle taint propagation within the kernel and reduce the overhead of instruction-level tracking. (3) On-demand instrumentation enables fast loading of large applications. Together, these techniques let us analyze large, multi-threaded, networked applications in near real-time.", acknowledgement = ack-nhfb, fjournal = "ACM SIGOPS Operating Systems Review", } @Article{Ahn:2012:ISE, author = "Jung Ho Ahn and Norman P. Jouppi and Christos Kozyrakis and Jacob Leverich and Robert S. Schreiber", title = "Improving System Energy Efficiency with Memory Rank Subsetting", journal = j-TACO, volume = "9", number = "1", pages = "4:1--4:??", month = mar, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2133382.2133386", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Fri Mar 30 17:45:35 MDT 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "VLSI process technology scaling has enabled dramatic improvements in the capacity and peak bandwidth of DRAM devices. However, current standard DDR x DIMM memory interfaces are not well tailored to achieve high energy efficiency and performance in modern chip-multiprocessor-based computer systems. Their suboptimal performance and energy inefficiency can have a significant impact on system-wide efficiency since much of the system power dissipation is due to memory power. 
New memory interfaces, better suited for future many-core systems, are needed. In response, there are recent proposals to enhance the energy efficiency of main-memory systems by dividing a memory rank into subsets, and making a subset rather than a whole rank serve a memory request. We holistically assess the effectiveness of rank subsetting from system-wide performance, energy-efficiency, and reliability perspectives. We identify the impact of rank subsetting on memory power and processor performance analytically, compare two promising rank-subsetting proposals, Multicore DIMM and mini-rank, and verify our analysis by simulating a chip-multiprocessor system using multithreaded and consolidated workloads. We extend the design of Multicore DIMM for high-reliability systems and show that compared with conventional chipkill approaches, rank subsetting can lead to much higher system-level energy efficiency and performance at the cost of additional DRAM devices. This holistic assessment shows that rank subsetting offers compelling alternatives to existing processor-memory interfaces for future DDR systems.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", } @Article{Bouajjani:2012:ARP, author = "Ahmed Bouajjani and Michael Emmi", title = "Analysis of recursively parallel programs", journal = j-SIGPLAN, volume = "47", number = "1", pages = "203--214", month = jan, year = "2012", CODEN = "SINODQ", DOI = "http://dx.doi.org/10.1145/2103621.2103681", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "We propose a general formal model of isolated hierarchical parallel computations, and identify several fragments to match the concurrency constructs present in real-world programming languages such as Cilk and X10. 
By associating fundamental formal models (vector addition systems with recursive transitions) to each fragment, we provide a common platform for exposing the relative difficulties of algorithmic reasoning. For each case we measure the complexity of deciding state-reachability for finite-data recursive programs, and propose algorithms for the decidable cases. The complexities which include PTIME, NP, EXPSPACE, and 2EXPTIME contrast with undecidable state-reachability for recursive multi-threaded programs.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", remark = "POPL '12 conference proceedings.", } @Article{Farzan:2012:VPC, author = "Azadeh Farzan and Zachary Kincaid", title = "Verification of parameterized concurrent programs by modular reasoning about data and control", journal = j-SIGPLAN, volume = "47", number = "1", pages = "297--308", month = jan, year = "2012", CODEN = "SINODQ", DOI = "http://dx.doi.org/10.1145/2103621.2103693", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Thu Mar 15 18:16:55 MDT 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "In this paper, we consider the problem of verifying thread-state properties of multithreaded programs in which the number of active threads cannot be statically bounded. Our approach is based on decomposing the task into two modules, where one reasons about data and the other reasons about control. The data module computes thread-state invariants (e.g., linear constraints over global variables and local variables of one thread) using the thread interference information computed by the control module. The control module computes a representation of thread interference, as an incrementally constructed data flow graph, using the data invariants provided by the data module. 
These invariants are used to rule out patterns of thread interference that can not occur in a real program execution. The two modules are incorporated into a feedback loop, so that the abstractions of data and interference are iteratively coarsened as the algorithm progresses (that is, they become weaker) until a fixed point is reached. Our approach is sound and terminating, and applicable to programs with infinite state (e.g., unbounded integers) and unboundedly many threads. The verification method presented in this paper has been implemented into a tool, called Duet. We demonstrate the effectiveness of our technique by verifying properties of a selection of Linux device drivers using Duet, and also compare Duet with previous work on verification of parameterized Boolean program using the Boolean abstractions of these drivers.", acknowledgement = ack-nhfb, fjournal = "ACM SIGPLAN Notices", remark = "POPL '12 conference proceedings.", } @Article{Pusukuri:2012:TTD, author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N. Bhuyan", title = "Thread Tranquilizer: Dynamically reducing performance variation", journal = j-TACO, volume = "8", number = "4", pages = "46:1--46:??", month = jan, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2086696.2086725", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "To realize the performance potential of multicore systems, we must effectively manage the interactions between memory reference behavior and the operating system policies for thread scheduling and migration decisions. 
We observe that these interactions lead to significant variations in the performance of a given application, from one execution to the next, even when the program input remains unchanged and no other applications are being run on the system. Our experiments with multithreaded programs, including the TATP database application, SPECjbb2005, and a subset of PARSEC and SPEC OMP programs, on a 24-core Dell PowerEdge R905 server running OpenSolaris confirms the above observation.", acknowledgement = ack-nhfb, articleno = "46", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", } @Article{Radojkovic:2012:EIS, author = "Petar Radojkovi{\'c} and Sylvain Girbal and Arnaud Grasset and Eduardo Qui{\~n}ones and Sami Yehia and Francisco J. Cazorla", title = "On the evaluation of the impact of shared resources in multithreaded {COTS} processors in time-critical environments", journal = j-TACO, volume = "8", number = "4", pages = "34:1--34:??", month = jan, year = "2012", CODEN = "????", DOI = "http://dx.doi.org/10.1145/2086696.2086713", ISSN = "1544-3566 (print), 1544-3973 (electronic)", ISSN-L = "1544-3566", bibdate = "Sat Jan 21 07:49:49 MST 2012", bibsource = "http://portal.acm.org/; http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/taco.bib", abstract = "Commercial Off-The-Shelf (COTS) processors are now commonly used in real-time embedded systems. The characteristics of these processors fulfill system requirements in terms of time-to-market, low cost, and high performance-per-watt ratio. However, multithreaded (MT) processors are still not widely used in real-time systems because the timing analysis is too complex. In MT processors, simultaneously-running tasks share and compete for processor resources, so the timing analysis has to estimate the possible impact that the inter-task interferences have on the execution time of the applications. 
In this paper, we propose a method that quantifies the slowdown that simultaneously-running tasks may experience due to collision in shared processor resources.", acknowledgement = ack-nhfb, articleno = "34", fjournal = "ACM Transactions on Architecture and Code Optimization (TACO)", } %%% ==================================================================== %%% Cross-referenced entries must come last: @Proceedings{Anonymous:1990:PWU, editor = "Anonymous", booktitle = "Proceedings of the Winter 1990 USENIX Conference, Washington, DC, USA, January 22--26, 1990", title = "Proceedings of the Winter 1990 {USENIX} Conference, Washington, {DC}, {USA}, January 22--26, 1990", publisher = pub-USENIX, address = pub-USENIX:adr, pages = "xvi + 374", year = "1990", bibdate = "Sat Sep 28 20:03:34 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, } @Proceedings{IEEE:1990:PSN, editor = "{IEEE}", booktitle = "Proceedings, Supercomputing '90: November 12--16, 1990, New York Hilton at Rockefeller Center, New York, New York", title = "Proceedings, Supercomputing '90: November 12--16, 1990, New York Hilton at Rockefeller Center, New York, New York", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xxv + 982", year = "1990", ISBN = "0-8186-2056-0 (paperback: IEEE Computer Society), 0-89791-412-0 (paperback: ACM)", ISBN-13 = "978-0-8186-2056-0 (paperback: IEEE Computer Society), 978-0-89791-412-3 (paperback: ACM)", LCCN = "QA 76.88 S87 1990", bibdate = "Wed Aug 28 06:48:31 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; University of California MELVYL catalog.", note = "ACM order number 415903. IEEE Computer Society Press order number 2056. 
IEEE catalog number 90CH2916-5.", acknowledgement = ack-nhfb, classification = "C5440 (Multiprocessor systems and techniques); C5470 (Performance evaluation and testing); C6110 (Systems analysis and programming); C7000 (Computer applications)", keywords = "biological applications; computer applications; computer chess; innovative architectures; linear algebra algorithms; memory; networking computing; parallel languages; parallel processing; particle transport; partitioning; performance evaluation; performance visualizations; pipeline processing; program analysis; program restructuring; scheduling; supercomputers --- congresses; vector algorithms", } @Proceedings{Anonymous:1991:PIS, editor = "Anonymous", booktitle = "{Proceedings of the International Symposium on Supercomputing: Fukuoka, Japan, November 6--8, 1991}", title = "{Proceedings of the International Symposium on Supercomputing: Fukuoka, Japan, November 6--8, 1991}", publisher = "Kyushu University Press", address = "Fukuoka, Japan", pages = "iv + 261", year = "1991", ISBN = "4-87378-284-8", ISBN-13 = "978-4-87378-284-3", LCCN = "QA76.88.I1991", bibdate = "Fri Aug 30 08:01:51 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, keywords = "Supercomputers --- Congresses", } @Proceedings{Watt:1991:IPI, editor = "Stephen M. 
Watt", booktitle = "ISSAC '91: proceedings of the 1991 International Symposium on Symbolic and Algebraic Computation, July 15--17, 1991, Bonn, Germany", title = "{ISSAC} '91: proceedings of the 1991 International Symposium on Symbolic and Algebraic Computation, July 15--17, 1991, Bonn, Germany", publisher = pub-ACM, address = pub-ACM:adr, pages = "xiii + 468", year = "1991", ISBN = "0-89791-437-6", ISBN-13 = "978-0-89791-437-6", LCCN = "QA 76.95 I59 1991", bibdate = "Thu Sep 26 06:00:06 MDT 1996", bibsource = "http://www.math.utah.edu/pub/bibnet/authors/d/dirac-p-a-m.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", abstract = "The following topics were dealt with: algorithms for symbolic mathematical computation; languages, systems and packages; computational geometry, group theory and number theory; automatic theorem proving and programming; interface of symbolics, numerics and graphics; applications in mathematics, science and engineering; and symbolic and algebraic computation in education.", acknowledgement = ack-nhfb, classification = "C1160 (Combinatorial mathematics); C4130 (Interpolation and function approximation); C4210 (Formal logic); C4240 (Programming and algorithm theory); C7310 (Mathematics)", confdate = "15--17 July 1991", conflocation = "Bonn, Germany", confsponsor = "ACM", keywords = "algebra --- data processing --- congresses; Algebraic computation; Algorithms; Automatic theorem proving; Computational geometry; Education; Engineering; Graphics; Group theory; Languages; Mathematics; mathematics --- data processing --- congresses; Number theory; Programming; Science; Symbolic mathematical computation; Symbolics", pubcountry = "USA", thesaurus = "Computational complexity; Formal languages; Interpolation; Number theory; Polynomials; Symbol manipulation", } @Proceedings{ACM:1992:CPI, editor = "{ACM}", booktitle = "Conference proceedings / 1992 International Conference on Supercomputing, July 19--23, 1992, Washington, DC", title =
"Conference proceedings / 1992 International Conference on Supercomputing, July 19--23, 1992, Washington, {DC}", publisher = pub-ACM, address = pub-ACM:adr, pages = "x + 485", year = "1992", ISBN = "0-89791-485-6 (paperback), 0-89791-486-4", ISBN-13 = "978-0-89791-485-7 (paperback), 978-0-89791-486-4", LCCN = "QA 76.88 I57 1992", bibdate = "Wed Aug 28 06:48:31 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; University of California MELVYL catalog.", note = "Sponsored by ACM SIGARCH.", acknowledgement = ack-nhfb, keywords = "supercomputers --- congresses", } @Proceedings{IEEE:1992:PSM, editor = "{IEEE Computer Society. Technical Committee on Computer Architecture}", booktitle = "Proceedings, Supercomputing '92: Minneapolis, Minnesota, November 16--20, 1992", title = "Proceedings, Supercomputing '92: Minneapolis, Minnesota, November 16--20, 1992", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xxiv + 848", year = "1992", ISBN = "0-8186-2632-1 (case), 0-8186-2630-5 (paper), 0-8186-2631-3 (microfiche), 0-89791-537-2 (ACM Library series)", ISBN-13 = "978-0-8186-2632-6 (case), 978-0-8186-2630-2 (paper), 978-0-8186-2631-9 (microfiche), 978-0-89791-537-3 (ACM Library series)", LCCN = "QA76.5 .S894 1992", bibdate = "Wed Aug 28 06:48:31 MDT 1996", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; University of California MELVYL catalog.", note = "Cover title: Supercomputing '91. ACM order number 415922.
IEEE Computer Society Press order number 2630. IEEE catalog number 92CH3216-9.", acknowledgement = ack-nhfb, keywords = "artificial intelligence; biosciences; cache; compiling; distributed computing; fluids; industrial modeling; instruction-level optimization; interconnections; massively parallel systems; multiprocessing programs; multiprocessing systems; numerical applications; parallel algorithms; parallel programming; parallelizing transformations; particles; performance evaluation; performance methodology; register efficiency; scheduling; sparse matrix algorithms; supercomputers --- congresses; symbolic algorithms; waves", } @Proceedings{ACM:1993:CRT, editor = "{ACM}", key = "ACM SIGPLAN POPL '93", booktitle = "Conference record of the Twentieth Annual {ACM} {SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the symposium, {Charleston, South Carolina}, {January} 10--13, 1993", title = "Conference record of the Twentieth Annual {ACM} {SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the symposium, {Charleston, South Carolina}, {January} 10--13, 1993", publisher = pub-ACM, address = pub-ACM:adr, pages = "viii + 510", year = "1993", ISBN = "0-89791-560-7 (soft cover), 0-89791-561-5 (series hard cover)", ISBN-13 = "978-0-89791-560-1 (soft cover), 978-0-89791-561-8 (series hard cover)", LCCN = "QA76.7 .A15 1993", bibdate = "Mon May 03 18:38:48 1999", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "ACM order number 549930.", URL = "http://www.acm.org/pubs/contents/proceedings/plan/158511/index.html", acknowledgement = ack-nhfb, classification = "C4210 (Formal logic); C4240 (Programming and algorithm theory); C6110 (Systems analysis and programming); C6140D (High level languages); C6150C (Compilers, interpreters and other processors); C6170 (Expert systems)", confdate = "10--13 Jan.
1993", conflocation = "Charleston, SC, USA", confsponsor = "ACM", keywords = "Compilers; Computational complexity; electronic digital computers --- programming --- congresses; Functional programming; Lambda calculus; Lazy evaluation; Logic programming; Object-oriented languages; Parallel computing; Parametricity; Polymorphism; Program testing/debugging; Programming language principles; programming languages (electronic computers) --- congresses; Register allocation; Typed languages", thesaurus = "Computational complexity; High level languages; Lambda calculus; Program compilers; Programming; Programming theory; Storage allocation", } @Proceedings{ACM:1993:PTF, editor = "{ACM}", booktitle = "{Proceedings of the twenty-fifth annual {ACM} Symposium on the Theory of Computing, San Diego, California, May 16--18, 1993}", title = "{Proceedings of the twenty-fifth annual {ACM} Symposium on the Theory of Computing, San Diego, California, May 16--18, 1993}", publisher = pub-ACM, address = pub-ACM:adr, pages = "ix + 812", year = "1993", ISBN = "0-89791-591-7", ISBN-13 = "978-0-89791-591-5", LCCN = "QA 76.6 A13 1993", bibdate = "Thu Dec 3 07:11:18 MST 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "ACM order no. 508930.", acknowledgement = ack-nhfb, keywords = "computational complexity --- congresses", } @Proceedings{ACM:1993:TCS, editor = "{ACM}", booktitle = "TRI-Ada '93: Conference --- September 1993, Seattle, WA", title = "{TRI}-Ada '93: Conference --- September 1993, Seattle, {WA}", publisher = pub-ACM, address = pub-ACM:adr, pages = "vii + 482", year = "1993", ISBN = "0-89791-621-2", ISBN-13 = "978-0-89791-621-9", LCCN = "????", bibdate = "Thu Sep 04 12:56:10 1997", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", note = "ACM Order No.
825930.", series = "TRIADA -proceedings- 1993", acknowledgement = ack-nhfb, sponsor = "Association for Computing Machinery; SIGAda.", } @Proceedings{IEEE:1993:PSP, editor = "{IEEE}", key = "Supercomputing'93", booktitle = "Proceedings, Supercomputing '93: Portland, Oregon, November 15--19, 1993", title = "Proceedings, Supercomputing '93: Portland, Oregon, November 15--19, 1993", publisher = pub-IEEE, address = pub-IEEE:adr, pages = "xxii + 935", year = "1993", ISBN = "0-8186-4340-4 (paperback), 0-8186-4341-2 (microfiche), 0-8186-4342-0 (hardback), 0-8186-4346-3 (CD-ROM)", ISBN-13 = "978-0-8186-4340-8 (paperback), 978-0-8186-4341-5 (microfiche), 978-0-8186-4342-2 (hardback), 978-0-8186-4346-0 (CD-ROM)", ISSN = "1063-9535", LCCN = "QA76.5 .S96 1993", bibdate = "Mon Jan 15 11:06:21 1996", bibsource = "ftp://ftp.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib", acknowledgement = ack-nhfb, classification = "631.1; 722.1; 722.3; 722.4; 723.2; 921.6", keywords = "Algorithms; Cache coherence; Clustered workstations; Computer graphics; Computer networks; Computer programming languages; Data parallel compilers; Data partitioning; Distributed computer systems; Eigenvalues and eigenfunctions; Finite element method; Flow visualization; Fluid mechanics; Linear algebra; Mass storage; Massively parallel processors; Natural sciences computing; Parallel languages; Parallel processing systems; Parallel rendering; Program compilers; Quantum theory; Scheduling; Sparse matrices; Supercomputers", sponsor = "Institute of Electrical and Electronics Engineers; Computer Society. Association for Computing Machinery; SIGARCH.", } @Proceedings{ACM:1994:ASC, editor = "{ACM}", booktitle = "{ACM SIGPLAN '94 Conference on Programming Language Design and Implementation (PLDI). Orlando, FL, USA, 20--24 June, 1994}", title = "{ACM SIGPLAN '94 Conference on Programming Language Design and Implementation (PLDI). 
Orlando, FL, USA, 20--24 June, 1994}", volume = "29(6)", publisher = pub-ACM, address = pub-ACM:adr, pages = "360", month = jun, year = "1994", CODEN = "SINODQ", ISSN = "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)", ISSN-L = "0362-1340", bibdate = "Fri Apr 24 18:36:02 MDT 1998", bibsource = "http://www.math.utah.edu/pub/tex/bib/multithreading.bib", series = j-SIGPLAN, acknowledgement = ack-nhfb, classification = "C4240 (Programming and algorithm theory); C6110 (Systems analysis and programming); C6140D (High level languages); C6150C (Compilers, interpreters and other processors); C6150G (Diagnostic, testing, debugging and evaluating systems)", conftitle = "ACM SIGPLAN '94 Conference on Programming Language Design and Implementation (PLDI)", keywords = "address calculation; array access errors; backtracking; cache performance; CLP; code replication; compilation techniques; continuation passing; garbage collected programs; high level languages; jump debugging; jump statements; lazy functional state threads; link-time optimisation; memory access coalescing; optimal tracing; optimisation; partial dead code elimination; pointer-based data structures; Presburger Formulas; program analysis tools; program compilers; program debugging; program optimisation; program structure tree; programming; programming language design; programming theory; programming theory program debugging; Prolog; register allocation; slicing programs; Standard ML; type analysis; zero-cost range splitting", sponsororg = "ACM", treatment = "P Practical; T Theoretical or Mathematical", } @Proceedings{ACM:1994:CRP, editor = "{ACM}", booktitle = "Conference record of {POPL} '94, 21st {ACM SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the Symposium: Portland, Oregon, January 17--21, 1994", title = "Conference record of {POPL} '94, 21st {ACM SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the Symposium: Portland, 
Oregon, January 17--21, 1994",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "viii + 492",
  year =         "1994",
  ISBN =         "0-89791-636-0",
  ISBN-13 =      "978-0-89791-636-3",
  LCCN =         "QA76.7 .A15 1994",
  bibdate =      "Sat Sep 7 07:51:54 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.acm.org/pubs/contents/proceedings/plan/174675/index.html",
  abstract =     "The following topics were dealt with: programming language principles; OOP; type theory; program correctness; lambda calculus; garbage collection; logic programming; scheduling; data flow graphs; functional programming; and continuation passing.",
  acknowledgement = ack-nhfb,
  classification = "C4210 (Formal logic); C4240 (Programming and algorithm theory); C6110J (Object-oriented programming); C6120 (File organisation); C6140D (High level languages); C6150C (Compilers, interpreters and other processors)",
  confdate =     "17--21 Jan. 1994",
  conflocation = "Portland, OR, USA",
  confsponsor =  "ACM",
  keywords =     "Continuation passing; Data flow graphs; Functional programming; Garbage collection; Lambda calculus; Logic programming; OOP; Program correctness; Programming language principles; Scheduling; Type theory",
  thesaurus =    "High level languages; Lambda calculus; Object-oriented programming; Program compilers; Program verification; Storage management; Type theory",
}

@Proceedings{ACM:1994:IPI,
  editor =       "{ACM}",
  booktitle =    "{ISSAC '94: Proceedings of the 1994 International Symposium on Symbolic and Algebraic Computation: July 20--22, 1994, Oxford, England, United Kingdom}",
  title =        "{ISSAC '94: Proceedings of the 1994 International Symposium on Symbolic and Algebraic Computation: July 20--22, 1994, Oxford, England, United Kingdom}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "ix + 359",
  year =         "1994",
  ISBN =         "0-89791-638-7",
  ISBN-13 =      "978-0-89791-638-7",
  LCCN =         "QA76.95.I59 1994",
  bibdate =      "Thu Sep 26 05:45:15 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  confdate =     "20--22 July 1994",
  conflocation = "Oxford, UK",
  confsponsor =  "ACM",
  pubcountry =   "USA",
}

%%% ASPLOS-VI proceedings.  The corporate editor is brace-protected, and
%%% conflocation repeats the venue named in the title.
@Proceedings{ACM:1994:SIC,
  editor =       "{ACM}",
  booktitle =    "{Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI). San Jose, CA, USA, 4--7 October, 1994}",
  title =        "{Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI). San Jose, CA, USA, 4--7 October, 1994}",
  volume =       "29(11)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "328",
  month =        nov,
  year =         "1994",
  CODEN =        "SINODQ",
  ISSN =         "0362-1340 (print), 1523-2867 (print), 1558-1160 (electronic)",
  ISSN-L =       "0362-1340",
  bibdate =      "Fri Apr 24 18:36:02 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       j-SIGPLAN,
  acknowledgement = ack-nhfb,
  classification = "C5220 (Computer architecture); C6140 (Programming languages); C6150J (Operating systems)",
  conflocation = "San Jose, CA, USA",
  conftitle =    "Sixth International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS-VI)",
  keywords =     "architectural support; code transformation; computer architecture; instrumentation; measurement; memory access; multithreading; operating systems; operating systems (computers); parallel machines; programming languages; shared memory multiprocessors; uniprocessor performance",
  sponsororg =   "ACM; IEEE Comput. Soc",
}

@Proceedings{Anonymous:1994:ICS,
  editor =       "Anonymous",
  booktitle =    "1994 International Computer Symposium Conference Proceedings",
  title =        "1994 International Computer Symposium Conference Proceedings",
  publisher =    "Nat.
Chiao Tung Univ",
  address =      "Hsinchu, Taiwan",
  pages =        "xvi + 1310",
  year =         "1994",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Sun Dec 22 10:19:23 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "2 vol.",
  acknowledgement = ack-nhfb,
  confdate =     "12--15 Dec. 1994",
  conflocation = "Hsinchu, Taiwan",
  confsponsor =  "Ministr. Educ.; Comput. Soc",
  pubcountry =   "Taiwan",
}

%%% The title uses the same em-dash (---) as the booktitle, so the two
%%% fields differ only in brace protection.
@Proceedings{Anonymous:1994:USC,
  editor =       "Anonymous",
  booktitle =    "USENIX Summer conference: --- June 1994, Boston, MA",
  title =        "{USENIX} Summer conference: --- June 1994, Boston, {MA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "316",
  year =         "1994",
  ISBN =         "1-880446-62-6",
  ISBN-13 =      "978-1-880446-62-1",
  LCCN =         "QA 76.76 O63 U83 1994",
  bibdate =      "Sat May 25 07:59:58 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "USENIX Conference Proceedings 1994",
  acknowledgement = ack-nhfb,
}

@Proceedings{Goldwasser:1994:PAS,
  editor =       "Shafi Goldwasser",
  booktitle =    "Proceedings: 35th Annual Symposium on Foundations of Computer Science, November 20--22, 1994, Santa Fe, New Mexico",
  title =        "Proceedings: 35th Annual Symposium on Foundations of Computer Science, November 20--22, 1994, Santa Fe, New Mexico",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xiii + 837",
  year =         "1994",
  CODEN =        "ASFPDV",
  ISBN =         "0-8186-6582-3",
  ISBN-13 =      "978-0-8186-6582-0",
  ISSN =         "0272-5428",
  LCCN =         "QA 76 S979 1994",
  bibdate =      "Thu Dec 3 07:11:18 MST 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE catalog number 94CH35717.
IEEE Computer Society Press Order Number 6580-02.",
  acknowledgement = ack-nhfb,
  keywords =     "electronic data processing --- congresses",
}

@Proceedings{IEEE:1994:PSH,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings of the Scalable High-Performance Computing Conference, May 23--25, 1994, Knoxville, Tennessee}",
  title =        "{Proceedings of the Scalable High-Performance Computing Conference, May 23--25, 1994, Knoxville, Tennessee}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xviii + 852",
  year =         "1994",
  ISBN =         "0-8186-5680-8, 0-8186-5681-6",
  ISBN-13 =      "978-0-8186-5680-4, 978-0-8186-5681-1",
  LCCN =         "QA76.5 .S244 1994",
  bibdate =      "Mon Aug 26 10:38:41 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE catalog number 94TH0637-9.",
  acknowledgement = ack-nhfb,
  sponsor =      "IEEE Computer Society; Technical Committee on Supercomputing Applications.",
}

@Proceedings{IEEE:1994:PSW,
  editor =       "{IEEE}",
  booktitle =    "{Proceedings, {Supercomputing '94: Washington, DC, November 14--18, 1994}}",
  title =        "{Proceedings, {Supercomputing '94: Washington, DC, November 14--18, 1994}}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xvii + 823",
  year =         "1994",
  ISBN =         "0-8186-6605-6 (paper), 0-8186-6606-4 (microfiche), 0-8186-6607-2 (case)",
  ISBN-13 =      "978-0-8186-6605-6 (paper), 978-0-8186-6606-3 (microfiche), 978-0-8186-6607-0 (case)",
  ISSN =         "1063-9535",
  LCCN =         "QA76.5 .S894 1994",
  bibdate =      "Fri Aug 30 08:01:51 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE catalog number 94CH34819.",
  series =       "Supercomputing",
  acknowledgement = ack-nhfb,
  keywords =     "Supercomputers --- Congresses",
  sponsor =      "IEEE.",
}

%%% Corporate editor is brace-protected so it is read as one indivisible name.
@Proceedings{IEEE:1994:ROS,
  editor =       "{IEEE}",
  booktitle =    "Real-time operating systems and software: RTOSS '94: 11th Workshop --- May 1994, Seattle, WA",
  title =        "Real-time operating systems and software: {RTOSS} '94: 11th Workshop --- May 1994, Seattle, {WA}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "viii + 117",
  year =         "1994",
  ISBN =         "0-8186-5710-3",
  ISBN-13 =      "978-0-8186-5710-8",
  LCCN =         "QA76.54.I173 1994",
  bibdate =      "Sat May 25 07:59:58 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "IEEE Workshop on Real Time Operating Systems and Software 1994; 11th",
  acknowledgement = ack-nhfb,
  sponsor =      "IEEE; Computer Society; Technical Committee on Real-Time Systems.",
}

%%% Corporate editor is brace-protected so it is read as one indivisible name.
@Proceedings{ACM:1995:CPI,
  editor =       "{ACM}",
  booktitle =    "Conference proceedings of the 1995 International Conference on Supercomputing, Barcelona, Spain, July 3--7, 1995",
  title =        "Conference proceedings of the 1995 International Conference on Supercomputing, Barcelona, Spain, July 3--7, 1995",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xii + 448",
  year =         "1995",
  ISBN =         "0-89791-728-6",
  ISBN-13 =      "978-0-89791-728-5",
  LCCN =         "QA 76.88 I57 1995",
  bibdate =      "Mon Dec 23 18:50:57 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "Conference Proceedings of the International Conference on Supercomputing",
  acknowledgement = ack-nhfb,
  sponsor =      "Association for Computing Machinery.
Special Interest Group on Computer Architecture.",
}

@Proceedings{ACM:1995:CRP,
  editor =       "{ACM}",
  booktitle =    "Conference record of {POPL} '95, 22nd {ACM} {SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the Symposium: San Francisco, California, January 22--25, 1995",
  title =        "Conference record of {POPL} '95, 22nd {ACM} {SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the Symposium: San Francisco, California, January 22--25, 1995",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "vii + 408",
  year =         "1995",
  ISBN =         "0-89791-692-1",
  ISBN-13 =      "978-0-89791-692-9",
  LCCN =         "QA 76.7 A11 1995",
  bibdate =      "Mon May 3 17:47:49 MDT 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "ACM order number: 549950.",
  URL =          "http://www.acm.org/pubs/contents/proceedings/plan/199448/index.html",
  acknowledgement = ack-nhfb,
  alttitle =     "Proceedings, 22nd ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages POPL '95",
  annote =       "Sponsored by the Association for Computing Machinery, Special Interest Group on Algorithms and Computation Theory (SIGACT), Special Interest Group on Programming Languages (SIGPLAN).",
  keywords =     "Programming languages (Electronic computers) -- Congresses.",
}

@Proceedings{IEEE:1995:PCL,
  editor =       "{IEEE Computer Society.
Technical Committee on Computer Communications}",
  booktitle =    "Proceedings: 20th Conference on Local Computer Networks, October 16--19, 1995, Minneapolis, Minnesota",
  title =        "Proceedings: 20th Conference on Local Computer Networks, October 16--19, 1995, Minneapolis, Minnesota",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xii + 496",
  year =         "1995",
  ISBN =         "0-8186-7163-7 (microfiche), 0-8186-7162-9",
  ISBN-13 =      "978-0-8186-7163-0 (microfiche), 978-0-8186-7162-3",
  LCCN =         "TK5105.7 .C66 1995 Bar",
  bibdate =      "Mon Sep 27 06:55:07 MDT 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "IEEE Computer Society Press order number PR07162. IEEE catalog number 95TB100005",
  acknowledgement = ack-nhfb,
  keywords =     "local area networks (computer networks) -- congresses",
}

@Proceedings{ACM:1996:FCP,
  editor =       "{ACM}",
  booktitle =    "FCRC '96: Conference proceedings of the 1996 International Conference on Supercomputing: Philadelphia, Pennsylvania, {USA}, May 25--28, 1996",
  title =        "{FCRC} '96: Conference proceedings of the 1996 International Conference on Supercomputing: Philadelphia, Pennsylvania, {USA}, May 25--28, 1996",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xii + 406",
  year =         "1996",
  ISBN =         "0-89791-803-7",
  ISBN-13 =      "978-0-89791-803-9",
  LCCN =         "QA76.5 I61 1996",
  bibdate =      "Wed Mar 18 12:33:29 MST 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "ACM order number 415961.",
  acknowledgement = ack-nhfb,
  keywords =     "Supercomputers --- Congresses.",
}

@Proceedings{IEEE:1996:PSM,
  editor =       "{IEEE}",
  booktitle =    "Proceedings. Second MPI Developer's Conference: Notre Dame, IN, USA, 1--2 July 1996",
  title =        "Proceedings.
Second {MPI} Developer's Conference: Notre Dame, {IN}, {USA}, 1--2 July 1996",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "ix + 207",
  year =         "1996",
  ISBN =         "0-8186-7533-0",
  ISBN-13 =      "978-0-8186-7533-1",
  LCCN =         "QA76.642 .M67 1996",
  bibdate =      "Tue May 12 08:56:04 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Distributed Process",
}

%%% The corporate editor is one brace group, so BibTeX does not split it
%%% into a given name and a surname.
%%% NOTE(review): USENIX:1996:ATT and USENIX:1996:PFA carry the same ISBN
%%% (1-880446-78-2) and appear to describe the same workshop proceedings;
%%% both keys are kept in case either is cited elsewhere.
@Proceedings{USENIX:1996:ATT,
  editor =       "{USENIX Association}",
  booktitle =    "4th Annual Tcl/Tk Workshop '96, July 10--13, 1996. Monterey, CA",
  title =        "4th Annual Tcl/Tk Workshop '96, July 10--13, 1996. Monterey, {CA}",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "????",
  day =          "10--13",
  month =        jul,
  year =         "1996",
  ISBN =         "1-880446-78-2",
  ISBN-13 =      "978-1-880446-78-2",
  LCCN =         "QA76.73.T44 T44 1996",
  bibdate =      "Fri Oct 18 07:24:24 MDT 1996",
  bibsource =    "ftp://ftp.uu.net/library/bibliography; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  location =     "Monterey, CA",
}

@Proceedings{USENIX:1996:PFA,
  editor =       "{USENIX}",
  booktitle =    "Proceedings of the fourth annual Tcl\slash Tk Workshop, July 10--13, 1996, Monterey, California",
  title =        "Proceedings of the fourth annual Tcl\slash Tk Workshop, July 10--13, 1996, Monterey, California",
  publisher =    pub-USENIX,
  address =      pub-USENIX:adr,
  pages =        "235",
  year =         "1996",
  ISBN =         "1-880446-78-2",
  ISBN-13 =      "978-1-880446-78-2",
  LCCN =         "QA 76.73 T44 T35 1996",
  bibdate =      "Mon May 11 11:50:25 1998",
  bibsource =    "ftp://ftp.uu.net/library/bibliography; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.usenix.org/publications/library/proceedings/tcl96/",
  acknowledgement = ack-nhfb,
  location =     "Monterey, CA",
}

@Proceedings{IEEE:1997:APD,
  editor =       "{IEEE}",
  booktitle =    "Advances in parallel and distributed computing: March 19--21, 1997, Shanghai, China: proceedings",
  title =        "Advances in parallel and distributed
computing: March 19--21, 1997, Shanghai, China: proceedings",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xii + 426",
  year =         "1997",
  ISBN =         "0-8186-7876-3 (paperback and case), 0-8186-7878-X (microfiche)",
  ISBN-13 =      "978-0-8186-7876-9 (paperback and case), 978-0-8186-7878-3 (microfiche)",
  LCCN =         "QA76.58 .A4 1997",
  bibdate =      "Wed Apr 16 07:34:31 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "electronic data processing -- distributed processing -- congresses; parallel processing (electronic computers) -- congresses",
}

@Proceedings{ACM:1998:AWJ,
  editor =       "{ACM}",
  booktitle =    "ACM 1998 Workshop on Java for High-Performance Network Computing",
  title =        "{ACM} 1998 Workshop on Java for High-Performance Network Computing",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "????",
  year =         "1998",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu Apr 27 10:40:59 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "Possibly unpublished, except electronically.",
  URL =          "http://www.cs.ucsb.edu/conferences/java98/program.html",
  acknowledgement = ack-nhfb,
}

%%% Corporate editor is brace-protected so it is read as one indivisible name.
@Proceedings{ACM:1998:CRP,
  editor =       "{ACM}",
  booktitle =    "Conference record of POPL '98: the 25th ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages: papers presented at the Symposium, San Diego, California, 19--21 January 1998",
  title =        "Conference record of {POPL} '98: the 25th {ACM} {SIGPLAN-SIGACT} Symposium on Principles of Programming Languages: papers presented at the Symposium, San Diego, California, 19--21 January 1998",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "viii + 408",
  year =         "1998",
  ISBN =         "0-89791-979-3",
  ISBN-13 =      "978-0-89791-979-1",
  LCCN =         "QA76.7 .A15 1998",
  bibdate =      "Mon May 3 17:47:49 MDT 1999",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  note =         "ACM order number: 549981.",
  URL =          "http://www.acm.org/pubs/contents/proceedings/plan/268946/index.html",
  acknowledgement = ack-nhfb,
  alttitle =     "POPL '98 ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages Principles of programming languages Proceedings 25th ACM SIGPLAN-SIGACT Symposium on Principles of Programming Languages",
  keywords =     "Electronic digital computers -- Programming -- Congresses.; Programming languages (Electronic computers) -- Congresses.",
}

@Proceedings{ACM:1998:PAI,
  editor =       "{ACM}",
  booktitle =    "{Proceedings: the 25th Annual International Symposium on Computer Architecture, June 27--July 1, 1998, Barcelona, Spain}",
  title =        "{Proceedings: the 25th Annual International Symposium on Computer Architecture, June 27--July 1, 1998, Barcelona, Spain}",
  volume =       "26(3)",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "xiii + 394",
  year =         "1998",
  ISBN =         "0-8186-8491-7, 0-8186-8492-5, 0-8186-8493-3",
  ISBN-13 =      "978-0-8186-8491-3, 978-0-8186-8492-0, 978-0-8186-8493-7",
  LCCN =         "QA76.9.A73 S97 1998",
  bibdate =      "Fri May 12 12:36:10 MDT 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; z3950.bibsys.no:2100/BIBSYS",
  note =         "ACM Order Number 414984.
IEEE Computer Society Order Number PR08491; IEEE Order Plan Catalog Number 98CB36235.",
  series =       "Computer architecture news",
  URL =          "http://portal.acm.org/toc.cfm?id=279358; http://portal.acm.org/toc.cfm?id=285930",
  acknowledgement = ack-nhfb,
  remark =       "ISCA '25 proceedings.",
}

@Proceedings{ACM:1998:SHP,
  editor =       "{ACM}",
  booktitle =    "SC'98: High Performance Networking and Computing: Proceedings of the 1998 ACM\slash IEEE SC98 Conference: Orange County Convention Center, Orlando, Florida, USA, November 7--13, 1998",
  title =        "{SC}'98: High Performance Networking and Computing: Proceedings of the 1998 {ACM}\slash {IEEE} {SC98} Conference: Orange County Convention Center, Orlando, Florida, {USA}, November 7--13, 1998",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "1998",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Wed Oct 07 08:51:34 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  URL =          "http://www.supercomp.org/sc98/papers/",
  acknowledgement = ack-nhfb,
}

%%% Corporate editor is brace-protected so it is read as one indivisible name.
@Proceedings{ACM:1999:PASa,
  editor =       "{ACM}",
  booktitle =    "Proceedings of the ACM SIGPLAN '99 Conference on Programming Language Design and Implementation (PLDI '99), Atlanta, Georgia, 2--4 May 1999",
  title =        "Proceedings of the {ACM} {SIGPLAN} '99 Conference on Programming Language Design and Implementation ({PLDI} '99), Atlanta, Georgia, 2--4 May 1999",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "????",
  year =         "1999",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu May 13 14:45:29 1999",
  bibsource =    "http://www.acm.org/pubs/contents/proceedings/pldi/301122/index.html; http://www.acm.org/pubs/contents/proceedings/pldi/301618/index.html; http://www.cs.rutgers.edu/pldi99/program.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:1999:SPO,
  editor =       "{ACM}",
  booktitle =    "SC'99: Oregon Convention Center 777 NE Martin Luther
King Jr. Boulevard, Portland, Oregon, November 11--18, 1999",
  title =        "{SC}'99: Oregon Convention Center 777 {NE} Martin Luther King Jr. Boulevard, Portland, Oregon, November 11--18, 1999",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "1999",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Thu Feb 24 09:35:00 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Dongarra:1999:RAP,
  editor =       "J. J. Dongarra and E. Luque and Tomas Margalef",
  booktitle =    "{Recent advances in parallel virtual machine and message passing interface: 6th European {PVM}\slash {MPI} Users' Group Meeting, Barcelona, Spain, September 26--29, 1999: Proceedings}",
  title =        "{Recent advances in parallel virtual machine and message passing interface: 6th European {PVM}\slash {MPI} Users' Group Meeting, Barcelona, Spain, September 26--29, 1999: Proceedings}",
  volume =       "1697",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xvii + 551",
  year =         "1999",
  CODEN =        "LNCSD9",
  DOI =          "????",
  ISBN =         "3-540-66549-8 (softcover)",
  ISBN-13 =      "978-3-540-66549-6 (softcover)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58 E973 1999",
  bibdate =      "Wed Dec 8 06:34:56 MST 1999",
  bibsource =    "ftp://ftp.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       ser-LNCS,
  URL =          "http://link.springer-ny.com/link/service/series/0558/tocs/t1697.htm; http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=1697",
  acknowledgement = ack-nhfb,
  alttitle =     "PVM\slash MPI '99",
  keywords =     "Data transmission systems; Parallel computers; Virtual computer systems",
}

@Proceedings{Anonymous:2000:CCI,
  editor =       "Anonymous",
  booktitle =    "Cool Chips III: An International Symposium on Low-Power and High-Speed Chips, Kikai-Shinko-Kaikan, Tokyo, Japan April 24--25, 2000",
  title =        "Cool Chips {III}: An
International Symposium on Low-Power and High-Speed Chips, Kikai-Shinko-Kaikan, Tokyo, Japan April 24--25, 2000",
  publisher =    "????",
  address =      "????",
  pages =        "????",
  year =         "2000",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Mon Jan 08 09:19:21 2001",
  bibsource =    "http://www.coolchips.org/index-cool3.html; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Book{Koniges:2000:ISP,
  editor =       "Alice E. Koniges",
  booktitle =    "Industrial Strength Parallel Computing",
  title =        "Industrial Strength Parallel Computing",
  publisher =    pub-MORGAN-KAUFMANN,
  address =      pub-MORGAN-KAUFMANN:adr,
  pages =        "xxv + 597",
  year =         "2000",
  ISBN =         "1-55860-540-1",
  ISBN-13 =      "978-1-55860-540-4",
  LCCN =         "QA76.58 .I483 2000",
  bibdate =      "Fri Feb 04 18:30:40 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:2001:PAJ,
  editor =       "{ACM}",
  booktitle =    "Proceedings of the {ACM 2001 Java Grande\slash ISCOPE Conference: Palo Alto, Calif., June 2--4, 2001}",
  title =        "Proceedings of the {ACM 2001 Java Grande\slash ISCOPE Conference: Palo Alto, Calif., June 2--4, 2001}",
  publisher =    pub-ACM,
  address =      pub-ACM:adr,
  pages =        "vi + 186",
  year =         "2001",
  ISBN =         "1-58113-359-6",
  ISBN-13 =      "978-1-58113-359-2",
  LCCN =         "QA76.9.O35 A26 2001",
  bibdate =      "Mon May 6 06:26:30 MDT 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
  keywords =     "Java (computer program language) -- congresses; object-oriented methods (computer science) -- congresses",
}

@Proceedings{Boisvert:2001:ASS,
  editor =       "Ronald F.
Boisvert and Ping Tak Peter Tang",
  booktitle =    "The architecture of scientific software: {IFIP TC2/WG2.5 Working Conference on the Architecture of Scientific Software, October 2--4, 2000, Ottawa, Canada}",
  title =        "The architecture of scientific software: {IFIP TC2/WG2.5 Working Conference on the Architecture of Scientific Software, October 2--4, 2000, Ottawa, Canada}",
  volume =       "60",
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        "xx + 358",
  year =         "2001",
  ISBN =         "0-7923-7339-1",
  ISBN-13 =      "978-0-7923-7339-1",
  LCCN =         "QA76.758 .I345 2000",
  bibdate =      "Fri May 27 08:46:38 2005",
  bibsource =    "ftp://ftp.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       "IFIP",
  acknowledgement = ack-nhfb,
  tableofcontents = "Preface (p. ix)\\ Contributing Authors (p. xv)\\ Part I: Large-Scale Systems Integration\\ Network-Based Scientific Computing: Elias N. Houstis, Ann Christine Catlin, Ganesh Balakrishnan, Nitesh Dhanjani, GaHyun Park, John R. Rice, Spyros Lalis, Manolis Stamatogiannakis, Catherine E. Houstis (pp. 3--28) \\ Future Generations of Problem-Solving Environments: Jos{\'e} C. Cunha (pp. 29--38) \\ Developing an Architecture to Support the Implementation and Development of Scientific computing Applications: Dorian C. Arnold, Jack Dongarra (pp. 39--56) \\ PETSc and Overture: Lessons Learned Developing an Interface between Components: Kristopher R. Buschelman, William Gropp, Lois C. McInnes, Barry F. Smith (pp. 57--68) \\ Component Technology for High-Performance Scientific Simulation Software: Tom Epperly, Scott R. Kohn, Gary Kumfert (pp. 69--86) \\ A New Approach to Software Integration Frameworks for Multi-physics Simulation Codes: Eric de Sturler, Jay Hoeflinger, Laxmikant V. Kal{\'e}, Milind Bhandarkar (pp. 87--104) \\ Code Coupling using Parallel CORBA Objects: Christophe Ren{\'e}, Thierry Priol, Guillaume All{\'e}on (pp.
105--118) \\ A Collaborative Code Development Environment for Computational Electro-magnetics: Matthew S. Shields, Omer F. Rana, David W. Walker, David Colby (pp. 119--144) \\ Part II: The Architecture of Components\\ On the Role of Mathematical Abstractions for Scientific Computing: Krister {\AA}hlander, Magne Haveraaen, Hans Z. Munthe-Kaas (pp. 145--158) \\ Object-oriented Modeling of Parallel PDE Solvers: Michael Thun{\'e}, Krister {\AA}hlander, Malin Ljungberg, Markus Nord{\'e}n, Kurt Otto, Jarmo Rantakokko (pp. 159--174) \\ Broadway: A Software Architecture for Scientific Computing: Samuel Z. Guyer, Calvin Lin (pp. 175--192) \\ Formal Methods for High-Performance Linear Algebra Libraries: John A. Gunnels, Robert A. van de Geijn (pp. 193--210) \\ New Generalized Matrix Data Structures Lead to a Variety of High-Performance Algorithms: Fred G. Gustavson (pp. 211--234) \\ A Comprehensive DFT API for Scientific Computing: Ping Tak Peter Tang (pp. 235--256) \\ Using A Fortran Interface to POSIX Threads: Richard J. Hanson, Clay P. Breshears, Henry A. Gabb (pp. 257--272) \\ Data Management Systems for Scientific Applications: Reagan Moore (pp. 273--284) \\ Software Components for Application Development: Arnaud Desitter, Antoine Le Hyaric, Geoff Morgan, Gareth Shaw, Anne E. Trefethen (pp. 285--300) \\ Hierarchical Representation and Computation of Approximate Solutions in Scientific Simulations: Wayne H. Enright (pp. 301--316) \\ Software Architecture for the Investigation of Controllable Models with Complex Data Sets: Dmitry Belyshev, Vladimir I. Gurman (pp. 317--332) \\ A Mixed-Language Programming Methodology for High Performance Java Computing: Vladimir Getov (pp. 333--350) \\ Part III: Conference Information\\ The Architecture of Scientific Software: the Conference (pp. 351--356)\\ Index (pp. 357--358)",
}

@Proceedings{Eigenmann:2001:OSM,
  editor =       "Rudolf Eigenmann and Michael J.
Voss",
  booktitle =    "{OpenMP} shared memory parallel programming: International Workshop on {OpenMP} Applications and Tools, {WOMPAT} 2001, West Lafayette, {IN}, {USA}, July 30--31, 2001: proceedings",
  title =        "{OpenMP} shared memory parallel programming: International Workshop on {OpenMP} Applications and Tools, {WOMPAT} 2001, West Lafayette, {IN}, {USA}, July 30--31, 2001: proceedings",
  volume =       "2104",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "x + 184",
  year =         "2001",
  ISBN =         "3-540-42346-X (paperback)",
  ISBN-13 =      "978-3-540-42346-1 (paperback)",
  LCCN =         "QA76.642 .I589 2001; QA267.A1 L43 no.2104",
  bibdate =      "Thu Jan 17 11:49:19 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       ser-LNCS,
  URL =          "http://link.springer-ny.com/link/service/series/0558/tocs/t2104.htm",
  acknowledgement = ack-nhfb,
  keywords =     "parallel programming (computer science) -- congresses",
}

@Proceedings{IEEE:2002:STI,
  editor =       "{IEEE}",
  booktitle =    "{SC2002}: From Terabytes to Insight. Proceedings of the {IEEE ACM SC 2002 Conference, November 16--22, 2002, Baltimore, MD, USA}",
  title =        "{SC2002}: From Terabytes to Insight. Proceedings of the {IEEE ACM SC 2002 Conference, November 16--22, 2002, Baltimore, MD, USA}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "????",
  year =         "2002",
  ISBN =         "0-7695-1524-X",
  ISBN-13 =      "978-0-7695-1524-3",
  LCCN =         "????",
  bibdate =      "Thu Feb 21 18:29:36 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{ACM:2003:SII,
  editor =       "{ACM}",
  booktitle =    "SC2003: Igniting Innovation. {Phoenix, AZ, November 15--21, 2003}",
  title =        "{SC2003}: Igniting Innovation.
{Phoenix, AZ, November 15--21, 2003}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "2003",
  ISBN =         "1-58113-695-1",
  ISBN-13 =      "978-1-58113-695-1",
  LCCN =         "????",
  bibdate =      "Thu Feb 21 18:29:36 2003",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  acknowledgement = ack-nhfb,
}

@Proceedings{Chapman:2005:SMP,
  editor =       "Barbara M. Chapman",
  booktitle =    "{Shared memory parallel programming with OpenMP: 5th International Workshop on OpenMP Applications and Tools, WOMPAT 2004, Houston, TX, USA, May 17--18, 2004: Revised selected papers}",
  title =        "{Shared memory parallel programming with OpenMP: 5th International Workshop on OpenMP Applications and Tools, WOMPAT 2004, Houston, TX, USA, May 17--18, 2004: Revised selected papers}",
  volume =       "3349",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "x + 147",
  year =         "2005",
  CODEN =        "LNCSD9",
  DOI =          "http://dx.doi.org/10.1007/b105895",
  ISBN =         "3-540-24560-X",
  ISBN-13 =      "978-3-540-24560-5",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76 .A1 L42 NO.3349",
  bibdate =      "Thu Jun 2 07:26:02 MDT 2005",
  bibsource =    "clavis.ucalgary.ca:2200/UNICORN; http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  series =       ser-LNCS,
  URL =          "http://www.springerlink.com/openurl.asp?genre=issue&issn=0302-9743&volume=3349; http://www.springerlink.com/openurl.asp?genre=volume&id=doi:10.1007/b105895",
  acknowledgement = ack-nhfb,
  meetingname =  "International Workshop on OpenMP Applications and Tools (2004: Houston, Tex.)",
  subject =      "Parallel programming (Computer science); Congresses",
}

@Proceedings{Lathrop:2011:SPI,
  editor =       "Scott Lathrop and Jim Costa and William Kramer",
  booktitle =    "{SC'11: Proceedings of 2011 International Conference for High Performance Computing, Networking, Storage and Analysis, Seattle, WA, November 12--18 2011}",
  title =        "{SC'11: Proceedings of 2011 International Conference for High Performance Computing,
Networking, Storage and Analysis, Seattle, WA, November 12--18 2011}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "2011",
  ISBN =         "1-4503-0771-X",
  ISBN-13 =      "978-1-4503-0771-0",
  LCCN =         "????",
  bibdate =      "Fri Dec 16 11:11:35 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/multithreading.bib; http://www.math.utah.edu/pub/tex/bib/supercomputing2011.bib",
  acknowledgement = ack-nhfb,
  xxeditor =     "{ACM}",
}