%%% Preamble: an empty \hyphenation list (adds no hyphenation exceptions).
@preamble{ "\hyphenation{}" }
%%% Contact details used as the value of the acknowledgement field
%%% in the entries of this file.
@string{ack-nhfb =
  "Nelson H. F. Beebe,
   University of Utah,
   Department of Mathematics, 110 LCB,
   155 S 1400 E RM 233,
   Salt Lake City, UT 84112-0090, USA,
   Tel: +1 801 581 5254,
   FAX: +1 801 581 4148,
   e-mail: \path|beebe@math.utah.edu|,
   \path|beebe@acm.org|,
   \path|beebe@computer.org| (Internet),
   URL: \path|http://www.math.utah.edu/~beebe/|"}
%%% Institution macros; each name macro is paired with an ":adr" macro
%%% holding the matching address.
@string{inst-ANL-MCS      = {Mathematics and Computer Science Division,
                             Argonne National Laboratory}}
@string{inst-ANL-MCS:adr  = {9700 South Cass Avenue, Argonne, IL 60439-4801,
                             USA}}
@string{inst-INRIA        = {INRIA (Institut National de Recherche en
                             Informatique et en Automatique)}}
@string{inst-INRIA:adr    = {Rocquencourt, France}}
@string{inst-UTK-CS       = {Department of Computer Science, University of
                             Tennessee, Knoxville}}
@string{inst-UTK-CS:adr   = {Knoxville, TN 37996, USA}}
@string{inst-UCB-EECS     = {Department of Electrical Engineering and
                             Computer Science, University of California,
                             Berkeley}}
@string{inst-UCB-EECS:adr = {Berkeley, CA, USA}}
%%% Journal-name macros (BIT through CPE).
@string{j-BIT          = {BIT}}
@string{j-BIT-NUM-MATH = {BIT Numerical Mathematics}}
@string{j-CCPE         = {Concurrency and Computation: Prac\-tice and
                          Experience}}
@string{j-CPE          = {Concurrency: Prac\-tice and Experience}}
%%% Journal-name macro for ETNA, stored in full like the other journal
%%% macros so that abbreviation is left to the bibliography style.
@String{j-ETNA                  = "Electronic Transactions on Numerical
                                  Analysis"}
%%% Journal-name macros (IEEE Transactions through J. Comput. Appl. Math.).
@string{j-IEEE-TRANS-COMPUT          = {IEEE Transactions on Computers}}
@string{j-IEEE-TRANS-PAR-DIST-SYS    = {IEEE Transactions on Parallel and
                                        Distributed Systems}}
@string{j-IJHPCA                     = {The International Journal of High
                                        Performance Computing Applications}}
@string{j-IMA-J-NUMER-ANAL           = {IMA Journal of Numerical Analysis}}
@string{j-INT-J-HIGH-SPEED-COMPUTING = {International Journal of High Speed
                                        Computing (IJHSC)}}
@string{j-J-COMPUT-APPL-MATH         = {Journal of Computational and Applied
                                        Mathematics}}
%%% Journal-name macro, capitalized in title case like the other
%%% journal macros in this file.
@String{j-J-NUM-LIN-ALG-APPL    = "Journal of Numerical Linear Algebra with
                                  Applications"}
%%% Journal-name macros (J. Parallel Distrib. Comput. through TOMS).
@string{j-J-PAR-DIST-COMP          = {Journal of Parallel and Distributed
                                      Computing}}
@string{j-LECT-NOTES-COMP-SCI      = {Lecture Notes in Computer Science}}
@string{j-LINEAR-ALGEBRA-APPL      = {Linear Algebra and its Applications}}
@string{j-NUM-MATH                 = {Numerische Mathematik}}
@string{j-NUMER-ALGORITHMS         = {Numerical Algorithms}}
@string{j-PARALLEL-COMPUTING       = {Parallel Computing}}
@string{j-PARALLEL-DIST-COMP-PRACT = {Parallel and Distributed Computing
                                      Practices}}
@string{j-PROC-IEEE                = {Proceedings of the IEEE}}
@string{j-SCI-PROG                 = {Scientific Programming}}
@string{j-SIAM-J-MAT-ANA-APPL      = {SIAM Journal on Matrix Analysis and
                                      Applications}}
@string{j-SIAM-J-NUMER-ANAL        = {SIAM Journal on Numerical Analysis}}
@string{j-SIAM-J-SCI-COMP          = {SIAM Journal on Scientific Computing}}
@string{j-SUPERCOMPUTER            = {Supercomputer}}
@string{j-TOMS                     = {ACM Transactions on Mathematical
                                      Software}}
%%% Publisher macros; each name macro is paired with an ":adr" macro
%%% holding the matching address.  The series macro follows them.
@string{pub-ACM           = {ACM Press}}
@string{pub-ACM:adr       = {New York, NY 10036, USA}}
@string{pub-CAMBRIDGE     = {Cambridge University Press}}
@string{pub-CAMBRIDGE:adr = {Cambridge, UK}}
@string{pub-ELSEVIER      = {Elsevier}}
@string{pub-ELSEVIER:adr  = {Amsterdam, The Netherlands}}
@string{pub-IEEE          = {IEEE Computer Society Press}}
@string{pub-IEEE:adr      = {1109 Spring Street, Suite 300, Silver Spring,
                             MD 20910, USA}}
@string{pub-KLUWER        = {Kluwer Academic Publishers}}
@string{pub-KLUWER:adr    = {Dordrecht, The Netherlands}}
@string{pub-LONGMAN       = {Longman Scientific and Technical}}
@string{pub-LONGMAN:adr   = {Harlow, Essex, UK}}
@string{pub-SIAM          = {Society for Industrial and Applied
                             Mathematics}}
@string{pub-SIAM:adr      = {Philadelphia, PA, USA}}
@string{pub-SV            = {Spring{\-}er-Ver{\-}lag}}
@string{pub-SV:adr        = {Berlin, Germany~/ Heidelberg, Germany~/ London,
                             UK~/ etc.}}
@string{ser-LNCS          = {Lecture Notes in Computer Science}}
%%% LAPACK Working Note 01.
@techreport{Demmel:1987:PDL,
  author          = {J. Demmel and J. Dongarra and J. {Du Croz} and
                     A. Greenbaum and S. Hammarling and D. Sorensen},
  title           = {Prospectus for the Development of a Linear Algebra
                     Library for High-Performance Computers},
  type            = {LAPACK Working Note},
  number          = {01},
  institution     = inst-ANL-MCS,
  address         = inst-ANL-MCS:adr,
  month           = sep,
  year            = {1987},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {ANL, MCS-TM-97, September 1987.},
  url             = {http://www.netlib.org/lapack/lawns/lawn01.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn01.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 02.
@techreport{Dongarra:1987:BRM,
  author          = {J. Dongarra and S. Hammarling and D. Sorensen},
  title           = {Block Reduction of Matrices to Condensed Forms for
                     Eigenvalue Computations},
  type            = {LAPACK Working Note},
  number          = {02},
  institution     = inst-ANL-MCS,
  address         = inst-ANL-MCS:adr,
  month           = sep,
  year            = {1987},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {ANL, MCS-TM-99, September 1987. Published in
                     \cite{Dongarra:1989:BRM}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn02.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn02.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 03.
@techreport{Demmel:1988:CSS,
  author          = {J. Demmel and W. Kahan},
  title           = {Computing Small Singular Values of Bidiagonal
                     Matrices with Guaranteed High Relative Accuracy},
  type            = {LAPACK Working Note},
  number          = {03},
  institution     = inst-ANL-MCS,
  address         = inst-ANL-MCS:adr,
  month           = feb,
  year            = {1988},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {ANL, MCS-TM-110, February 1988.},
  url             = {http://www.netlib.org/lapack/lawns/lawn03.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn03.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 04.
@techreport{Demmel:1988:GDS,
  author          = {J. Demmel and J. {Du Croz} and S. Hammarling and
                     D. Sorensen},
  title           = {Guidelines for the Design of Symmetric
                     Eigenroutines, {SVD}, and Iterative Refinement and
                     Condition Estimation for Linear Systems},
  type            = {LAPACK Working Note},
  number          = {04},
  institution     = inst-ANL-MCS,
  address         = inst-ANL-MCS:adr,
  month           = mar,
  year            = {1988},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {ANL, MCS-TM-111, March 1988.},
  url             = {http://www.netlib.org/lapack/lawns/lawn04.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn04.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 05.
@techreport{Bischof:1988:PC,
  author          = {C. Bischof and J. Demmel and J. Dongarra and
                     J. {Du Croz} and A. Greenbaum and S. Hammarling and
                     D. Sorensen},
  title           = {Provisional Contents},
  type            = {LAPACK Working Note},
  number          = {05},
  institution     = inst-ANL-MCS,
  address         = inst-ANL-MCS:adr,
  month           = sep,
  year            = {1988},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {ANL, MCS-TM-38, September 1988.},
  url             = {http://www.netlib.org/lapack/lawns/lawn05.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn05.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 06.
@techreport{Brewer:1988:TAAa,
  author          = {O. Brewer and J. Dongarra and D. Sorensen},
  title           = {Tools to Aid in the Analysis of Memory Access
                     Patterns for {FORTRAN} Programs},
  type            = {LAPACK Working Note},
  number          = {06},
  institution     = inst-ANL-MCS,
  address         = inst-ANL-MCS:adr,
  month           = jun,
  year            = {1988},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {ANL, MCS-TM-120, June 1988. Published in
                     \cite{Brewer:1988:TAAb}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn06.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn06.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 07.
@techreport{Barlow:1988:CAE,
  author          = {J. Barlow and J. Demmel},
  title           = {Computing Accurate Eigensystems of Scaled
                     Diagonally Dominant Matrices},
  type            = {LAPACK Working Note},
  number          = {07},
  institution     = inst-ANL-MCS,
  address         = inst-ANL-MCS:adr,
  month           = dec,
  year            = {1988},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {ANL, MCS-TM-126, December 1988. Published in
                     \cite{Barlow:1990:CAE}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn07.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn07.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 08.
@techreport{Bai:1989:BIHa,
  author          = {Z. Bai and J. Demmel},
  title           = {On a Block Implementation of {Hessenberg}
                     Multishift {$ Q R $} Iteration},
  type            = {LAPACK Working Note},
  number          = {08},
  institution     = inst-ANL-MCS,
  address         = inst-ANL-MCS:adr,
  month           = jan,
  year            = {1989},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {ANL, MCS-TM-127, January 1989. Published in
                     \cite{Bai:1989:BIHb}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn08.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn08.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 09.
@techreport{Demmel:1989:TMG,
  author          = {J. Demmel and A. McKenney},
  title           = {A Test Matrix Generation Suite},
  type            = {LAPACK Working Note},
  number          = {09},
  institution     = inst-ANL-MCS,
  address         = inst-ANL-MCS:adr,
  month           = mar,
  year            = {1989},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {ANL, MCS-P69-0389, March 1989.},
  url             = {http://www.netlib.org/lapack/lawns/lawn09.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn09.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 10.
@techreport{Anderson:1989:ITI,
  author          = {E. Anderson and J. Dongarra},
  title           = {Installing and Testing the Initial Release of
                     {LAPACK} --- {Unix} and Non-{Unix} Versions},
  type            = {LAPACK Working Note},
  number          = {10},
  institution     = inst-ANL-MCS,
  address         = inst-ANL-MCS:adr,
  month           = may,
  year            = {1989},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {ANL, MCS-TM-130, May 1989.},
  url             = {http://www.netlib.org/lapack/lawns/lawn10.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn10.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 11.
@techreport{Deift:1989:BSV,
  author          = {P. Deift and J. Demmel and L.-C. Li and C. Tomei},
  title           = {The Bidiagonal Singular Value Decomposition and
                     {Hamiltonian} Mechanics},
  type            = {LAPACK Working Note},
  number          = {11},
  institution     = inst-ANL-MCS,
  address         = inst-ANL-MCS:adr,
  month           = aug,
  year            = {1989},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {ANL, MCS-TM-133, August 1989. Published in
                     \cite{Deift:1991:BSV}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn11.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn11.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 12.  The title is stored in title case so that
%%% the bibliography style decides the final casing; the xxnote field is
%%% not a standard field and records that the report is not on the Web site.
@TechReport{Mayes:1989:BCF,
  author =       "P. Mayes and G. Radicati",
  title =        "Banded {Cholesky} Factorization Using Level 3 {BLAS}",
  type =         "LAPACK Working Note",
  number =       "12",
  institution =  inst-ANL-MCS,
  address =      inst-ANL-MCS:adr,
  month =        aug,
  year =         "1989",
  bibdate =      "Sat Apr 23 06:29:27 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "ANL, MCS-TM-134, August 1989.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn12.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn12.pdf",
  acknowledgement = ack-nhfb,
  xxnote =       "Not available at Web site.",
}
%%% LAPACK Working Note 13.
@techreport{Bai:1989:CNE,
  author          = {Z. Bai and J. Demmel and A. McKenney},
  title           = {On the Conditioning of the Nonsymmetric
                     Eigenproblem: Theory and Software},
  type            = {LAPACK Working Note},
  number          = {13},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1989},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-89-86, October 1989.},
  url             = {http://www.netlib.org/lapack/lawns/lawn13.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn13.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 14.
@techreport{Demmel:1989:FPE,
  author          = {J. Demmel},
  title           = {On Floating Point Errors in {Cholesky}},
  type            = {LAPACK Working Note},
  number          = {14},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1989},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-89-87, October 1989.},
  url             = {http://www.netlib.org/lapack/lawns/lawn14.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn14.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 15.  The accented letter in the second author's
%%% surname is written as a braced accent command so that BibTeX treats it
%%% as a single letter when sorting and building labels.
@TechReport{Demmel:1989:JMM,
  author =       "J. Demmel and K. Veseli{\'c}",
  title =        "{Jacobi}'s Method is More Accurate than {$ Q R $}",
  type =         "LAPACK Working Note",
  number =       "15",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        oct,
  year =         "1989",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-89-88, October 1989. Published in
                 \cite{Demmel:1992:JMM}.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn15.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn15.pdf",
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 16.
@techreport{Anderson:1989:RIR,
  author          = {E. Anderson and J. Dongarra},
  title           = {Results from the Initial Release of {LAPACK}},
  type            = {LAPACK Working Note},
  number          = {16},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = nov,
  year            = {1989},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-89-89, November 1989. (Replaced by LAWN 41 or
                     81!!)},
  url             = {http://www.netlib.org/lapack/lawns/lawn16.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn16.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 17.
@techreport{Greenbaum:1989:EQQ,
  author          = {A. Greenbaum and J. Dongarra},
  title           = {Experiments with {QR\slash QL} Methods for the
                     Symmetric Tridiagonal Eigenproblem},
  type            = {LAPACK Working Note},
  number          = {17},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = nov,
  year            = {1989},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-89-92, November 1989.},
  url             = {http://www.netlib.org/lapack/lawns/lawn17.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn17.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 18.
@techreport{Anderson:1990:IGL,
  author          = {E. Anderson and J. Dongarra},
  title           = {Implementation Guide for {LAPACK}},
  type            = {LAPACK Working Note},
  number          = {18},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1990},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-90-101, April 1990.},
  url             = {http://www.netlib.org/lapack/lawns/lawn18.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn18.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 19.
@techreport{Anderson:1990:EBA,
  author          = {E. Anderson and J. Dongarra},
  title           = {Evaluating Block Algorithm Variants in {LAPACK}},
  type            = {LAPACK Working Note},
  number          = {19},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1990},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-90-103, April 1990.},
  url             = {http://www.netlib.org/lapack/lawns/lawn19.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn19.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 20.
@techreport{Anderson:1990:LPLa,
  author          = {E. Anderson and Z. Bai and C. Bischof and J. Demmel
                     and J. Dongarra and J. {Du Croz} and A. Greenbaum
                     and S. Hammarling and A. McKenney and D. Sorensen},
  title           = {{LAPACK}: {A} Portable Linear Algebra Library for
                     High-Performance Computers},
  type            = {LAPACK Working Note},
  number          = {20},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1990},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-90-105, May 1990. Published in
                     \cite{Anderson:1990:LPLb}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn20.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn20.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 21.  The report number in the note uses the same
%%% "UT-CS-yy-nnn" form as the other University of Tennessee entries.
%%% The remark and xxnote fields are not standard fields; they carry
%%% where the work was published and its Web-site availability.
@TechReport{Croz:1990:FBM,
  author =       "Jeremy {Du Croz} and Peter Mayes and Giuseppe
                 Radicati",
  title =        "Factorizations of Band Matrices Using Level 3 {BLAS}",
  type =         "LAPACK Working Note",
  number =       "21",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        jul,
  year =         "1990",
  bibdate =      "Sat Apr 23 06:32:16 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-90-109, July 1990.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn21.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn21.pdf",
  acknowledgement = ack-nhfb,
  remark =       "Published in \cite[pp.~222--231]{Burkhart:1990:CVI}.",
  xxnote =       "Not available at Web site.",
}
%%% LAPACK Working Note 22.
@techreport{Demmel:1990:SBA,
  author          = {J. Demmel and N. Higham},
  title           = {Stability of Block Algorithms with Fast Level 3
                     {BLAS}},
  type            = {LAPACK Working Note},
  number          = {22},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jul,
  year            = {1990},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-90-110, July 1990. Published in
                     \cite{Demmel:1992:SBA}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn22.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn22.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 23.
@techreport{Demmel:1990:IEB,
  author          = {J. Demmel and N. Higham},
  title           = {Improved Error Bounds for Underdetermined System
                     Solvers},
  type            = {LAPACK Working Note},
  number          = {23},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {1990},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-90-113, August 1990. Published in
                     \cite{Demmel:1993:IEB}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn23.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn23.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 24.
@techreport{Dongarra:1990:LBF,
  author          = {J. Dongarra and S. Ostrouchov},
  title           = {{LAPACK} Block Factorization Algorithms on the
                     {Intel iPSC\slash 860}},
  type            = {LAPACK Working Note},
  number          = {24},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1990},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-90-115, October, 1990.},
  url             = {http://www.netlib.org/lapack/lawns/lawn24.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn24.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 25.
@techreport{Dongarra:1990:NCC,
  author          = {J. Dongarra and S. Hammarling and J. Wilkinson},
  title           = {Numerical Considerations in Computing Invariant
                     Subspaces},
  type            = {LAPACK Working Note},
  number          = {25},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1990},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-90-117, October, 1990. Published in
                     \cite{Dongarra:1992:NCC}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn25.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn25.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 26.
@techreport{Anderson:1990:PEL,
  author          = {E. Anderson and C. Bischof and J. Demmel and
                     J. Dongarra and J. {Du Croz} and S. Hammarling and
                     W. Kahan},
  title           = {Prospectus for an Extension to {LAPACK}: {A}
                     Portable Linear Algebra Library for
                     High-Performance Computers},
  type            = {LAPACK Working Note},
  number          = {26},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  pages           = {10},
  month           = nov,
  year            = {1990},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-90-118, November 1990.},
  url             = {http://www.netlib.org/lapack/lawns/lawn26.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn26.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 27.
@techreport{DuCroz:1990:SMM,
  author          = {J. {Du Croz} and N. Higham},
  title           = {Stability of Methods for Matrix Inversion},
  type            = {LAPACK Working Note},
  number          = {27},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1990},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-90-119, October, 1990. Published in
                     \cite{Croz:1992:SMM}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn27.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn27.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 28.
@techreport{Dongarra:1990:IRS,
  author          = {J. Dongarra and P. Mayes and G. Radicati},
  title           = {The {IBM RISC System\slash 6000} and Linear Algebra
                     Operations},
  type            = {LAPACK Working Note},
  number          = {28},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = dec,
  year            = {1990},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-90-122, December 1990. Published in
                     \cite{Dongarra:1991:IRS}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn28.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn28.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 29.
@techreport{vandeGeijn:1991:GCO,
  author          = {R. van de Geijn},
  title           = {On Global Combine Operations},
  type            = {LAPACK Working Note},
  number          = {29},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1991},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-91-129, April 1991. Published in
                     \cite{vandeGeijn:1994:GCO}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn29.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn29.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 30.
@techreport{Dongarra:1991:RCF,
  author          = {J. Dongarra and R. van de Geijn},
  title           = {Reduction to Condensed Form for the Eigenvalue
                     Problem on Distributed Memory Architectures},
  type            = {LAPACK Working Note},
  number          = {30},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1991},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-91-130, April 1991. Published in
                     \cite{Dongarra:1992:RCFb}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn30.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn30.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 31.
@techreport{Anderson:1991:GQF,
  author          = {E. Anderson and Z. Bai and J. Dongarra},
  title           = {Generalized {$ Q R $} Factorization and its
                     Applications},
  type            = {LAPACK Working Note},
  number          = {31},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1991},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-91-131, April 1991. Published in
                     \cite{Anderson:1992:GFA}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn31.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn31.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 32.
@techreport{Bischof:1991:GIC,
  author          = {C. Bischof and P. T. P. Tang},
  title           = {Generalized Incremental Condition Estimation},
  type            = {LAPACK Working Note},
  number          = {32},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1991},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-91-132, May 1991. Published in
                     \cite{Bischof:1992:GIC}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn32.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn32.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 33.
@techreport{Bischof:1991:RIC,
  author          = {C. Bischof and P. T. P. Tang},
  title           = {Robust Incremental Condition Estimation},
  type            = {LAPACK Working Note},
  number          = {33},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1991},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-91-133, May 1991.},
  url             = {http://www.netlib.org/lapack/lawns/lawn33.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn33.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 34.
@techreport{Dongarra:1991:WB,
  author          = {J. J. Dongarra},
  title           = {Workshop on the {BLACS}},
  type            = {LAPACK Working Note},
  number          = {34},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1991},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-91-134, May 1991.},
  url             = {http://www.netlib.org/lapack/lawns/lawn34.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn34.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 35.  The title is stored in title case, matching
%%% the identically titled Working Note 18 (Anderson:1990:IGL).
@TechReport{Anderson:1991:IGL,
  author =       "E. Anderson and J. Dongarra and S. Ostrouchov",
  title =        "Implementation Guide for {LAPACK}",
  type =         "LAPACK Working Note",
  number =       "35",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        aug,
  year =         "1991",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-91-138, August 1991.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn35.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn35.pdf",
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 36.  The title is stored in title case so that
%%% the bibliography style decides the final casing.
@TechReport{Anderson:1991:RTS,
  author =       "E. Anderson",
  title =        "Robust Triangular Solvers",
  type =         "LAPACK Working Note",
  number =       "36",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        aug,
  year =         "1991",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-91-142, August, 1991.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn36.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn36.pdf",
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 37.
%%% NOTE(review): report number UT-CS-91-138 is also given for LAPACK
%%% Working Note 35 in this file; confirm which entry is correct.
@techreport{Dongarra:1991:TDB,
  author          = {Jack J. Dongarra and Robert A. van de Geijn},
  title           = {Two Dimensional Basic Linear Algebra Communication
                     Subprograms},
  type            = {LAPACK Working Note},
  number          = {37},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1991},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-91-138, October, 1991. Published in
                     \cite{Dongarra:1993:TDB}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn37.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn37.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 38.
@techreport{Bai:1991:DAC,
  author          = {Zhaojun Bai and James W. Demmel},
  title           = {On a Direct Algorithm for Computing Invariant
                     Subspaces with Specified Eigenvalues},
  type            = {LAPACK Working Note},
  number          = {38},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = nov,
  year            = {1991},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-91-139, November, 1991.},
  url             = {http://www.netlib.org/lapack/lawns/lawn38.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn38.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:1991:DPH,
  author          = {James Demmel and Jack Dongarra and W. Kahan},
  title           = {On Designing Portable High Performance Numerical
                     Libraries},
  type            = {LAPACK Working Note},
  number          = {39},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jul,
  year            = {1991},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-91-141, July, 1991. Published in
                     \cite{Demmel:1992:DPH}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn39.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn39.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:1992:BLF,
  author          = {James Demmel and Nick Higham and Rob Schreiber},
  title           = {Block {$ L U $} Factorization},
  type            = {LAPACK Working Note},
  number          = {40},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = feb,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-149, February 1992.},
  url             = {http://www.netlib.org/lapack/lawns/lawn40.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn40.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Blackford:1992:IGL,
  author          = {Susan Blackford and Jack Dongarra},
  title           = {Installation Guide for {LAPACK}},
  type            = {LAPACK Working Note},
  number          = {41},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = mar,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-151, March, 1992.},
  url             = {http://www.netlib.org/lapack/lawns/lawn41.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn41.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Higham:1992:PTB,
  author          = {Nick Higham},
  title           = {Perturbation Theory and Backward Error for {$ A X - X
                     B = C $}},
  type            = {LAPACK Working Note},
  number          = {42},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-153, April, 1992. Published in
                     \cite{Higham:1993:PTB}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn42.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn42.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Dongarra:1992:LSD,
  author          = {Jack Dongarra and Robert van de Geijn and David
                     Walker},
  title           = {A Look at Scalable Dense Linear Algebra Libraries},
  type            = {LAPACK Working Note},
  number          = {43},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-155, April, 1992. Published in
                     \cite{Dongarra:1992:LASb}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn43.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn43.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Anderson:1992:PLP,
  author          = {Edward Anderson and Jack Dongarra},
  title           = {Performance of {LAPACK}: {A} Portable Library of
                     Numerical Linear Algebra Routines},
  type            = {LAPACK Working Note},
  number          = {44},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-156, May 1992. Published in
                     \cite{Anderson:1993:PLP}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn44.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn44.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:1992:III,
  author          = {J. Demmel},
  title           = {The Inherent Inaccuracy of Implicit Tridiagonal {$ Q R
                     $}},
  type            = {LAPACK Working Note},
  number          = {45},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-162, May 1992.},
  url             = {http://www.netlib.org/lapack/lawns/lawn45.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn45.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Bai:1992:CGS,
  author          = {Z. Bai and J. Demmel},
  title           = {Computing the Generalized Singular Value
                     Decomposition},
  type            = {LAPACK Working Note},
  number          = {46},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-163, May 1992. Published in
                     \cite{Bai:1993:CGS}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn46.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn46.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:1992:OPN,
  author          = {J. Demmel},
  title           = {Open Problems in Numerical Linear Algebra},
  type            = {LAPACK Working Note},
  number          = {47},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-164, May 1992.},
  url             = {http://www.netlib.org/lapack/lawns/lawn47.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn47.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:1992:CAS,
  author          = {J. Demmel and W. Gragg},
  title           = {On Computing Accurate Singular Values and Eigenvalues
                     of Matrices with Acyclic Graphs},
  type            = {LAPACK Working Note},
  number          = {48},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-166, May 1992. Published in
                     \cite{Demmel:1993:CAS}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn48.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn48.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:1992:SFP,
  author          = {J. Demmel},
  title           = {A Specification for Floating Point Parallel Prefix},
  type            = {LAPACK Working Note},
  number          = {49},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-167, May 1992.},
  url             = {http://www.netlib.org/lapack/lawns/lawn49.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn49.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Eijkhout:1992:DSD,
  author          = {Victor Eijkhout},
  title           = {Distributed Sparse Data Structures for Linear Algebra
                     Operations},
  type            = {LAPACK Working Note},
  number          = {50},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-169, May 1992.},
  url             = {http://www.netlib.org/lapack/lawns/lawn50.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn50.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Eijkhout:1992:QPC,
  author          = {Victor Eijkhout},
  title           = {Qualitative Properties of the Conjugate Gradient and
                     {Lanczos} Methods in a Matrix Framework},
  type            = {LAPACK Working Note},
  number          = {51},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-170, May 1992.},
  url             = {http://www.netlib.org/lapack/lawns/lawn51.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn51.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Heath:1992:CPN,
  author          = {Michael T. Heath and Padma Raghavan},
  title           = {A {Cartesian} Parallel Nested Dissection Algorithm},
  type            = {LAPACK Working Note},
  number          = {52},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jun,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-178, June 1992. Published in
                     \cite{Heath:1995:CPN}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn52.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn52.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:1992:TPN,
  author          = {J. W. Demmel},
  title           = {Trading Off Parallelism and Numerical Stability},
  type            = {LAPACK Working Note},
  number          = {53},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jun,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-179, June 1992. Published in
                     \cite{Demmel:1993:TPN}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn53.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn53.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Bai:1992:SDB,
  author          = {Z. Bai and J. W. Demmel},
  title           = {On Swapping Diagonal Blocks in Real {Schur} Form},
  type            = {LAPACK Working Note},
  number          = {54},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1992},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-92-182, October 1992. Published in
                     \cite{Bai:1993:SDB}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn54.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn54.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 55, technical report UT-CS-92-181 (November 1992).
@TechReport{Choi:1992:SSLa,
  author =       "J. Choi and J. Dongarra and R. Pozo and D. Walker",
  title =        "{ScaLAPACK}: {A} Scalable Linear Algebra Library for
                 Distributed Memory Concurrent Computers",
  type =         "LAPACK Working Note",
  number =       "55",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        nov,
  year =         "1992",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-92-181, November 1992. Published in
                 \cite{Choi:1992:SSLb}.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn55.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn55.pdf",
  acknowledgement = ack-nhfb,
}
@techreport{DAzevedo:1993:RCC,
  author          = {E. F. D'Azevedo and V. L. Eijkhout and C. H. Romine},
  title           = {Reducing Communication Costs in the Conjugate Gradient
                     Algorithm on Distributed Memory Multiprocessors},
  type            = {LAPACK Working Note},
  number          = {56},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jan,
  year            = {1993},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-93-185, January 1993.},
  url             = {http://www.netlib.org/lapack/lawns/lawn56.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn56.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Choi:1993:PPU,
  author          = {Jaeyoung Choi and Jack J. Dongarra and David W.
                     Walker},
  title           = {{PUMMA}: {Parallel Universal Matrix Multiplication
                     Algorithms} on Distributed Memory Concurrent
                     Computers},
  type            = {LAPACK Working Note},
  number          = {57},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1993},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-93-187, May 1993. Published in
                     \cite{Choi:1994:PPU}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn57.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn57.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 58, technical report UT-CS-93-188 (June 1993).
@TechReport{Dongarra:1993:DLA,
  author =       "Jack Dongarra and David Walker",
  title =        "The Design of Linear Algebra Libraries for High
                 Performance Computers",
  type =         "LAPACK Working Note",
  number =       "58",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        jun,
  year =         "1993",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-93-188, June 1993.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn58.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn58.pdf",
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:1993:FNA,
  author          = {James W. Demmel and Xiaoye Li},
  title           = {Faster Numerical Algorithms via Exception Handling},
  type            = {LAPACK Working Note},
  number          = {59},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = mar,
  year            = {1993},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-93-192, March 1993. Published in
                     \cite{Demmel:1994:FNA}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn59.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn59.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:1993:PNLa,
  author          = {James W. Demmel and Michael T. Heath and Henk A. van
                     der Vorst},
  title           = {Parallel Numerical Linear Algebra},
  type            = {LAPACK Working Note},
  number          = {60},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = mar,
  year            = {1993},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-93-192, March 1993. Published in
                     \cite{Demmel:1993:PNLb}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn60.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn60.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Dongarra:1993:OOD,
  author          = {J. Dongarra and R. Pozo and D. Walker},
  title           = {An Object Oriented Design for High Performance Linear
                     Algebra on Distributed Memory Architectures},
  type            = {LAPACK Working Note},
  number          = {61},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {1993},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-93-200, August 1993.},
  url             = {http://www.netlib.org/lapack/lawns/lawn61.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn61.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Heath:1993:DSS,
  author          = {Michael T. Heath and Padma Raghavan},
  title           = {Distributed Solution of Sparse Linear Systems},
  type            = {LAPACK Working Note},
  number          = {62},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {1993},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-93-201, August 1993.},
  url             = {http://www.netlib.org/lapack/lawns/lawn62.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn62.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Heath:1993:LPS,
  author          = {Michael T. Heath and Padma Raghavan},
  title           = {Line and Plane Separators},
  type            = {LAPACK Working Note},
  number          = {63},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {1993},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-93-202, August 1993.},
  url             = {http://www.netlib.org/lapack/lawns/lawn63.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn63.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Raghavan:1993:DSG,
  author          = {Padma Raghavan},
  title           = {Distributed Sparse {Gaussian} Elimination and
                     Orthogonal Factorization},
  type            = {LAPACK Working Note},
  number          = {64},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {1993},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-93-203, August 1993. Published in
                     \cite{Raghavan:1995:DSG}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn64.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn64.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Choi:1993:PMT,
  author          = {Jaeyoung Choi and Jack J. Dongarra and David W.
                     Walker},
  title           = {Parallel Matrix Transpose Algorithms on Distributed
                     Memory Concurrent Computers},
  type            = {LAPACK Working Note},
  number          = {65},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = nov,
  year            = {1993},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-93-215, November, 1993. Published in
                     \cite{Choi:1994:PMT}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn65.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn65.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Eijkhout:1993:CPI,
  author          = {Victor Eijkhout},
  title           = {A Characterization of Polynomial Iterative Methods},
  type            = {LAPACK Working Note},
  number          = {66},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = nov,
  year            = {1993},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-93-216, November 1993.},
  url             = {http://www.netlib.org/lapack/lawns/lawn66.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn66.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Desprez:1993:PCF,
  author          = {F. Desprez and J. Dongarra and B. Tourancheau},
  title           = {Performance Complexity of {$ L U $} Factorization with
                     Efficient Pipelining and Overlap on a Multiprocessor},
  type            = {LAPACK Working Note},
  number          = {67},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = dec,
  year            = {1993},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-93-218, December, 1993.},
  url             = {http://www.netlib.org/lapack/lawns/lawn67.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn67.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Berry:1994:HPA,
  author          = {Michael W. Berry and Jack J. Dongarra and Youngbae
                     Kim},
  title           = {A Highly Parallel Algorithm for the Reduction of a
                     Nonsymmetric Matrix to Block Upper-{Hessenberg} Form},
  type            = {LAPACK Working Note},
  number          = {68},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = feb,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-221, February 1994.},
  url             = {http://www.netlib.org/lapack/lawns/lawn68.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn68.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Rutter:1994:SIC,
  author          = {J. Rutter},
  title           = {A Serial Implementation of {Cuppen}'s Divide and
                     Conquer Algorithm for the Symmetric Eigenvalue
                     Problem},
  type            = {LAPACK Working Note},
  number          = {69},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = mar,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-225, March 1994.},
  url             = {http://www.netlib.org/lapack/lawns/lawn69.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn69.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:1994:CPB,
  author          = {James Demmel and Inderjit Dhillon and Huan Ren},
  title           = {On the Correctness of Parallel Bisection in Floating
                     Point},
  type            = {LAPACK Working Note},
  number          = {70},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = mar,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-228, March 1994. Published in
                     \cite{Demmel:1995:CSB}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn70.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn70.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Dongarra:1994:IRP,
  author          = {Jack Dongarra and Michael Kolatis},
  title           = {{IBM RS\slash 6000-550 \& -590} Performance for
                     Selected Routines in {ESSL}},
  type            = {LAPACK Working Note},
  number          = {71},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-231, April 1994.},
  url             = {http://www.netlib.org/lapack/lawns/lawn71.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn71.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Lehoucq:1995:CEU,
  author          = {R. Lehoucq},
  title           = {The Computation of Elementary Unitary Matrices},
  type            = {LAPACK Working Note},
  number          = {72},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-233, October 1995. Published in
                     \cite{Lehoucq:1996:CEU}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn72.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn72.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Whaley:1994:BLA,
  author          = {R. Clint Whaley},
  title           = {Basic Linear Algebra Communication Subprograms:
                     Analysis and Implementation Across Multiple Parallel
                     Architectures},
  type            = {LAPACK Working Note},
  number          = {73},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-234, May 1994.},
  url             = {http://www.netlib.org/lapack/lawns/lawn73.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn73.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Dongarra:1994:SMLa,
  author          = {J. Dongarra and A. Lumsdaine and X. Niu and R. Pozo
                     and K. Remington},
  title           = {A Sparse Matrix Library in {C++} for High Performance
                     Architectures},
  type            = {LAPACK Working Note},
  number          = {74},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jul,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-236, July 1994. Published in
                     \cite{Dongarra:1994:SMLb}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn74.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn74.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 75, technical report UT-CS-94-237 (July 1994).
@TechReport{Kaagstrom:1994:LSA,
  author =       "Bo K{\aa}gstr{\"o}m and Peter Poromaa",
  title =        "{LAPACK}-Style Algorithms and Software for Solving the
                 Generalized {Sylvester} Equation and Estimating the
                 Separation Between Regular Matrix Pairs",
  type =         "LAPACK Working Note",
  number =       "75",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        jul,
  year =         "1994",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-94-237, July 1994. Published in
                 \cite{Kaagstrom:1996:LSA}.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn75.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn75.pdf",
  acknowledgement = ack-nhfb,
}
@techreport{Barrett:1994:ABI,
  author          = {Richard Barrett and Michael Berry and Jack Dongarra
                     and Victor Eijkhout and Charles Romine},
  title           = {Algorithmic Bombardment for the Iterative Solution of
                     Linear Systems: {A} Poly-Iterative Approach},
  type            = {LAPACK Working Note},
  number          = {76},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-239, August, 1994. Published in
                     \cite{Barrett:1996:ABI}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn76.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn76.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Eijkhout:1994:BCD,
  author          = {Victor Eijkhout and Roldan Pozo},
  title           = {Basic Concepts for Distributed Sparse Linear Algebra
                     Operations},
  type            = {LAPACK Working Note},
  number          = {77},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-240, August, 1994.},
  url             = {http://www.netlib.org/lapack/lawns/lawn77.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn77.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 78, technical report UT-CS-94-241 (August 1994).
%%% Title is stored in Title Case like the other entries; bibliography
%%% styles that want sentence case will downcase it themselves.
@TechReport{Eijkhout:1994:CVC,
  author =       "Victor Eijkhout",
  title =        "Computational Variants of the {CGS} and {BiCGstab}
                 Methods",
  type =         "LAPACK Working Note",
  number =       "78",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        aug,
  year =         "1994",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-94-241, August, 1994.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn78.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn78.pdf",
  acknowledgement = ack-nhfb,
}
@techreport{Henry:1994:PQA,
  author          = {Greg Henry and Robert van de Geijn},
  title           = {Parallelizing the {$ Q R $} Algorithm for the
                     Unsymmetric Algebraic Eigenvalue Problem: Myths and
                     Reality},
  type            = {LAPACK Working Note},
  number          = {79},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-244, August, 1994. Published in
                     \cite{Henry:1996:PAU}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn79.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn79.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Choi:1994:DIS,
  author          = {J. Choi and J. J. Dongarra and S. Ostrouchov and A. P.
                     Petitet and D. W. Walker and R. C. Whaley},
  title           = {The Design and Implementation of the {ScaLAPACK} {$ L
                     U $}, {$ Q R $}, and {Cholesky} Factorization
                     Routines},
  type            = {LAPACK Working Note},
  number          = {80},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-246, September, 1994. Published in
                     \cite{Choi:1996:DIS}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn80.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn80.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Blackford:1994:QIG,
  author          = {S. Blackford and J. Dongarra},
  title           = {Quick Installation Guide for {LAPACK} on {Unix}
                     Systems},
  type            = {LAPACK Working Note},
  number          = {81},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-249, September, 1994.},
  url             = {http://www.netlib.org/lapack/lawns/lawn81.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn81.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Dongarra:1994:CCI,
  author          = {J. Dongarra and M. Kolatis},
  title           = {Call Conversion Interface ({CCI}) for {LAPACK\slash
                     ESSL}},
  type            = {LAPACK Working Note},
  number          = {82},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-250, August, 1994.},
  url             = {http://www.netlib.org/lapack/lawns/lawn82.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn82.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Li:1994:RPB,
  author          = {Ren-Cang Li},
  title           = {Relative Perturbation Bounds for the Unitary Polar
                     Factor},
  type            = {LAPACK Working Note},
  number          = {83},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-251, September, 1994. Published in
                     \cite{Li:1997:RPB}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn83.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn83.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 84 (note field: UT-CS-94-252); the note cross-cites Li:1998:RPT.
@TechReport{Li:1994:RPTa,
  author          = {Ren-Cang Li},
  title           = {Relative Perturbation Theory:
                     ({I}) Eigenvalue Variations},
  type            = {LAPACK Working Note},
  number          = {84},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-252, September, 1994.
                     Published in \cite{Li:1998:RPT}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn84.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn84.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 85 (note field: UT-CS-94-253); the note cross-cites Li:1999:RPT.
@TechReport{Li:1994:RPTb,
  author          = {Ren-Cang Li},
  title           = {Relative Perturbation Theory:
                     ({II}) Eigenspace Variations},
  type            = {LAPACK Working Note},
  number          = {85},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-253, September, 1994.
                     Published in \cite{Li:1999:RPT}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn85.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn85.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 86 (note field: UT-CS-94-254).
@TechReport{Demmel:1994:PFE,
  author          = {J. Demmel and K. Stanley},
  title           = {The Performance of Finding Eigenvalues
                     and Eigenvectors of Dense Symmetric Matrices
                     on Distributed Memory Computers},
  type            = {LAPACK Working Note},
  number          = {86},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-254, September, 1994.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn86.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn86.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 87 (note field: UT-CS-94-255); the note cross-cites
%%% Kaagstrom:1996:CES.
@TechReport{Kaagstrom:1994:CES,
  author          = {B. K{\aa}gstr{\"o}m and P. Poromaa},
  title           = {Computing Eigenspaces with Specified Eigenvalues
                     of a Regular Matrix Pair ({A},{B}) and Condition
                     Estimation: Theory, Algorithms and Software},
  type            = {LAPACK Working Note},
  number          = {87},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-255, September, 1994.
                     Published in \cite{Kaagstrom:1996:CES}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn87.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn87.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 88 (note field: UT-CS-94-257).
@TechReport{Gu:1994:ECS,
  author          = {Ming Gu and James Demmel and Inderjit Dhillon},
  title           = {Efficient Computation of the Singular
                     Value Decomposition with Applications
                     to Least Squares Problems},
  type            = {LAPACK Working Note},
  number          = {88},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-257, October, 1994.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn88.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn88.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 89 (note field: UT-CS-94-260).
@TechReport{Li:1994:SSE,
  author          = {Ren-Cang Li},
  title           = {Solving Secular Equations Stably and Efficiently},
  type            = {LAPACK Working Note},
  number          = {89},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = nov,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-260, November, 1994.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn89.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn89.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 90 (note field: UT-CS-94-268); the note cross-cites
%%% Plank:1995:ADC.
@TechReport{Plank:1994:ABD,
  author          = {J. S. Plank and Y. Kim and J. J. Dongarra},
  title           = {Algorithm-Based Diskless Checkpointing for
                     Fault Tolerant Matrix Operations},
  type            = {LAPACK Working Note},
  number          = {90},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = dec,
  year            = {1994},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-94-268, December 1994.
                     Published in \cite{Plank:1995:ADC}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn90.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn90.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 91 (note field: UT-CS-95-273); the note cross-cites Bai:1997:SDN.
@TechReport{Bai:1995:SDN,
  author          = {Z. Bai and J. Demmel and J. Dongarra and
                     A. Petitet and H. Robinson and K. Stanley},
  title           = {The Spectral Decomposition of Nonsymmetric
                     Matrices on Distributed Memory Parallel
                     Computers},
  type            = {LAPACK Working Note},
  number          = {91},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jan,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-273, January 1995.
                     Published in \cite{Bai:1997:SDN}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn91.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn91.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 92 (note field: UT-CS-95-275); the note cross-cites
%%% Choi:1994:DPD and Choi:1995:DPDb.
@TechReport{Choi:1995:DPDa,
  author          = {J. Choi and J. Dongarra and D. Walker},
  title           = {The Design of a Parallel Dense Linear Algebra
                     Software Library: Reduction to {Hessenberg},
                     Tridiagonal, and Bidiagonal Form},
  type            = {LAPACK Working Note},
  number          = {92},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = feb,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-275, February 1995.
                     Published in
                     \cite{Choi:1994:DPD,Choi:1995:DPDb}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn92.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn92.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 93; date fields follow the note's "Updated August 31, 2001".
@TechReport{Choi:2001:IGS,
  author          = {J. Choi and J. Demmel and I. Dhillon and
                     J. Dongarra and S. Ostrouchov and A. Petitet and
                     K. Stanley and D. Walker and R. C. Whaley},
  title           = {Installation Guide for {ScaLAPACK}},
  type            = {LAPACK Working Note},
  number          = {93},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {31},
  month           = aug,
  year            = {2001},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {Updated August 31, 2001 (Version 1.7).},
  URL             = {http://www.netlib.org/lapack/lawns/lawn93.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn93.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 94; date fields follow the note's "Updated May 5, 1997".
@TechReport{Dongarra:1997:UGB,
  author          = {J. Dongarra and R. C. Whaley},
  title           = {A User's Guide to the {BLACS v1.1}},
  type            = {LAPACK Working Note},
  number          = {94},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {5},
  month           = may,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {Updated May 5, 1997 (Version 1.1).},
  URL             = {http://www.netlib.org/lapack/lawns/lawn94.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn94.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 95 (note field: UT-CS-95-283); the note cross-cites
%%% Blackford:1996:SPL.
@TechReport{Choi:1995:SPL,
  author          = {J. Choi and J. Demmel and I. Dhillon and
                     J. Dongarra and S. Ostrouchov and A. Petitet and
                     K. Stanley and D. Walker and R. C. Whaley},
  title           = {{ScaLAPACK}: {A} Portable Linear Algebra Library
                     for Distributed Memory Computers ---
                     Design Issues and Performance},
  type            = {LAPACK Working Note},
  number          = {95},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = mar,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-283, March 1995.
                     Published in \cite{Blackford:1996:SPL}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn95.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn95.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 96 (note field: UT-CS-95-286); the note cross-cites
%%% vandeGeijn:1997:SSU.
@TechReport{vandeGeijn:1995:SSU,
  author          = {R. A. van de Geijn and J. Watts},
  title           = {{SUMMA}:
                     {Scalable Universal Matrix Multiplication Algorithm}},
  type            = {LAPACK Working Note},
  number          = {96},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-286, April 1995.
                     Published in \cite{vandeGeijn:1997:SSU}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn96.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn96.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 97 (note field: UT-CS-95-289).
%%% The third author is K. (Katherine) Yelick; the initial had been
%%% mistyped as "D.".
@TechReport{Chakrabarti:1995:MBM,
  author =       "S. Chakrabarti and J. Demmel and K. Yelick",
  title =        "Modeling the Benefits of Mixed Data and Task
                 Parallelism",
  type =         "LAPACK Working Note",
  number =       "97",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        may,
  year =         "1995",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-95-289, May 1995.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn97.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn97.pdf",
  acknowledgement = ack-nhfb,
}
%%% LAWN 98 (note field: UT-CS-95-290).
@TechReport{Dongarra:1995:LVH,
  author          = {J. Dongarra and R. Pozo and D. Walker},
  title           = {{LAPACK++ V. 1.0}:
                     High Performance Linear Algebra Users' Guide},
  type            = {LAPACK Working Note},
  number          = {98},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-290, May 1995.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn98.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn98.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 99 (note field: UT-CS-95-291).
@TechReport{Dongarra:1995:RCI,
  author          = {J. Dongarra and V. Eijkhout and A. Kalhan},
  title           = {Reverse Communication Interface for Linear
                     Algebra Templates for Iterative Methods},
  type            = {LAPACK Working Note},
  number          = {99},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-291, May 1995.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn99.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn99.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 100 (note field: UT-CS-95-292).
%%% NOTE(review): the note's "Published in" citation names this entry's
%%% own key (Choi:1995:PSP), so it points back at the tech report rather
%%% than at a separate published version --- TODO confirm the intended
%%% target key against the rest of the database.
@TechReport{Choi:1995:PSP,
  author =       "J. Choi and J. Dongarra and S. Ostrouchov and A.
                 Petitet and D. Walker and R. C. Whaley",
  title =        "A Proposal for a Set of Parallel Basic Linear Algebra
                 Subprograms",
  type =         "LAPACK Working Note",
  number =       "100",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        may,
  year =         "1995",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-95-292, May 1995. Published in
                 \cite{Choi:1995:PSP}.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn100.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn100.pdf",
  acknowledgement = ack-nhfb,
}
%%% LAWN 101 (note field: UT-CS-95-295); the note cross-cites
%%% Dongarra:1996:PFI.
@TechReport{Dongarra:1995:PFI,
  author          = {J. J. Dongarra and J. {Du Croz} and
                     S. Hammarling and J. Wa{\'s}niewski and
                     A. Zemla},
  title           = {A Proposal for a {Fortran 90} Interface
                     for {LAPACK}},
  type            = {LAPACK Working Note},
  number          = {101},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jul,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-295, July 1995.
                     Published in \cite{Dongarra:1996:PFI}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn101.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn101.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 102 (note field: UT-CS-95-303).
@TechReport{Dongarra:1995:IVI,
  author          = {J. Dongarra and A. Lumsdaine and R. Pozo and
                     K. Remington},
  title           = {{IML++ v. 1.2}:
                     Iterative Methods Library Reference Guide},
  type            = {LAPACK Working Note},
  number          = {102},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-303, August 1995.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn102.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn102.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 103 (note field: UT-CS-95-304); the note cross-cites
%%% Demmel:1999:SAS.
@TechReport{Demmel:1995:SAS,
  author          = {J. W. Demmel and S. C. Eisenstat and
                     J. R. Gilbert and X. S. Li and J. W. H. Liu},
  title           = {A Supernodal Approach to Sparse Partial Pivoting},
  type            = {LAPACK Working Note},
  number          = {103},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-304, September 1995.
                     Published in \cite{Demmel:1999:SAS}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn103.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn103.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 104 (note field: UT-CS-95-308); the note cross-cites
%%% Higham:1997:IRL.
@TechReport{Higham:1995:IRL,
  author          = {N. J. Higham},
  title           = {Iterative Refinement and {LAPACK}},
  type            = {LAPACK Working Note},
  number          = {104},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-308, October 1995.
                     Published in \cite{Higham:1997:IRL}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn104.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn104.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 105 (note field: UT-CS-95-309); the note cross-cites
%%% Higham:1997:SDP.
@TechReport{Higham:1995:SDP,
  author          = {N. J. Higham},
  title           = {Stability of the Diagonal Pivoting Method
                     with Partial Pivoting},
  type            = {LAPACK Working Note},
  number          = {105},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-309, October 1995.
                     Published in \cite{Higham:1997:SDP}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn105.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn105.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 106 (note field: UT-CS-95-311); the note cross-cites
%%% Bai:1995:TLAb.
@TechReport{Bai:1995:TLAa,
  author          = {Z. Bai and D. Day and J. Demmel and J. Dongarra
                     and M. Gu and A. Ruhe and H. van der Vorst},
  title           = {Templates for Linear Algebra Problems},
  type            = {LAPACK Working Note},
  number          = {106},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-311, October 1995.
                     Published in \cite{Bai:1995:TLAb}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn106.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn106.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 107 (note field: UT-CS-95-315); the note cross-cites
%%% Kaagstrom:1998:GBL.
@TechReport{Kaagstrom:1995:GBLa,
  author          = {B. K{\aa}gstr{\"o}m and P. Ling and
                     C. {Van Loan}},
  title           = {{GEMM}-Based Level 3 {BLAS}: High-Performance
                     Model Implementations and Performance
                     Evaluation Benchmark},
  type            = {LAPACK Working Note},
  number          = {107},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = nov,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-315, November 1995.
                     Published in \cite{Kaagstrom:1998:GBL}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn107.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn107.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 108 (note field: UT-CS-95-316).
@TechReport{Kaagstrom:1995:GBLb,
  author          = {B. K{\aa}gstr{\"o}m and P. Ling and
                     C. {Van Loan}},
  title           = {{GEMM}-Based Level 3 {BLAS}: Installation,
                     Tuning and Use of the Model Implementations
                     and the Performance Evaluation Benchmark},
  type            = {LAPACK Working Note},
  number          = {108},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = nov,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-316, November 1995.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn108.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn108.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 109 (note field: UT-CS-95-317).
@TechReport{Dongarra:1995:BTW,
  author          = {J. Dongarra and S. Hammarling and S. Ostrouchov},
  title           = {{BLAS} Technical Workshop},
  type            = {LAPACK Working Note},
  number          = {109},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = nov,
  year            = {1995},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-95-317, November 1995.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn109.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn109.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 110 (note field: UT-CS-96-324); the note cross-cites
%%% Dongarra:1997:KCPb.
@TechReport{Dongarra:1996:KCP,
  author          = {J. J. Dongarra and S. Hammarling and
                     D. W. Walker},
  title           = {Key Concepts For Parallel Out-Of-Core
                     {$ L U $} Factorization},
  type            = {LAPACK Working Note},
  number          = {110},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1996},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-96-324, April 1996.
                     Published in \cite{Dongarra:1997:KCPb}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn110.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn110.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 111 (note field: UT-CS-96-326).
@TechReport{Bilmes:1996:OMM,
  author          = {J. Bilmes and K. Asanovic and J. Demmel and
                     D. Lam and C.-W. Chin},
  title           = {Optimizing Matrix Multiply using {PHiPAC}:
                     a Portable, High-Performance, {ANSI C}
                     Coding Methodology},
  type            = {LAPACK Working Note},
  number          = {111},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1996},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-96-326, May 1996.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn111.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn111.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 112 (note field: UT-CS-96-330); the note cross-cites
%%% Blackford:1996:PEDb and Blackford:1997:PEN.
@TechReport{Blackford:1996:PEDa,
  author          = {L. S. Blackford and A. Cleary and J. Demmel and
                     I. Dhillon and J. Dongarra and S. Hammarling and
                     A. Petitet and H. Ren and K. Stanley and
                     R. C. Whaley},
  title           = {Practical Experience in the Dangers of
                     Heterogeneous Computing},
  type            = {LAPACK Working Note},
  number          = {112},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jul,
  year            = {1996},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-96-330, July 1996.
                     Published in
                     \cite{Blackford:1996:PEDb,Blackford:1997:PEN}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn112.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn112.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 113 (note field: UT-CS-96-333).
%%% The surname carries an acute accent on its final letter; it is
%%% written with a braced accent control sequence over a dotless i, in
%%% line with the other accented names in this file.  The citation key
%%% stays plain ASCII.
@TechReport{Quintana-Orti:1996:BPA,
  author =       "G. Quintana-Ort{\'\i} and E. S. Quintana-Ort{\'\i}
                 and A. Petitet",
  title =        "Block-Partitioned Algorithms for Solving the Linear
                 Least Squares Problem",
  type =         "LAPACK Working Note",
  number =       "113",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        jul,
  year =         "1996",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-96-333, July 1996.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn113.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn113.pdf",
  acknowledgement = ack-nhfb,
}
%%% LAWN 114 (note field: UT-CS-96-334).
%%% The first author's surname carries an acute accent on its final
%%% letter; it is written with a braced accent control sequence over a
%%% dotless i, in line with the other accented names in this file.  The
%%% citation key stays plain ASCII.
@TechReport{Quintana-Orti:1996:BVQ,
  author =       "G. Quintana-Ort{\'\i} and X. Sun and C. Bischof",
  title =        "A {BLAS-3} Version of the {$ Q R $} Factorization with
                 Column Pivoting",
  type =         "LAPACK Working Note",
  number =       "114",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        aug,
  year =         "1996",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-96-334, August 1996.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn114.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn114.pdf",
  acknowledgement = ack-nhfb,
}
%%% LAWN 115 (note field: UT-CS-96-336).
@TechReport{Ren:1996:EAI,
  author          = {H. Ren},
  title           = {On the Error Analysis and Implementation of
                     Some Eigenvalue Decomposition and Singular
                     Value Decomposition Algorithms},
  type            = {LAPACK Working Note},
  number          = {115},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {1996},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-96-336, September 1996.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn115.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn115.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 116 (note field: UT-CS-96-340).
@TechReport{Sidani:1996:PMD,
  author          = {M. Sidani and B. Harrod},
  title           = {Parallel Matrix Distributions:
                     Have we been doing it all right?},
  type            = {LAPACK Working Note},
  number          = {116},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = nov,
  year            = {1996},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-96-340, November 1996.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn116.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn116.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 117 (note field: UT-CS-96-341).
@TechReport{Blackford:1996:FIL,
  author          = {L. Susan Blackford and Jack J. Dongarra and
                     Jeremy {Du Croz} and Sven Hammarling and
                     Jerzy Wa{\'s}niewski},
  title           = {A {Fortran 90} Interface for {LAPACK}},
  type            = {LAPACK Working Note},
  number          = {117},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = dec,
  year            = {1996},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-96-341, December 1996.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn117.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn117.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 118 (note field: UT-CS-97-347); the note cross-cites
%%% DAzevedo:2000:DIP.
@TechReport{Dongarra:1997:DIP,
  author          = {J. J. Dongarra and E. F. D'Azevedo},
  title           = {The Design and Implementation of the
                     Parallel Out-of-core {ScaLAPACK}
                     {$ L U $}, {$ Q R $}, and {Cholesky}
                     Factorization Routines},
  type            = {LAPACK Working Note},
  number          = {118},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jan,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-347, January 1997.
                     Published in \cite{DAzevedo:2000:DIP}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn118.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn118.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 119 (note field: UT-CS-97-348); the note cross-cites
%%% Demmel:1999:CSV.
@TechReport{Demmel:1997:CSV,
  author          = {James Demmel and Ming Gu and
                     Stanley Eisenstat and Ivan Slapni{\v{c}}ar and
                     Kre{\v{s}}imir Veseli{\'c} and
                     Zlatko Drma{\v{c}}},
  title           = {Computing the Singular Value Decomposition
                     with High Relative Accuracy},
  type            = {LAPACK Working Note},
  number          = {119},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = feb,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-348, February 1997.
                     Published in \cite{Demmel:1999:CSV}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn119.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn119.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 120 (note field: UT-CS-97-349); the note cross-cites
%%% Desprez:1998:SBA and Desprez:1998:SBC.
@TechReport{Desprez:1997:SBC,
  author          = {F. Desprez and J. Dongarra and A. Petitet and
                     C. Randriamaro and Y. Robert},
  title           = {Scheduling Block-Cyclic Array Redistribution},
  type            = {LAPACK Working Note},
  number          = {120},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = feb,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-349, February 1997.
                     Published in
                     \cite{Desprez:1998:SBA,Desprez:1998:SBC}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn120.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn120.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 121 (note field: UT-CS-97-352); the note cross-cites
%%% Henry:2002:PIN.
@TechReport{Henry:1997:PIN,
  author          = {G. Henry and D. Watkins and J. Dongarra},
  title           = {A Parallel Implementation of the Nonsymmetric
                     {$ Q R $} Algorithm for Distributed Memory
                     Architectures},
  type            = {LAPACK Working Note},
  number          = {121},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = mar,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-352, March 1997.
                     Published in \cite{Henry:2002:PIN}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn121.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn121.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 122 (note field: UT-CS-97-353).
@TechReport{Ahues:1997:NDC,
  author          = {M. Ahues and F. Tisseur},
  title           = {A New Deflation Criterion for the
                     {$ Q R $} Algorithm},
  type            = {LAPACK Working Note},
  number          = {122},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = mar,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-353, March 1997.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn122.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn122.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 123 (note field: UT-CS-97-355).
@TechReport{Bai:1997:TMC,
  author          = {Z. Bai and D. Day and J. Demmel and J. Dongarra},
  title           = {A Test Matrix Collection for
                     Non-{Hermitian} Eigenvalue Problems},
  type            = {LAPACK Working Note},
  number          = {123},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = mar,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-355, March 1997.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn123.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn123.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 124 (note field: UT-CS-97-357); the note cross-cites
%%% Demmel:1999:APS.
@TechReport{Demmel:1997:APS,
  author          = {J. Demmel and J. Gilbert and X. Li},
  title           = {An Asynchronous Parallel Supernodal Algorithm
                     for Sparse {Gaussian} Elimination},
  type            = {LAPACK Working Note},
  number          = {124},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-357, April 1997.
                     Published in \cite{Demmel:1999:APS}.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn124.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn124.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 125 (note field: UT-CS-97-358).
@TechReport{Cleary:1997:ISD,
  author          = {A. Cleary and J. Dongarra},
  title           = {Implementation in {ScaLAPACK} of
                     Divide-and-Conquer Algorithms for Banded and
                     Tridiagonal Linear Systems},
  type            = {LAPACK Working Note},
  number          = {125},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-358, April 1997.},
  URL             = {http://www.netlib.org/lapack/lawns/lawn125.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn125.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAWN 126.
%%% The report number and date string belong in the note field, as in
%%% the neighbouring entries; bibdate holds only the timestamp.
@TechReport{Anderson:1997:PIL,
  author =       "E. Anderson and M. Fahey",
  title =        "Performance Improvements to {LAPACK} for the {Cray
                 Scientific Library}",
  type =         "LAPACK Working Note",
  number =       "126",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        apr,
  year =         "1997",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-97-359, April 1997.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn126.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn126.pdf",
  acknowledgement = ack-nhfb,
}
@techreport{Li:1997:SGE,
  author          = {Li, X.},
  title           = {Sparse {Gaussian} Elimination on High Performance
                     Computers},
  type            = {LAPACK Working Note},
  number          = {127},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jun,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-368, June 1997.},
  url             = {http://www.netlib.org/lapack/lawns/lawn127.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn127.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Petitet:1997:ARM,
  author          = {Petitet, A.},
  title           = {Algorithmic Redistribution Methods for Block Cyclic
                     Decompositions},
  type            = {LAPACK Working Note},
  number          = {128},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jul,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-371, July 1997. Published in
                     \cite{Petitet:1999:ARM}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn128.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn128.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Choi:1997:NPM,
  author          = {Choi, J.},
  title           = {A New Parallel Matrix Multiplication Algorithm on
                     Distributed-Memory Concurrent Computers},
  type            = {LAPACK Working Note},
  number          = {129},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-369, September 1997. Published in
                     \cite{Choi:1998:NPM}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn129.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn129.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:1997:ASS,
  author          = {Demmel, J.},
  title           = {Accurate {SVDs} of Structured Matrices},
  type            = {LAPACK Working Note},
  number          = {130},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-375, October 1997.},
  url             = {http://www.netlib.org/lapack/lawns/lawn130.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn130.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Whaley:1997:ATL,
  author          = {Whaley, R. and Dongarra, J.},
  title           = {Automatically Tuned Linear Algebra Software},
  type            = {LAPACK Working Note},
  number          = {131},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = dec,
  year            = {1997},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-97-366, December 1997. Published in
                     \cite{Whaley:1998:ATL}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn131.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn131.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Tisseur:1998:PDC,
  author          = {Tisseur, F. and Dongarra, J.},
  title           = {Parallelizing the Divide and Conquer Algorithm for the
                     Symmetric Tridiagonal Eigenvalue Problem on Distributed
                     Memory Architectures},
  type            = {LAPACK Working Note},
  number          = {132},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = mar,
  year            = {1998},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-98-382, March 1998.},
  url             = {http://www.netlib.org/lapack/lawns/lawn132.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn132.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Petitet:1998:ARM,
  author          = {Petitet, A. and Dongarra, J.},
  title           = {Algorithmic Redistribution Methods for Block Cyclic
                     Distributions},
  type            = {LAPACK Working Note},
  number          = {133},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = mar,
  year            = {1998},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-98-383, March 1998. Published in
                     \cite{Petitet:1999:ARM}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn133.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn133.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Wasniewski:1998:HPL,
  author          = {Wa{\'s}niewski, J. and Dongarra, J.},
  title           = {High Performance Linear Algebra Package ---
                     {LAPACK90}},
  type            = {LAPACK Working Note},
  number          = {134},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1998},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-98-384, April 1998. Published in
                     \cite{Dongarra:1998:HPL}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn134.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn134.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{DAzevedo:1998:PSE,
  author          = {D'Azevedo, E. and Dongarra, J.},
  title           = {Packed Storage Extensions for {ScaLAPACK}},
  type            = {LAPACK Working Note},
  number          = {135},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1998},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-98-385, April 1998.},
  url             = {http://www.netlib.org/lapack/lawns/lawn135.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn135.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Blackford:1998:SEP,
  author          = {Blackford, L. S. and Whaley, R. C.},
  title           = {{ScaLAPACK} Evaluation and Performance at the {DoD}
                     {MSRCs}},
  type            = {LAPACK Working Note},
  number          = {136},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {1998},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-98-388, April 1998.},
  url             = {http://www.netlib.org/lapack/lawns/lawn136.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn136.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Blackford:1998:IGD,
  author          = {Blackford, L. S. and Dongarra, J. J. and
                     Papadopoulos, C. A. and Whaley, R. C.},
  title           = {Installation Guide and Design of the {HPF 1.1}
                     interface to {ScaLAPACK}, {SLHPF}},
  type            = {LAPACK Working Note},
  number          = {137},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {1998},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-98-396, August 1998.},
  url             = {http://www.netlib.org/lapack/lawns/lawn137.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn137.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Dongarra:1998:TSL,
  author          = {Dongarra, J. and Owczarz, W. and Wa{\'s}niewski, J. and
                     Yalamov, P.},
  title           = {Testing Software for {LAPACK90}},
  type            = {LAPACK Working Note},
  number          = {138},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {1998},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-98-401, Sept 1998.},
  url             = {http://www.netlib.org/lapack/lawns/lawn138.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn138.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Petitet:1998:NLA,
  author          = {Petitet, A. and Casanova, H. and Dongarra, J. and
                     Robert, Y. and Whaley, R. C.},
  title           = {A Numerical Linear Algebra Problem Solving Environment
                     Designer's Perspective},
  type            = {LAPACK Working Note},
  number          = {139},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {1998},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-98-405, Oct 1998. Published in
                     \cite{Petitet:1999:NLA,Petitet:2000:PDS}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn139.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn139.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Casanova:1998:NVD,
  author          = {Casanova, H. and Dongarra, J.},
  title           = {{NetSolve version 1.2}: Design and Implementation},
  type            = {LAPACK Working Note},
  number          = {140},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = nov,
  year            = {1998},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-98-406, Nov 1998.},
  url             = {http://www.netlib.org/lapack/lawns/lawn140.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn140.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Eijkhout:1998:OIL,
  author          = {Eijkhout, Victor},
  title           = {Overview of Iterative Linear System Solver Packages},
  type            = {LAPACK Working Note},
  number          = {141},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = dec,
  year            = {1998},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-98-411, Dec 1998.},
  url             = {http://www.netlib.org/lapack/lawns/lawn141.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn141.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Arbenz:1999:CPSa,
  author          = {Arbenz, P. and Cleary, A. and Dongarra, J. and
                     Hegland, M.},
  title           = {A Comparison of Parallel Solvers for Diagonally
                     Dominant and General Narrow-Banded Linear Systems},
  type            = {LAPACK Working Note},
  number          = {142},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = feb,
  year            = {1999},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-99-414, Feb 1999. Published in
                     \cite{Arbenz:1999:CPSc}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn142.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn142.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Arbenz:1999:CPSb,
  author          = {Arbenz, P. and Cleary, A. and Dongarra, J. and
                     Hegland, M.},
  title           = {A Comparison of Parallel Solvers for Diagonally
                     Dominant and General Narrow-Banded Linear Systems
                     {II}},
  type            = {LAPACK Working Note},
  number          = {143},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {1999},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-99-415, May 1999.},
  url             = {http://www.netlib.org/lapack/lawns/lawn143.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn143.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Eijkhout:1999:EPI,
  author          = {Eijkhout, V.},
  title           = {On the Existence Problem of Incomplete Factorisation
                     Methods},
  type            = {LAPACK Working Note},
  number          = {144},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = dec,
  year            = {1999},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-99-435, Dec 1999.},
  url             = {http://www.netlib.org/lapack/lawns/lawn144.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn144.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Eijkhout:1999:WMI,
  author          = {Eijkhout, V.},
  title           = {The `weighted modification' incomplete factorisation
                     method},
  type            = {LAPACK Working Note},
  number          = {145},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = dec,
  year            = {1999},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-99-436, Dec 1999.},
  url             = {http://www.netlib.org/lapack/lawns/lawn145.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn145.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Andersen:2000:RFC,
  author          = {Andersen, B. and Gustavson, F. and Wa{\'s}niewski, J.},
  title           = {A recursive formulation of {Cholesky} factorization of
                     a matrix in packed storage},
  type            = {LAPACK Working Note},
  number          = {146},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = may,
  year            = {2000},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-00-441, May 2000. Published in
                     \cite{Andersen:2001:RFC}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn146.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn146.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Whaley:2000:AEO,
  author          = {Whaley, R. C. and Petitet, A. and Dongarra, J.},
  title           = {Automated Empirical Optimization of Software and the
                     {ATLAS Project}},
  type            = {LAPACK Working Note},
  number          = {147},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = sep,
  year            = {2000},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-00-448, September 2000. Published in
                     \cite{Whaley:2001:AEO}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn147.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn147.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Bindel:2000:CGR,
  author          = {Bindel, D. and Demmel, J. and Kahan, W. and Marques, O.},
  title           = {On Computing {Givens} rotations reliably and
                     efficiently},
  type            = {LAPACK Working Note},
  number          = {148},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {2000},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-00-449, October 2000. Published in
                     \cite{Bindel:2002:CGR}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn148.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn148.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Li:2000:DIT,
  author          = {Li, X. and Demmel, J. and Bailey, D. and Henry, G. and
                     Hida, Y. and Iskandar, J. and Kahan, W. and Kapur, A. and
                     Martin, M. and Tung, T. and Yoo, D. J.},
  title           = {Design, Implementation and Testing of Extended and
                     Mixed Precision {BLAS}},
  type            = {LAPACK Working Note},
  number          = {149},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {2000},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-00-451, October 2000. Published in
                     \cite{Li:2002:DIT}.},
  url             = {http://www.netlib.org/lapack/lawns/lawn149.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn149.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Anderson:2000:DPR,
  author          = {Anderson, E.},
  title           = {Discontinuous Plane Rotations and the Symmetric
                     Eigenvalue Problem},
  type            = {LAPACK Working Note},
  number          = {150},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = dec,
  year            = {2000},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-00-454, December 2000.},
  url             = {http://www.netlib.org/lapack/lawns/lawn150.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn150.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Eijkhout:2001:ADM,
  author          = {Eijkhout, V.},
  title           = {Automatic Determination of Matrix-Blocks},
  type            = {LAPACK Working Note},
  number          = {151},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = apr,
  year            = {2001},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-01-458, April 2001.},
  url             = {http://www.netlib.org/lapack/lawns/lawn151.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn151.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Cheng:2001:ILB,
  author          = {Cheng, S. and Higham, N.},
  title           = {Implementation for {LAPACK} of a Block Algorithm for
                     Matrix $1$-Norm Estimation},
  type            = {LAPACK Working Note},
  number          = {152},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {2001},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-01-470, August 2001.},
  url             = {http://www.netlib.org/lapack/lawns/lawn152.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn152.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Fahey:2001:NCP,
  author          = {Fahey, M.},
  title           = {New Complex Parallel Eigenvalue and Eigenvector
                     Routines},
  type            = {LAPACK Working Note},
  number          = {153},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = aug,
  year            = {2001},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-01-471, August 2001.},
  url             = {http://www.netlib.org/lapack/lawns/lawn153.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn153.pdf},
  acknowledgement = ack-nhfb,
}
@TechReport{Dhillon:2002:OER,
  author =       "Inderjit S. Dhillon and Beresford N. Parlett",
  title =        "Orthogonal Eigenvectors and Relative Gaps",
  type =         "LAPACK Working Note",
  number =       "154",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        aug,
  year =         "2002",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-02-474, August 2002. Published in
                 \cite{Dhillon:2004:OER}.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn154.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn154.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Parlett:2002:IDA,
  author =       "Beresford N. Parlett and Osni A. Marques",
  title =        "An implementation of the {$ d q d s $} algorithm
                 (positive case)",
  type =         "LAPACK Working Note",
  number =       "155",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        aug,
  year =         "2002",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "LBNL-43726, UT-CS-02-475, August 2002. Published in
                 \cite{Parlett:2000:IAP}.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn155.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn155.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Eijkhout:2002:PAO,
  author =       "Victor Eijkhout",
  title =        "Polynomial acceleration of optimised multi-grid
                 smoothers; basic theory",
  type =         "LAPACK Working Note",
  number =       "156",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        aug,
  year =         "2002",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-02-477, August 2002.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn156.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn156.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Dongarra:2002:SAN,
  author =       "Jack Dongarra and Victor Eijkhout",
  title =        "Self-adapting Numerical Software for Next Generation
                 Applications",
  type =         "LAPACK Working Note",
  number =       "157",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        aug,
  year =         "2002",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-02-484, August 2002. Published in
                 \cite{Dongarra:2003:SANb}.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn157.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn157.pdf",
  acknowledgement = ack-nhfb,
}
@techreport{Anderson:2002:LFE,
  author          = {Anderson, Edward},
  title           = {{LAPACK3E} --- {A} {Fortran 90}-enhanced version of
                     {LAPACK}},
  type            = {LAPACK Working Note},
  number          = {158},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = dec,
  year            = {2002},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-02-497, December 2002},
  url             = {http://www.netlib.org/lapack/lawns/lawn158.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn158.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Dongarra:2003:FCA,
  author          = {Dongarra, Jack and Eijkhout, Victor},
  title           = {Finite-choice algorithm optimization in {Conjugate
                     Gradients}},
  type            = {LAPACK Working Note},
  number          = {159},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jan,
  year            = {2003},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-03-502, January 2003},
  url             = {http://www.netlib.org/lapack/lawns/lawn159.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn159.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Chen:2003:SAS,
  author          = {Chen, Zizhong and Dongarra, Jack and Luszczek, Piotr and
                     Roche, Kenneth},
  title           = {Self Adapting Software for Numerical Linear Algebra
                     and {LAPACK} for Clusters},
  type            = {LAPACK Working Note},
  number          = {160},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = jan,
  year            = {2003},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-03-499, January 2003},
  url             = {http://www.netlib.org/lapack/lawns/lawn160.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn160.pdf},
  acknowledgement = ack-nhfb,
}
@TechReport{Lucas:2003:LSC,
  author =       "Craig Lucas",
  title =        "{LAPack}-Style Codes for Level 2 and 3 Pivoted
                 {Cholesky} Factorizations",
  type =         "LAPACK Working Note",
  number =       "161",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        feb,
  year =         "2004",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-04-522, February 2004",
  URL =          "http://www.netlib.org/lapack/lawns/lawn161.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn161.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Dhillon:2004:DIM,
  author =       "Inderjit S. Dhillon and Beresford N. Parlett and
                 Christof V{\"o}mel",
  title =        "The Design and Implementation of the {MRRR}
                 Algorithm",
  type =         "LAPACK Working Note",
  number =       "162",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        dec,
  year =         "2004",
  bibdate =      "Fri Apr 22 17:06:37 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-04-541, December, 2004.",
  URL =          "http://www.netlib.org/lapack/lawns/lawn162.ps;
                 http://www.netlib.org/lapack/lawnspdf/lawn162.pdf",
  acknowledgement = ack-nhfb,
}
@techreport{Parlett:2004:HMA,
  author          = {Parlett, Beresford N. and V{\"o}mel, Christof},
  title           = {How the {MRRR} Algorithm Can Fail on Tight Eigenvalue
                     Clusters},
  type            = {LAPACK Working Note},
  number          = {163},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  pages           = {15},
  month           = dec,
  year            = {2004},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-04-542, December, 2004.},
  url             = {http://www.eecs.berkeley.edu/Pubs/TechRpts/2004/CSD-04-1367.pdf;
                     http://www.netlib.org/lapack/lawns/lawn163.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn163.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:2005:LPR,
  author          = {Demmel, Jim and Dongarra, Jack},
  title           = {{LAPACK 2005} Prospectus: Reliable and Scalable
                     Software for Linear Algebra Computations on High End
                     Computers},
  type            = {LAPACK Working Note},
  number          = {164},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = feb,
  year            = {2005},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-05-546, February 2005.},
  url             = {http://www.netlib.org/lapack/lawns/lawn164.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn164.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:2005:EBE,
  author          = {Demmel, James and Hida, Yozo and Kahan, W. and
                     Li, Xiaoye S. and Mukherjee, Soni and Riedy, E. Jason},
  title           = {Error Bounds from Extra Precise Iterative Refinement},
  type            = {LAPACK Working Note},
  number          = {165},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = feb,
  year            = {2005},
  bibdate         = {Fri Apr 22 17:06:37 2005},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {UT-CS-05-547, February 2005.},
  url             = {http://www.netlib.org/lapack/lawns/lawn165.ps;
                     http://www.netlib.org/lapack/lawnspdf/lawn165.pdf},
  acknowledgement = ack-nhfb,
}
@TechReport{Willems:2005:CBS,
  author =       "Paul R. Willems and Bruno Lang and Christof
                 V{\"o}mel",
  title =        "Computing the Bidiagonal {SVD} Using Multiple
                 Relatively Robust Representations",
  type =         "LAPACK Working Note",
  number =       "166",
  institution =  "Computer Science Division, University of California,
                 Berkeley",
  address =      "Berkeley, CA, USA",
  pages =        "20",
  day =          "29",
  month =        aug,
  year =         "2005",
  MRclass =      "15A18, 65-04, 65F15",
  bibdate =      "Mon Mar 20 12:30:00 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Also issued as Technical Report UCB//CSD-05-1376",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn166.pdf",
  abstract =     "We describe the design and implementation of a new
                 algorithm for computing the singular value
                 decomposition of a real bidiagonal matrix. This
                 algorithm uses ideas developed by Gro{\ss}er and Lang
                 that extend Parlett's and Dhillon's MRRR algorithm for
                 the tridiagonal symmetric eigenproblem. One key feature
                 of our new implementation is, that $k$ singular
                 triplets can be computed using only {$ O(n k) $}
                 storage units and floating point operations, where $n$
                 is the dimension of the matrix. The algorithm will be
                 made available as routine xBDSCR in the upcoming new
                 release of the LAPACK library.",
  acknowledgement = ack-nhfb,
  keywords =     "Bidiagonal Singular Value Decomposition; Coupling
                 Relations; LAPACK library; MRRR algorithm; Tridiagonal
                 Symmetric Eigenproblem",
}
@techreport{Marques:2005:SCM,
  author          = {Marques, Osni A. and Parlett, Beresford N. and
                     V{\"o}mel, Christof},
  title           = {Subset Computations with the {MRRR} Algorithm},
  type            = {LAPACK Working Note},
  number          = {167},
  institution     = {Computer Science Division, University of California,
                     Berkeley},
  address         = {Berkeley, CA, USA},
  pages           = {9},
  day             = {26},
  month           = sep,
  year            = {2005},
  bibdate         = {Mon Mar 20 12:30:00 2006},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {Also issued as Technical Report UCB//CSD-05-1392},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn167.pdf},
  abstract        = {The main advantage of inverse iteration over the QR
                     algorithm and Divide \& Conquer for the symmetric
                     tridiagonal eigenproblem is that subsets of eigenpairs
                     can be computed at reduced cost.
                     The MRRR algorithm (MRRR = Multiple Relatively Robust
                     Representations) is a clever variant of inverse
                     iteration without the need for reorthogonalization.
                     {\tt stegr}, the current version of MRRR in LAPACK 3.0,
                     does not allow for subset computations. The next
                     release of {\tt stegr} is designed to compute a
                     (sub-)set of $k$ eigenpairs with {$ O(k n) $}
                     operations.
                     Because of the special way in which eigenvectors are
                     computed, MRRR subset computations are more complicated
                     than when using inverse iteration. Unlike the latter,
                     MRRR sometimes cannot ignore the unwanted part of the
                     spectrum.
                     We describe the problems with what we call false
                     singletons. These are eigenvalues that appear to be
                     isolated with respect to the wanted eigenvalues but in
                     fact belong to a tight cluster of unwanted eigenvalues.
                     This paper analyzes these complications and ways to
                     deal with them.},
  acknowledgement = ack-nhfb,
  keywords        = {false singleton; Multiple relatively robust
                     representations; numerically orthogonal eigenvectors;
                     subset computation; symmetric tridiagonal matrix},
}
@TechReport{Antonelli:2005:PSP,
  author =       "Dominic Antonelli and Christof V{\"o}mel",
  title =        "{PDSYEVR}. {ScaLAPACK}'s Parallel {MRRR} Algorithm for
                 the Symmetric Eigenvalue Problem",
  type =         "LAPACK Working Note",
  number =       "168",
  institution =  "Computer Science Division, University of California,
                 Berkeley",
  address =      "Berkeley, CA, USA",
  pages =        "18",
  day =          "29",
  month =        aug,
  year =         "2005",
  MRclass =      "65F15, 65Y15",
  bibdate =      "Mon Mar 20 12:30:00 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Also issued as Technical Report UCB//CSD-05-1399.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn168.pdf",
  abstract =     "In the 90s, Dhillon and Parlett devised a new
                 algorithm (Multiple Relatively Robust Representations,
                 MRRR) for computing numerically orthogonal eigenvectors
                 of a symmetric tridiagonal matrix {$T$} with {$ O(n^2)
                 $} cost. In this paper, we describe the design of
                 PDSYEVR, a ScaLAPACK implementation of the MRRR
                 algorithm to compute the eigenpairs in parallel. It
                 represents a substantial improvement over the symmetric
                 eigensolver PDSYEVX that is currently in ScaLAPACK and
                 is going to be part of the next ScaLAPACK release.",
  acknowledgement = ack-nhfb,
  keywords =     "design; implementation; Multiple relatively robust
                 representations; numerical software; parallel
                 computation; ScaLAPACK; symmetric eigenvalue problem",
}
@TechReport{Drmac:2005:NFA,
  author =       "Zlatko Drma{\v{c}} and Kre{\v{s}}imir Veseli{\'c}",
  title =        "New Fast and Accurate {Jacobi} {SVD} Algorithm: {I}",
  type =         "LAPACK Working Note",
  number =       "169",
  institution =  "Department of Mathematics, University of Zagreb",
  address =      "Bijeni{\v{c}}ka 30, 10000 Zagreb, Croatia",
  pages =        "39",
  day =          "30",
  month =        aug,
  year =         "2005",
  MRclass =      "15A09, 15A12, 15A18, 15A23, 65F15, 65F22, 65F35",
  bibdate =      "Mon Mar 20 12:30:00 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn169.pdf",
  abstract =     "This paper is the result of contrived efforts to break
                 the barrier between numerical accuracy and run time
                 efficiency in computing the fundamental decomposition
                 of numerical linear algebra --- the singular value
                 decomposition (SVD) of a general dense matrix. It is an
                 unfortunate fact that the numerically most accurate one
                 sided Jacobi SVD algorithm is several times slower than
                 generally less accurate bidiagonalization based methods
                 such as the QR or the divide and conquer algorithm.
                 Despite its sound numerical qualities, the Jacobi SVD
                 is not included in the state of the art matrix
                 computation libraries and it is even considered
                 obsolete by some leading researches. Our quest for a
                 highly accurate and efficient SVD algorithm has led us
                 to a new, superior variant of the Jacobi algorithm. The
                 new algorithm has inherited all good high accuracy
                 properties, and it outperforms not only the best
                 implementations of the one sided Jacobi algorithm but
                 also the QR algorithm. Moreover, it seems that the
                 potential of the new approach is yet to be fully
                 exploited.",
  acknowledgement = ack-nhfb,
  keywords =     "eigenvalues; Jacobi method; singular value
                 decomposition",
}
@TechReport{Drmac:2005:NFAb,
  author =       "Zlatko Drma{\v{c}} and Kre{\v{s}}imir Veseli{\'c}",
  title =        "New Fast and Accurate {Jacobi} {SVD} Algorithm: {II}",
  type =         "LAPACK Working Note",
  number =       "170",
  institution =  "Department of Mathematics, University of Zagreb",
  address =      "Bijeni{\v{c}}ka 30, 10000 Zagreb, Croatia",
  pages =        "25",
  day =          "30",
  month =        aug,
  year =         "2005",
  bibdate =      "Mon Mar 20 12:30:00 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn170.pdf",
  abstract =     "This paper presents new implementation of one sided
                 Jacobi SVD for triangular matrices and its use as the
                 core routine in a new preconditioned Jacobi SVD
                 algorithm, recently proposed by the authors. New pivot
                 strategy exploits the triangular form and uses the fact
                 that the input triangular matrix is the result of rank
                 revealing QR factorization. If used in the
                 preconditioned Jacobi SVD algorithm, described in the
                 first part of this report, it delivers superior
                 performance leading to the currently fastest method for
                 computing SVD decomposition with high relative
                 accuracy. Furthermore, the efficiency of the new
                 algorithm is comparable to the less accurate
                 bidiagonalization based methods. The paper also
                 discusses underflow issues in floating point
                 implementation, and shows how to use perturbation
                 theory to fix the imperfectness of machine arithmetic
                 on some systems.",
  acknowledgement = ack-nhfb,
  keywords =     "eigenvalues; Jacobi method; singular value
                 decomposition; underflow",
}
@TechReport{Kressner:2006:BAR,
  author =       "Daniel Kressner",
  title =        "Block Algorithms for Reordering Standard and
                 Generalized {Schur} Forms",
  type =         "LAPACK Working Note",
  number =       "171",
  institution =  "Department of Mathematics, University of Zagreb",
  address =      "Bijeni{\v{c}}ka 30, 10000 Zagreb, Croatia",
  pages =        "11",
  day =          "17",
  month =        feb,
  year =         "2006",
  MRclass =      "65F15, 65Y20",
  bibdate =      "Mon Mar 20 12:30:00 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn171.pdf",
  abstract =     "Block algorithms for reordering a selected set of
                 eigenvalues in a standard or generalized Schur form are
                 proposed. Efficiency is achieved by delaying orthogonal
                 transformations and (optionally) making use of level 3
                 BLAS operations. Numerical experiments demonstrate that
                 existing algorithms, as currently implemented in
                 LAPACK, are outperformed by up to a factor of four.",
  acknowledgement = ack-nhfb,
  keywords =     "deflating subspace; invariant subspace; reordering;
                 Schur form",
}
@TechReport{Marques:2005:BIF,
  author =       "Osni A. Marques and E. Jason Riedy and Christof
                 V{\"o}mel",
  title =        "Benefits of {IEEE-754} Features in Modern Symmetric
                 Tridiagonal Eigensolvers",
  type =         "LAPACK Working Note",
  number =       "172",
  institution =  "Computer Science Division, University of California,
                 Berkeley",
  address =      "Berkeley, CA, USA",
  pages =        "22",
  day =          "30",
  month =        sep,
  year =         "2005",
  MRclass =      "15A18, 15A23",
  bibdate =      "Mon Mar 20 12:18:56 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Also issued as Technical Report UCB//CSD-05-1414.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn172.pdf",
  abstract =     "Bisection is one of the most common methods used to
                 compute the eigenvalues of symmetric tridiagonal
                 matrices. Bisection relies on the Sturm count: for a
                 given shift $ \sigma $, the number of negative pivots
                 in the factorization {$ T - \sigma I = L D L^T $} equals
                 the number of eigenvalues of {$T$} that are smaller
                 than $ \sigma $. In IEEE-754 arithmetic, the value $
                 \infty $ permits the computation to continue past a
                 zero pivot, producing a correct Sturm count when {$T$}
                 is unreduced. Demmel and Li showed in the 90s that
                 using $ \infty $ rather than testing for zero pivots
                 within the loop could improve performance significantly
                 on certain architectures.
                 When eigenvalues are to be computed to high relative
                 accuracy, it is often preferable to work with {$ L D
                 L^T $} factorizations instead of the original
                 tridiagonal {$T$}, see for example the MRRR algorithm.
                 In these cases, the Sturm count has to be computed from
                 {$ L D L^T $}. The differential stationary and
                 progressive qds algorithms are the methods of
                 choice.
                 While it seems trivial to replace {$T$} by {$ L D L^T
                 $}, in reality these algorithms are more complicated:
                 in IEEE-754 arithmetic, a zero pivot produces an
                 overflow, followed by an invalid exception (NaN), that
                 renders the Sturm count incorrect.
                 We present alternative, safe formulations that are
                 guaranteed to produce the correct result.
                 Benchmarking these algorithms on a variety of platforms
                 shows that the original formulation without tests is
                 always faster provided no exception occurs. The
                 transforms see speed-ups of up to $ 2.6 \times $ over
                 the careful formulations.
                 Tests on industrial matrices show that encountering
                 exceptions in practice is rare. This leads to the
                 following design: First, compute the Sturm count by the
                 fast but unsafe algorithm. Then, if an exception
                 occurred, recompute the count by a safe, slower
                 alternative. The new Sturm count algorithms improve the
                 speed of bisection by up to $ 2 \times $ on our test
                 matrices. Furthermore, unlike the traditional
                 tiny-pivot substitution, proper use of IEEE-754
                 features provides a careful formulation that imposes no
                 input range restrictions.",
  acknowledgement = ack-nhfb,
  keywords =     "differential qds algorithms; IEEE-754 arithmetic;
                 IEEE-754 performance; LAPACK; MRRR algorithm; NaN
                 arithmetic",
}
@techreport{Kaagstrom:2006:MVQ,
  author          = {Bo K{\aa}gstr{\"o}m and Daniel Kressner},
  title           = {Multishift Variants of the {$ Q Z $} Algorithm with
                     Aggressive Early Deflation},
  type            = {LAPACK Working Note},
  number          = {173},
  institution     = {Department of Computing Science, Ume{\aa} University},
  address         = {Ume{\aa}, Sweden},
  year            = {2006},
  month           = feb,
  day             = {20},
  pages           = {42},
  note            = {Also appeared as technical report UMINF-05.11},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn173.pdf},
  mrclass         = {65F15, 15A18, 15A22, 47A75},
  keywords        = {aggressive early deflation; blocked algorithms;
                     Generalized eigenvalue problem; generalized Schur form;
                     multishifts; QZ algorithm},
  abstract        = {New variants of the QZ algorithm for solving the
                     generalized eigenvalue problem are proposed. An
                     extension of the small-bulge multishift QR algorithm is
                     developed, which chases chains of many small bulges
                     instead of only one bulge in each QZ iteration. This
                     allows the effective use of level 3 BLAS operations,
                     which in turn can provide efficient utilization of high
                     performance computing systems with deep memory
                     hierarchies. Moreover, an extension of the aggressive
                     early deflation strategy is proposed, which can
                     identify and deflate converged eigenvalues long before
                     classic deflation strategies would. Consequently, the
                     number of overall QZ iterations needed until
                     convergence is considerably reduced. As a third
                     ingredient, we reconsider the deflation of infinite
                     eigenvalues and present a new deflation algorithm,
                     which is particularly effective in the presence of a
                     large number of infinite eigenvalues. Combining all
                     these developments, our implementation significantly
                     improves existing implementations of the QZ algorithm.
                     This is demonstrated by numerical experiments with
                     random matrix pairs as well as with matrix pairs
                     arising from various applications.},
  acknowledgement = ack-nhfb,
  bibdate         = {Mon Mar 20 12:30:00 2006},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@TechReport{Howell:2005:CEB,
  author =       "G. W. Howell and J. W. Demmel and C. T. Fulton and S.
                 Hammarling and K. Marmol",
  title =        "Cache Efficient Bidiagonalization Using {BLAS 2.5}
                 Operators",
  type =         "LAPACK Working Note",
  number =       "174",
  institution =  "North Carolina State University; University of
                 California, Berkeley; Florida Institute of Technology;
                 Numerical Algorithms Group; Harris Corporation",
  address =      "Raleigh, NC 27697, USA; Berkeley, CA 94720, USA;
                 Melbourne, FL 32901, USA; Oxford, UK; Melbourne, FL
                 32901, USA",
  pages =        "39",
  day =          "1",
  month =        nov,
  year =         "2005",
  bibdate =      "Mon Mar 20 12:30:00 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn174.pdf",
  abstract =     "In this paper we reorganize the sequence of operations
                 for Householder bidiagonalization of a general $ m
                 \times n $ matrix, so that two (\_GEMV) vector-matrix
                 multiplications can be done with one pass of the
                 unreduced trailing part of the matrix through cache.
                 Two new BLAS 2.5 operations approximately cut in half
                 the transfer of data from main memory to cache. We give
                 detailed algorithm descriptions and compare timings
                 with the current LAPACK bidiagonalization algorithm.",
  acknowledgement = ack-nhfb,
}
@techreport{Langou:2006:EPB,
  author          = {Julie Langou and Julien Langou and Piotr Luszczek and
                     Jakub Kurzak and Alfredo Buttari and Jack Dongarra},
  title           = {Exploiting the Performance of 32 bit Floating Point
                     Arithmetic in Obtaining 64 bit Accuracy (Revisiting
                     Iterative Refinement for Linear Systems)},
  type            = {LAPACK Working Note},
  number          = {175},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  year            = {2006},
  month           = jun,
  pages           = {17},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn175.pdf;
                     http://www.netlib.org/lapack/lawnspdf/lawn175.ps},
  abstract        = {Recent versions of microprocessors exhibit performance
                     characteristics for 32 bit floating point arithmetic
                     (single precision) that is substantially higher than 64
                     bit floating point arithmetic (double precision).
                     Examples include the Intel's Pentium IV and M
                     processors, AMD's Opteron architectures and the IBM's
                     Cell Broad Engine processor. When working in single
                     precision, floating point operations can be performed
                     up to two times faster on the Pentium and up to ten
                     times faster on the Cell over double precision. The
                     performance enhancements in these architectures are
                     derived by accessing extensions to the basic
                     architecture, such as SSE2 in the case of the Pentium
                     and the vector functions on the IBM Cell. The
                     motivation for this paper is to exploit single
                     precision operations whenever possible and resort to
                     double precision at critical stages while attempting to
                     provide the full double precision results. The results
                     described here are fairly general and can be applied to
                     various problems in linear algebra such as solving
                     large sparse systems, using direct or iterative methods
                     and some eigenvalue problems. There are limitations to
                     the success of this process, such as when the
                     conditioning of the problem exceeds the reciprocal of
                     the accuracy of the single precision computations. In
                     that case the double precision algorithm should be
                     used.},
  acknowledgement = ack-nhfb,
  bibdate         = {Mon Oct 09 12:05:43 2006},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@techreport{Drmac:2006:FRR,
  author          = {Zlatko Drma{\v{c}} and Zvonimir Bujanovi{\'c}},
  title           = {On the failure of rank revealing {$ Q R $}
                     factorization software --- a case study},
  type            = {LAPACK Working Note},
  number          = {176},
  institution     = {Department of Mathematics, University of Zagreb},
  address         = {Bijeni{\v{c}}ka 30, 10000 Zagreb, Croatia},
  year            = {2006},
  month           = jun,
  day             = {2},
  pages           = {27},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn176.pdf;
                     http://www.netlib.org/lapack/lawnspdf/lawn176.ps},
  abstract        = {This note reports an unexpected and rather erratic
                     behavior of the LAPACK software implementation of the
                     QR factorization with Businger--Golub column pivoting.
                     It is shown that, due to finite precision arithmetic,
                     software implementation of the factorization can
                     catastrophically fail to produce triangular factor with
                     the structure characteristic to the Businger--Golub
                     pivot strategy. The failure of current state of the art
                     software, and a proposed alternative implementations
                     are analyzed in detail.},
  acknowledgement = ack-nhfb,
  bibdate         = {Mon Oct 09 12:05:43 2006},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@TechReport{Kurzak:2006:IMP,
  author =       "Jakub Kurzak and Jack Dongarra",
  title =        "Implementation of the Mixed-Precision High Performance
                 {LINPACK} Benchmark on the {CELL} Processor",
  type =         "LAPACK Working Note",
  number =       "177",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  pages =        "12",
  month =        sep,
  year =         "2006",
  bibdate =      "Mon Oct 09 12:05:43 2006",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Also available as UT-CS-06-580.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn177.pdf;
                 http://www.netlib.org/lapack/lawnspdf/lawn177.ps",
  abstract =     "This paper describes the design concepts behind
                 implementations of mixed-precision linear algebra
                 routines targeted for the Cell processor. It describes
                 in detail the implementation of code to solve linear
                 system of equations using Gaussian elimination in
                 single precision with iterative refinement of the
                 solution to the full double precision accuracy. By
                 utilizing this approach the algorithm achieves close to
                 an order of magnitude higher performance on the Cell
                 processor than the performance offered by the standard
                 double precision algorithm. Effectively the code is an
                 implementation of the high performance LINPACK
                 benchmark, since it meets all the requirements
                 concerning the problem being solved and the numerical
                 properties of the solution.",
  acknowledgement = ack-nhfb,
}
@techreport{Kurzak:2006:ILA,
  author          = {Jakub Kurzak and Jack Dongarra},
  title           = {Implementing Linear Algebra Routines on Multi-Core
                     Processors with Pipelining and a Look Ahead},
  type            = {LAPACK Working Note},
  number          = {178},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  year            = {2006},
  month           = sep,
  pages           = {11},
  note            = {Also available as UT-CS-06-581.},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn178.pdf;
                     http://www.netlib.org/lapack/lawnspdf/lawn178.ps},
  keywords        = {linear algebra; look ahead; multi-core processors;
                     pipelining},
  abstract        = {Linear algebra algorithms commonly encapsulate
                     parallelism in Basic Linear Algebra Subroutines (BLAS).
                     This solution relies on the fork-join model of parallel
                     execution, which may result in suboptimal performance
                     on current and future generations of multi-core
                     processors. To overcome the shortcomings of this
                     approach a pipelined model of parallel execution is
                     presented, and the idea of the look ahead is utilized
                     in order to suppress the negative effects of sequential
                     formulation of the algorithms. Application to one-sided
                     matrix factorizations, LU, Cholesky and QR, is
                     described. Shared memory implementation using POSIX
                     threads is presented.},
  acknowledgement = ack-nhfb,
  bibdate         = {Mon Oct 09 12:05:43 2006},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@techreport{Baboulin:2006:PTS,
  author          = {Marc Baboulin and Luc Giraud and Serge Gratton and
                     Julien Langou},
  title           = {Parallel tools for solving incremental dense least
                     squares problems. Application to space geodesy},
  type            = {LAPACK Working Note},
  number          = {179},
  institution     = {CERFACS},
  address         = {42 avenue Gaspard Coriolis, 31057 Toulouse Cedex,
                     France},
  year            = {2006},
  month           = sep,
  note            = {Also available as UT-CS-06-582.},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn179.pdf;
                     http://www.netlib.org/lapack/lawnspdf/lawn179.ps},
  keywords        = {dense linear algebra; gravity field computation;
                     parallel distributed algorithms; QR factorization;
                     ScaLAPACK; scientific computing},
  abstract        = {We present a parallel distributed solver that enables
                     us to solve incremental dense least squares arising in
                     some parameter estimation problems. This solver is
                     based on ScaLAPACK [8] and PBLAS [9] kernel routines.
                     In the incremental process, the observations are
                     collected periodically and the solver updates the
                     solution with new observations using a QR factorization
                     algorithm. It uses a recently defined distributed
                     packed format [3] that handles symmetric or triangular
                     matrices in ScaLAPACK-based implementations. We provide
                     performance analysis on IBM pSeries 690. We also
                     present an example of application in the area of space
                     geodesy for gravity field computations with some
                     experimental results.},
  acknowledgement = ack-nhfb,
  bibdate         = {Mon Oct 09 12:05:43 2006},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@TechReport{Buttari:2006:UMP,
  author =       "Alfredo Buttari and Jack J. Dongarra and Jakub Kurzak
                 and Piotr Luszczek and Stanimire Tomov",
  title =        "Using Mixed Precision for Sparse Matrix Computations
                 to Enhance the Performance while Achieving 64-bit
                 Accuracy",
  type =         "LAPACK Working Note",
  number =       "180",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  day =          "22",
  month =        oct,
  year =         "2006",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Also available as UT-CS-06-584.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn180.pdf",
  abstract =     "By using a combination of 32-bit and 64-bit floating
                 point arithmetic the performance of many sparse linear
                 algebra algorithms can be significantly enhanced while
                 maintaining the 64-bit accuracy of the resulting
                 solution. These ideas can be applied to sparse
                 multifrontal and supernodal direct techniques, and
                 sparse iterative techniques such as Krylov subspace
                 methods. The approach presented here can apply not only
                 to conventional processors but also to exotic
                 technologies such as Field Programmable Gate Arrays
                 (FPGA), Graphical Processing Units (GPU), and the Cell
                 BE processor.",
  acknowledgement = ack-nhfb,
}
@TechReport{Demmel:2007:PNL,
  author =       "James W. Demmel and Jack J. Dongarra and Beresford N.
                 Parlett and William Kahan and Ming Gu and David S.
                 Bindel and Yozo Hida and Xiaoye S. Li and Osni A.
                 Marques and E. Jason Riedy and Christof V{\"o}mel and
                 Julien Langou and Piotr Luszczek and Jakub Kurzak and
                 Alfredo Buttari and Julie Langou and Stanimire Tomov",
  title =        "Prospectus for the Next {LAPACK} and {ScaLAPACK}
                 Libraries",
  type =         "LAPACK Working Note",
  number =       "181",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  day =          "11",
  month =        mar,
  year =         "2007",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Also available as UT-CS-07-592.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn181.pdf",
  acknowledgement = ack-nhfb,
}
@techreport{Demmel:2007:TIL,
  author          = {James W. Demmel and Osni A. Marques and Beresford N.
                     Parlett and Christof V{\"o}mel},
  title           = {A Testing Infrastructure for {LAPACK}'s Symmetric
                     Eigensolvers},
  type            = {LAPACK Working Note},
  number          = {182},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  year            = {2007},
  month           = apr,
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn182.pdf},
  mrclass         = {15A18, 15A23},
  abstract        = {LAPACK is often mentioned as a positive example of a
                     software library that encapsulates complex, robust, and
                     widely used numerical algorithms for a wide range of
                     applications. At installation time, the user has the
                     option of running a (limited) number of test cases to
                     verify the integrity of the installation process. On
                     the algorithm developer's side, however, more
                     exhaustive tests are usually performed to study
                     algorithm behavior on a variety of problem settings and
                     also computer architectures. In this process, difficult
                     test cases need to be found that reflect particular
                     challenges of an application or push algorithms to
                     extreme behavior. These tests are then assembled into a
                     comprehensive collection, therefore making it possible
                     for any new or competing algorithm to be stressed in a
                     similar way. This note describes such an infrastructure
                     for exhaustively testing the symmetric tridiagonal
                     eigensolvers implemented in LAPACK. It consists of two
                     parts: a selection of carefully chosen test matrices
                     with particular idiosyncrasies and a portable testing
                     framework that allows easy testing and data processing.
                     The tester facilitates experiments with algorithmic
                     choices, parameter and threshold studies, and
                     performance comparisons on different architectures.},
  acknowledgement = ack-nhfb,
  bibdate         = {Fri Apr 24 12:25:43 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@techreport{Demmel:2007:PAL,
  author          = {James W. Demmel and Osni A. Marques and Beresford N.
                     Parlett and Christof V{\"o}mel},
  title           = {Performance and Accuracy of {LAPACK}'s Symmetric
                     Tridiagonal Eigensolvers},
  type            = {LAPACK Working Note},
  number          = {183},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  year            = {2007},
  month           = apr,
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn183.pdf},
  mrclass         = {15A18, 15A23},
  abstract        = {We compare four algorithms from the latest LAPACK 3.1
                     release for computing eigenpairs of a symmetric
                     tridiagonal matrix. These include QR iteration,
                     bisection and inverse iteration (BI), the
                     Divide-and-Conquer method (DC), and the method of
                     Multiple Relatively Robust Representations
                     (MR).\par
                     Our evaluation considers speed and accuracy when
                     computing all eigenpairs, and additionally subset
                     computations. Using a variety of carefully selected
                     test problems, our study includes a variety of today's
                     computer architectures.\par
                     Our conclusions can be summarized as follows. (1) DC
                     and MR are generally much faster than QR and BI on
                     large matrices. (2) MR almost always does the fewest
                     floating point operations, but at a lower MFlop rate
                     than all the other algorithms. (3) The exact
                     performance of MR and DC strongly depends on the matrix
                     at hand. (4) DC and QR are the most accurate algorithms
                     with observed accuracy {$ O(\sqrt {n} \epsilon) $}. The
                     accuracy of BI and MR is generally {$ O(n \epsilon) $}.
                     (5) MR is preferable to BI for subset computations.},
  acknowledgement = ack-nhfb,
  bibdate         = {Fri Apr 24 12:25:43 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@TechReport{Kurzak:2007:SSL,
  author =       "Jakub Kurzak and Alfredo Buttari and Jack J.
                 Dongarra",
  title =        "Solving Systems of Linear Equations on the {CELL}
                 Processor Using {Cholesky} Factorization",
  type =         "LAPACK Working Note",
  number =       "184",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        may,
  year =         "2007",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-07-596",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn184.pdf",
  abstract =     "The STI CELL processor introduces pioneering solutions
                 in processor architecture. At the same time it presents
                 new challenges for the development of numerical
                 algorithms. One is effective exploitation of the
                 differential between the speed of single and double
                 precision arithmetic; the other is efficient
                 parallelization between the short vector SIMD cores. In
                 this work, the first challenge is addressed by
                 utilizing a mixed-precision algorithm for the solution
                 of a dense symmetric positive definite system of linear
                 equations, which delivers double precision accuracy,
                 while performing the bulk of the work in single
                 precision. The second challenge is approached by
                 introducing much finer granularity of parallelization
                 than has been used for other architectures and using a
                 lightweight decentralized synchronization. The
                 implementation of the computationally intensive
                 sections gets within 90 percent of peak floating point
                 performance, while the implementation of the memory
                 intensive sections reaches within 90 percent of peak
                 memory bandwidth. On a single CELL processor, the
                 algorithm achieves over 170 Gflop/s when solving a
                 symmetric positive definite system of linear equations
                 in single precision and over 150 Gflop/s when
                 delivering the result in double precision accuracy.",
  acknowledgement = ack-nhfb,
  keywords =     "CELL BE; Cholesky factorization; iterative refinement;
                 mixed-precision algorithms",
}
@TechReport{Buttari:2007:LPH,
  author =       "Alfredo Buttari and Jack J. Dongarra and Jakub
                 Kurzak",
  title =        "Limitations of the {PlayStation 3} for High
                 Performance Cluster Computing",
  type =         "LAPACK Working Note",
  number =       "185",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        may,
  year =         "2007",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-07-597",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn185.pdf",
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 186 (dated 18 May 2007).  The note field
%%% cross-references the published version via \cite{Demmel:2007:FLAb},
%%% so that key has to be defined in the bibliography database for the
%%% citation to resolve when this entry is typeset.
@TechReport{Demmel:2007:FLAa,
  author =       "James W. Demmel and Ioana Dumitriu and Olga Holtz",
  title =        "Fast Linear Algebra is Stable",
  type =         "LAPACK Working Note",
  number =       "186",
  institution =  inst-UCB-EECS,
  address =      inst-UCB-EECS:adr,
  day =          "18",
  month =        may,
  year =         "2007",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Published in \cite{Demmel:2007:FLAb}.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn186.pdf",
  abstract =     "In [23] we showed that a large class of fast recursive
                 matrix multiplication algorithms is stable in a
                 normwise sense, and that in fact if multiplication of
                 $n$-by-$n$ matrices can be done by any algorithm in {$
                 O(n^{\omega + \eta }) $} operations for any $ \eta > 0
                 $, then it can be done stably in {$ O(n^{\omega + \eta
                 }) $} operations for any $ \eta > 0 $. Here we extend
                 this result to show that essentially all standard
                 linear algebra operations, including LU decomposition,
                 QR decomposition, linear equation solving, matrix
                 inversion, solving least squares problems,
                 (generalized) eigenvalue problems and the singular
                 value decomposition can also be done stably (in a
                 normwise sense) in {$ O(n^{\omega + \eta }) $}
                 operations.",
  acknowledgement = ack-nhfb,
}
@TechReport{Byers:2007:LXT,
  author =       "Ralph Byers",
  title =        "{LAPACK 3.1 xHSEQR}: Tuning and Implementation Notes
                 on the Small Bulge Multi-shift {$ Q R $} Algorithm with
                 Aggressive Early Deflation",
  type =         "LAPACK Working Note",
  number =       "187",
  institution =  "Department of Mathematics, University of Kansas",
  address =      "Lawrence, Kansas 66045, USA",
  month =        may,
  year =         "2007",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn187.pdf",
  abstract =     "This note documents implementation details of the
                 small bulge, multi-shift QR algorithm with aggressive
                 early deflation that appears as LAPACK version 3.1
                 programs CHSEQR, DHSEQR, SHSEQR and ZHSEQR and the
                 subroutines they call. These codes calculate
                 eigenvalues and optionally a Schur factorization of a
                 Hessenberg matrix. They do the bulk of the work
                 required to calculate eigenvalues and optionally
                 eigenvectors of a general non-symmetric matrix. This
                 report is intended to provide some guidance for setting
                 the machine dependent tuning parameters, to help
                 maintainers to identify and correct problems, and to
                 help developers improve upon this implementation.",
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 188 (dated 30 May 2007).  The note field
%%% cross-references the published version via \cite{Demmel:2009:EPI},
%%% so that key has to be defined in the bibliography database for the
%%% citation to resolve when this entry is typeset.
@TechReport{Demmel:2007:EPI,
  author =       "James W. Demmel and Yozo Hida and Xiaoye S. Li and E.
                 Jason Riedy",
  title =        "Extra-precise Iterative Refinement for Overdetermined
                 Least Squares Problems",
  type =         "LAPACK Working Note",
  number =       "188",
  institution =  inst-UCB-EECS,
  address =      inst-UCB-EECS:adr,
  day =          "30",
  month =        may,
  year =         "2007",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Published in \cite{Demmel:2009:EPI}.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn188.pdf",
  abstract =     "We present the algorithm, error bounds, and numerical
                 results for extra-precise iterative refinement applied
                 to overdetermined linear least squares (LLS) problems.
                 We apply our linear system refinement algorithm to
                 Bj{\"o}rck's augmented linear system formulation of an
                 LLS problem. Our algorithm reduces the forward normwise
                 and componentwise errors to {$ O(\epsilon) $} unless
                 the system is too ill conditioned. In contrast to
                 linear systems, we provide two separate error bounds
                 for the solution $x$ and the residual $r$. The
                 refinement algorithm requires only limited use of extra
                 precision and adds only {$ O(m n) $} work to the {$ O(m
                 n^2) $} cost of QR factorization for problems of size
                 $m$-by-$n$. The extra precision calculation is
                 facilitated by the new extended-precision BLAS standard
                 in a portable way, and the refinement algorithm will be
                 included in a future release of LAPACK and can be
                 extended to the other types of least squares
                 problems.",
  acknowledgement = ack-nhfb,
}
@TechReport{Alvaro:2008:FSS,
  author =       "Wesley Alvaro and Jakub Kurzak and Jack J. Dongarra",
  title =        "Fast and Small Short Vector {SIMD} Matrix
                 Multiplication Kernels for the {CELL} Processor",
  type =         "LAPACK Working Note",
  number =       "189",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        jan,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-08-609",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn189.pdf",
  abstract =     "Matrix multiplication is one of the most common
                 numerical operations, especially in the area of dense
                 linear algebra, where it forms the core of many
                 important algorithms, including solvers of linear
                 systems of equations, least square problems, and
                 singular and eigenvalue computations. The STI CELL
                 processor exceeds the capabilities of any other
                 processor available today in terms of peak single
                 precision, floating point performance. In order to
                 fully exploit the potential of the CELL processor for a
                 wide range of numerical algorithms, fast implementation
                 of the matrix multiplication operation is essential.
                 The crucial component is the matrix multiplication
                 kernel crafted for the short vector Single Instruction
                 Multiple Data architecture of the Synergistic
                 Processing Element of the CELL processor. In this
                 paper, single precision matrix multiplication kernels
                 are presented implementing the {$ C = C - A \times B^T
                 $} operation and the {$ C = C - A \times B $} operation
                 for matrices of size $ 64 \times 64 $ elements. For the
                 latter case, the performance of 25.55 Gflop/s is
                 reported, or 99.80 percent of the peak, using as little
                 as 5.9 KB of storage for code and auxiliary data
                 structures.",
  acknowledgement = ack-nhfb,
  keywords =     "CELL BE; matrix multiplication; SGEMM; short vector
                 SIMD; SPE",
}
%%% LAPACK Working Note 190.  The note field carries a report number
%%% (UT-CS-07-598) and cross-references the published version via
%%% \cite{Buttari:2008:PTF}, so that key has to be defined in the
%%% bibliography database for the citation to resolve.
@TechReport{Buttari:2007:PTQ,
  author =       "Alfredo Buttari and Julien Langou and Jakub Kurzak and
                 Jack J. Dongarra",
  title =        "Parallel Tiled {$ Q R $} Factorization for Multicore
                 Architectures",
  type =         "LAPACK Working Note",
  number =       "190",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        jul,
  year =         "2007",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-07-598. Published in \cite{Buttari:2008:PTF}.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn190.pdf",
  abstract =     "As multicore systems continue to gain ground in the
                 High Performance Computing world, linear algebra
                 algorithms have to be reformulated or new algorithms
                 have to be developed in order to take advantage of the
                 architectural features on these new processors. Fine
                 grain parallelism becomes a major requirement and
                 introduces the necessity of loose synchronization in
                 the parallel execution of an operation. This paper
                 presents an algorithm for the QR factorization where
                 the operations can be represented as a sequence of
                 small tasks that operate on square blocks of data.
                 These tasks can be dynamically scheduled for execution
                 based on the dependencies among them and on the
                 availability of computational resources. This may
                 result in an out of order execution of the tasks which
                 will completely hide the presence of intrinsically
                 sequential tasks in the factorization. Performance
                 comparisons are presented with the LAPACK algorithm for
                 QR factorization where parallelism can only be
                 exploited at the level of the BLAS operations.",
  acknowledgement = ack-nhfb,
}
@TechReport{Buttari:2007:CPT,
  author =       "Alfredo Buttari and Julien Langou and Jakub Kurzak and
                 Jack J. Dongarra",
  title =        "A Class of Parallel Tiled Linear Algebra Algorithms
                 for Multicore Architectures",
  type =         "LAPACK Working Note",
  number =       "191",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        sep,
  year =         "2007",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-07-600",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn191.pdf",
  abstract =     "As multicore systems continue to gain ground in the
                 High Performance Computing world, linear algebra
                 algorithms have to be reformulated or new algorithms
                 have to be developed in order to take advantage of the
                 architectural features on these new processors. Fine
                 grain parallelism becomes a major requirement and
                 introduces the necessity of loose synchronization in
                 the parallel execution of an operation. This paper
                 presents an algorithm for the Cholesky, LU and QR
                 factorization where the operations can be represented
                 as a sequence of small tasks that operate on square
                 blocks of data. These tasks can be dynamically
                 scheduled for execution based on the dependencies among
                 them and on the availability of computational
                 resources. This may result in an out of order execution
                 of the tasks which will completely hide the presence of
                 intrinsically sequential tasks in the factorization.
                 Performance comparisons are presented with the LAPACK
                 algorithms where parallelism can only be exploited at
                 the level of the BLAS operations and vendor
                 implementations.",
  acknowledgement = ack-nhfb,
}
@TechReport{Granat:2007:PER,
  author =       "Robert Granat and Bo K{\aa}gstr{\"o}m and Daniel
                 Kressner",
  title =        "Parallel Eigenvalue Reordering in Real {Schur} Forms",
  type =         "LAPACK Working Note",
  number =       "192",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        sep,
  year =         "2007",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn192.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Baboulin:2007:CCC,
  author =       "Marc Baboulin and Jack J. Dongarra and Serge Gratton
                 and Julien Langou",
  title =        "Computing the Conditioning of the Components of a
                 Linear Least Squares Solution",
  type =         "LAPACK Working Note",
  number =       "193",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        sep,
  year =         "2007",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-07-604",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn193.pdf",
  abstract =     "In this paper, we address the accuracy of the results
                 for the overdetermined full rank linear least squares
                 problem. We recall theoretical results obtained in [2]
                 on conditioning of the least squares solution and the
                 components of the solution when the matrix
                 perturbations are measured in Frobenius or spectral
                 norms. Then we define computable estimates for these
                 condition numbers and we interpret them in terms of
                 statistical quantities. In particular, we show that, in
                 the classical linear statistical model, the ratio of
                 the variance of one component of the solution by the
                 variance of the right-hand side is exactly the
                 condition number of this solution component when
                 perturbations on the right-hand side are considered. We
                 also provide fragment codes using LAPACK [1] routines
                 to compute the variance-covariance matrix and the least
                 squares conditioning and we give the corresponding
                 computational cost. Finally we present a small
                 historical numerical example that was used by Laplace
                 [19] for computing the mass of Jupiter and experiments
                 from the space industry with real physical data.",
  acknowledgement = ack-nhfb,
  keywords =     "condition number; LAPACK; Linear least squares;
                 parameter estimation; ScaLAPACK; statistical linear
                 least squares; variance-covariance matrix",
}
@TechReport{Vomel:2007:RRT,
  author =       "Christof V{\"o}mel",
  title =        "A Refined Representation Tree for {MRRR}",
  type =         "LAPACK Working Note",
  number =       "194",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        nov,
  year =         "2007",
  MRclass =      "65F15, 65Y15",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn194.pdf",
  abstract =     "In order to compute orthogonal eigenvectors of a
                 symmetric tridiagonal matrix without Gram--Schmidt
                 orthogonalization, the MRRR algorithm finds a shifted
                 {$ L D L^T $} factorization (representation) for each
                 eigenvalue such that the local eigenvalue is a
                 singleton, that is defined to high relative accuracy
                 and has a large relative gap.\par
                 MRRR's representation tree describes how, by successive
                 shifting and refinement, each eigenvalue becomes
                 relatively isolated. Its shape plays a crucial role for
                 complexity: deeper trees are associated with more
                 eigenvalue refinement to resolve clustering of
                 eigenvalues.\par
                 Motivated by recently observed deteriorating complexity
                 of the LAPACK 3.1 MRRR kernels for certain matrices of
                 large dimension, we here re-examine and refine the
                 representation tree concept.\par
                 We first describe the discovery of what we call a
                 spectrum peeling problem: even though the matrix at
                 hand might not have a spectrum with clusters within
                 clusters, the representation tree might still contain a
                 long chain of large nodes.\par
                 We then formulate a refined proposal for the
                 representation tree that aims at avoiding the
                 unwarranted work while preserving tight accuracy bounds
                 where possible. The trade-off between performance and
                 accuracy in our solution is discussed by practical
                 examples.",
  acknowledgement = ack-nhfb,
  keywords =     "complexity; LAPACK; MRRR; Multiple relatively robust
                 representations; representation tree; ScaLAPACK;
                 spectrum peeling",
}
@TechReport{Vomel:2007:SMA,
  author =       "Christof V{\"o}mel",
  title =        "{ScaLAPACK}'s {MRRR} Algorithm",
  type =         "LAPACK Working Note",
  number =       "195",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        nov,
  year =         "2007",
  MRclass =      "65F15, 65Y15",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn195.pdf",
  abstract =     "The sequential algorithm of Multiple Relatively Robust
                 Representations, MRRR, can compute numerically
                 orthogonal eigenvectors of an unreduced symmetric
                 tridiagonal matrix {$ T \in R^{n \times n} $} with
                 {$ O(n^2) $} cost.\par
                 This paper describes the design of ScaLAPACK's parallel
                 MRRR algorithm. One emphasis is on the critical role of
                 the representation tree in achieving both numerical
                 accuracy and parallel scalability. A second point
                 concerns the favorable properties of this code: subset
                 computation, the use of static memory, and
                 scalability.\par
                 Unlike ScaLAPACK's Divide \& Conquer and QR, MRRR can
                 compute subsets of eigenpairs at reduced cost. And in
                 contrast to inverse iteration which can fail, it is
                 guaranteed to produce a numerically satisfactory answer
                 while maintaining memory scalability.\par
                 ParEig, the parallel MRRR algorithm for PLAPACK, uses
                 dynamic memory allocation. This is avoided by our code
                 at marginal additional cost. We also use a different
                 representation tree criterion that allows for more
                 accurate computation of the eigenvectors but can make
                 parallelization more difficult.",
  acknowledgement = ack-nhfb,
  keywords =     "multiple relatively robust representations; numerical
                 software; ScaLAPACK; Symmetric eigenproblem",
}
@TechReport{Drmac:2007:GCP,
  author =       "Zlatko Drma{\v{c}}",
  title =        "A Global Convergence Proof of Cyclic {Jacobi} Methods
                 with Block Rotations",
  type =         "LAPACK Working Note",
  number =       "196",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  day =          "10",
  month =        dec,
  year =         "2007",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn196.pdf",
  abstract =     "This paper introduces a globally convergent block
                 (column- and row-) cyclic Jacobi method for
                 diagonalization of Hermitian matrices and for
                 computation of the singular value decomposition of
                 general matrices. It is shown that a block rotation
                 (generalization of the Jacobi's $ 2 \times 2 $
                 rotation) must be computed and implemented in a
                 particular way to guarantee global convergence. This
                 solves a long standing open problem of convergence of
                 block cyclic Jacobi methods. The proof includes the
                 convergence of the eigenspaces in the general case of
                 multiple eigenvalues.",
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 197.  The ucbnumber field records this report
%%% as UCB/EECS-2007-179, hence the institution and address macros
%%% below are the UCB EECS ones.
@TechReport{Volkov:2008:UGA,
  author =       "Vasily Volkov and James W. Demmel",
  title =        "Using {GPUs} to Accelerate the Bisection Algorithm for
                 Finding Eigenvalues of Symmetric Tridiagonal Matrices",
  type =         "LAPACK Working Note",
  number =       "197",
  institution =  inst-UCB-EECS,
  address =      inst-UCB-EECS:adr,
  month =        jan,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn197.pdf",
  abstract =     "Graphical Processing Units (GPUs) potentially promise
                 widespread and inexpensive high performance
                 computation. However, architectural limitations (only
                 some operations and memory access patterns can be
                 performed quickly, partial support for IEEE floating
                 point arithmetic) make it necessary to change existing
                 algorithms to attain high performance and correctness.
                 Here we show how to make the bisection algorithm for
                 eigenvalues of symmetric tridiagonal matrices (sstebz
                 from LAPACK) run both fast and correctly on an ATI
                 Radeon X1900 GPU. Our fastest algorithm takes up to
                 $ 156 \times $ less time than Intel's Math Kernel
                 Library version of sstebz running on the CPU, but does
                 so by doing many redundant floating point operations
                 compared to the CPU version. We use an automatic tuning
                 procedure analogous to ATLAS or PHiPAC to decide the
                 optimal redundancy. Correctness despite partial IEEE
                 floating point semantics required explicitly adding 0
                 in the inner loop. The problems and solutions discussed
                 here are of interest on other GPU architectures.",
  acknowledgement = ack-nhfb,
  ucbnumber =    "UCB/EECS-2007-179",
}
@TechReport{Kaagstrom:2008:BAR,
  author =       "Bo K{\aa}gstr{\"o}m and Daniel Kressner and Enrique S.
                 Quintana-Ort{\'\i} and Gregorio Quintana-Ort{\'\i}",
  title =        "Blocked Algorithms for the Reduction to
                 {Hessenberg}-Triangular Form Revisited",
  type =         "LAPACK Working Note",
  number =       "198",
  institution =  "Department of Computing Science and HPC2N, Ume{\aa}
                 University",
  address =      "S-901 Ume{\aa}, Sweden",
  month =        feb,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn198.pdf",
  abstract =     "We present two variants of Moler and Stewart's
                 algorithm for reducing a matrix pair to
                 Hessenberg-triangular (HT) form with increased data
                 locality in the access to the matrices. In one of these
                 variants, a careful reorganization and accumulation of
                 Givens rotations enables the use of efficient level 3
                 BLAS. Experimental results on four different
                 architectures, representative of current high
                 performance processors, compare the performances of the
                 new variants with those of the implementation of Moler
                 and Stewart's algorithm in subroutine DGGHRD from
                 LAPACK, Dackland and K{\aa}gstr{\"o}m's two-stage
                 algorithm for the HT form, and a modified version of
                 the latter which requires considerably less flops.",
  acknowledgement = ack-nhfb,
  keywords =     "blocked algorithms; Generalized eigenvalue problems;
                 Hessenberg-triangular form; high-performance computing;
                 level 3 BLAS; orthogonal transformations; QZ
                 algorithm",
}
@TechReport{Gustavson:2008:RFP,
  author =       "Fred G. Gustavson and Jerzy Wa{\'s}niewski and Jack J.
                 Dongarra and Julien Langou",
  title =        "Rectangular Full Packed Format for {Cholesky}'s
                 Algorithm: Factorization, Solution and Inversion",
  type =         "LAPACK Working Note",
  number =       "199",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        apr,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-08-614",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn199.pdf",
  abstract =     "We describe a new data format for storing triangular,
                 symmetric, and Hermitian matrices called RFPF
                 (Rectangular Full Packed Format). The standard two
                 dimensional arrays of Fortran and C (also known as full
                 format) that are used to represent triangular and
                 symmetric matrices waste nearly half of the storage
                 space but provide high performance via the use of Level
                 3 BLAS. Standard packed format arrays fully utilize
                 storage (array space) but provide low performance as
                 there is no Level 3 packed BLAS. We combine the good
                 features of packed and full storage using RFPF to
                 obtain high performance via using Level 3 BLAS as RFPF
                 is a standard full format representation. Also, RFPF
                 requires exactly the same minimal storage as packed
                 format. Each LAPACK full and/or packed triangular,
                 symmetric, and Hermitian routine becomes a single new
                 RFPF routine based on eight possible data layouts of
                 RFPF. This new RFPF routine usually consists of two
                 calls to the corresponding LAPACK full format routine
                 and two calls to Level 3 BLAS routines. This means no
                 new software is required. As examples, we present
                 LAPACK routines for Cholesky factorization, Cholesky
                 solution and Cholesky inverse computation in RFPF to
                 illustrate this new work and to describe its
                 performance on several commonly used computer
                 platforms. Performance of LAPACK full routines using
                 RFPF versus LAPACK full routines using standard format
                 for both serial and SMP parallel processing is about
                 the same while using half the storage. Performance
                 gains are roughly one to a factor of 43 for serial and
                 one to a factor of 97 for SMP parallel times faster
                 using vendor LAPACK full routines with RFPF than with
                 using vendor and/or reference packed routines.",
  acknowledgement = ack-nhfb,
  keywords =     "Algorithms; BLAS; Linear Algebra Libraries;
                 Performance",
  subject =      "G.1.3 [Numerical Analysis]: Numerical Linear Algebra -
                 Linear Systems (symmetric and Hermitian); G.4
                 [Mathematics of Computing]: Mathematical Software",
}
@TechReport{Baboulin:2008:SID,
  author =       "Marc Baboulin and Jack J. Dongarra and Stanimire
                 Tomov",
  title =        "Some Issues in Dense Linear Algebra for Multicore and
                 Special Purpose Architectures",
  type =         "LAPACK Working Note",
  number =       "200",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        may,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-08-615",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn200.pdf",
  abstract =     "We address some key issues in designing dense linear
                 algebra (DLA) algorithms that are common for both
                 multi/many-cores and special purpose architectures (in
                 particular GPUs). We present them in the context of an
                 LU factorization algorithm, where randomization
                 techniques are used as an alternative to pivoting. This
                 approach yields an algorithm based entirely on a
                 collection of small Level 3 BLAS type computational
                 tasks, which has emerged as a common goal in designing
                 DLA algorithms for new architectures. Other common
                 trends, also considered here, are block asynchronous
                 task execution and ``Block'' layouts for the data
                 associated with the separate tasks. We present
                 numerical results and other specific experiments with
                 DLA algorithms on NVIDIA GPUs using CUDA. The GPU
                 results are also of interest themselves as we show a
                 performance of up to 160 Gflop/s on a single Quadro FX
                 5600 card.",
  acknowledgement = ack-nhfb,
  keywords =     "dense linear algebra; graphic process units; LU
                 factorization; multicore processors; parallel
                 algorithms",
}
@techreport{Kurzak:2008:QFC,
  author          = {Jakub Kurzak and Jack J. Dongarra},
  title           = {{$ Q R $} Factorization for the {CELL} Processor},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  type            = {LAPACK Working Note},
  number          = {201},
  note            = {UT-CS-08-616},
  month           = may,
  year            = {2008},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn201.pdf},
  keywords        = {CELL processor; linear algebra; matrix factorization;
                     multi-core; numerical algorithms},
  abstract        = {The QR factorization is one of the most important operations
                     in dense linear algebra, offering a numerically stable
                     method for solving linear systems of equations including
                     overdetermined and underdetermined systems. Classic
                     implementation of the QR factorization suffers from
                     performance limitations due to the use of matrix-vector type
                     operations in the phase of panel factorization. These
                     limitations can be remedied by using the idea of updating of
                     QR factorization, rendering an algorithm, which is much more
                     scalable and much more suitable for implementation on a
                     multi-core processor. It is demonstrated how the potential
                     of the CELL processor can be utilized to the fullest by
                     employing the new algorithmic approach and successfully
                     exploiting the capabilities of the CELL processor in terms
                     of Instruction Level Parallelism and Thread-Level
                     Parallelism.},
  acknowledgement = ack-nhfb,
  bibdate         = {Fri Apr 24 12:25:43 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@techreport{Volkov:2008:LQC,
  author          = {Vasily Volkov and James W. Demmel},
  title           = {{$ L U $}, {$ Q R $} and {Cholesky} Factorizations using
                     Vector Capabilities of {GPUs}},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  type            = {LAPACK Working Note},
  number          = {202},
  ucbnumber       = {UCB/EECS-2008-49},
  month           = may,
  year            = {2008},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn202.pdf},
  abstract        = {We present performance results for dense linear algebra
                     using the 8-series NVIDIA GPUs. Our matrix-matrix multiply
                     routine (GEMM) runs 60\% faster than the vendor
                     implementation in CUBLAS 1.1 and approaches the peak of
                     hardware capabilities. Our LU, QR and Cholesky
                     factorizations achieve up to 80--90\% of the peak GEMM rate.
                     Our parallel LU running on two GPUs achieves up to
                     $ \approx $300 Gflop/s. These results are accomplished by
                     challenging the accepted view of the GPU architecture and
                     programming guidelines. We argue that modern GPUs should be
                     viewed as multithreaded multicore vector units. We exploit
                     blocking similarly to vector computers and heterogeneity of
                     the system by computing both on GPU and CPU. This study
                     includes detailed benchmarking of the GPU memory system that
                     reveals sizes and latencies of caches and TLB. We present a
                     couple of algorithmic optimizations aimed at increasing
                     parallelism and regularity in the problem that provide us
                     with slightly higher performance.},
  acknowledgement = ack-nhfb,
  bibdate         = {Fri Apr 24 12:25:43 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@techreport{Demmel:2008:NND,
  author          = {James W. Demmel and Yozo Hida and Mark F. Hoemmen and
                     E. Jason Riedy},
  title           = {Non-Negative Diagonals and High Performance on Low-Profile
                     Matrices from Householder {$ Q R $}},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  type            = {LAPACK Working Note},
  number          = {203},
  ucbnumber       = {UCB/EECS-2008-76},
  day             = {30},
  month           = may,
  year            = {2008},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn203.pdf},
  abstract        = {The Householder reflections used in LAPACK's {$ Q R $}
                     factorization leave positive and negative real entries along
                     {$R$}'s diagonal. This is sufficient for most applications
                     of {$ Q R $} factorizations, but a few require that {$R$}
                     have a non-negative diagonal. This note provides a new
                     Householder generation routine to produce a non-negative
                     diagonal. Additionally, we find that scanning for trailing
                     zeros in the generated reflections leads to large
                     performance improvements when applying reflections with many
                     trailing zeros. Factoring low-profile matrices, those with
                     non-zero entries mostly near the diagonal (e.g. band
                     matrices), now requires far fewer operations. For example,
                     {$ Q R $} factorization of matrices with profile width $b$
                     that are stored densely in an $ n \times n $ matrix improves
                     from {$ O(n^3) $} to {$ O(n^2 + n b^2) $}.},
  acknowledgement = ack-nhfb,
  bibdate         = {Fri Apr 24 12:25:43 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@TechReport{Demmel:2008:COP,
  author =       "James W. Demmel and Laura Grigori and Mark F. Hoemmen
                 and Julien Langou",
  title =        "Communication-optimal parallel and sequential {$ Q R
                 $} and {$ L U $} factorizations",
  type =         "LAPACK Working Note",
  number =       "204",
  institution =  inst-UCB-EECS,
  address =      inst-UCB-EECS:adr,
  month =        aug,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.eecs.berkeley.edu/Pubs/TechRpts/2008/EECS-2008-89.html;
                 http://www.netlib.org/lapack/lawnspdf/lawn204.pdf",
  abstract =     "We present parallel and sequential dense QR
                 factorization algorithms that are both optimal (up to
                 polylogarithmic factors) in the amount of communication
                 they perform, and just as stable as Householder QR. Our
                 first algorithm, Tall Skinny QR (TSQR), factors $ m
                 \times n $ matrices in a one-dimensional (1-D) block
                 cyclic row layout, and is optimized for $ m \gg n $.
                 Our second algorithm, CAQR (Communication-Avoiding QR),
                 factors general rectangular matrices distributed in a
                 two-dimensional block cyclic layout. It invokes TSQR
                 for each block column factorization.\par
                 The new algorithms are superior in both theory and
                 practice. We have extended known lower bounds on
                 communication for sequential and parallel matrix
                 multiplication to provide latency lower bounds, and
                 show these bounds apply to the LU and QR
                 decompositions. We not only show that our QR algorithms
                 attain these lower bounds (up to polylogarithmic
                 factors), but that existing LAPACK and ScaLAPACK
                 algorithms perform asymptotically more communication.
                 We also point out recent LU algorithms in the
                 literature that attain at least some of these lower
                 bounds.\par
                 Both TSQR and CAQR have asymptotically lower latency
                 cost in the parallel case, and asymptotically lower
                 latency and bandwidth costs in the sequential case. In
                 practice, we have implemented parallel TSQR on several
                 machines, with speedups of up to $ 6.7 \times $ on 16
                 processors of a Pentium III cluster, and up to $ 4
                 \times $ on 32 processors of a BlueGene/L. We have also
                 implemented sequential TSQR on a laptop for matrices
                 that do not fit in DRAM, so that slow memory is disk.
                 Our out-of-DRAM implementation was as little as $ 2
                 \times $ slower than the predicted runtime as though
                 DRAM were infinite.\par
                 We have also modeled the performance of our parallel
                 CAQR algorithm, yielding predicted speedups over
                 ScaLAPACK's PDGEQRF of up to $ 9.7 \times $ on an IBM
                 Power5, up to $ 22.9 \times $ on a model Petascale
                 machine, and up to $ 5.3 \times $ on a model of the
                 Grid.",
  acknowledgement = ack-nhfb,
  ucbnumber =    "UCB/EECS-2008-89",
}
@techreport{Bosilca:2008:ABF,
  author          = {George Bosilca and Remi Delmas and Jack J. Dongarra and
                     Julien Langou},
  title           = {Algorithmic Based Fault Tolerance Applied to High
                     Performance Computing},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  type            = {LAPACK Working Note},
  number          = {205},
  note            = {UT-CS-08-620},
  day             = {23},
  month           = may,
  year            = {2008},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn205.pdf},
  abstract        = {We present a new approach to fault tolerance for High
                     Performance Computing system. Our approach is based on a
                     careful adaptation of the Algorithmic Based Fault Tolerance
                     technique (Huang and Abraham, 1984) to the need of parallel
                     distributed computation. We obtain a strongly scalable
                     mechanism for fault tolerance. We can also detect and
                     correct errors (bit-flip) on the fly of a computation. To
                     assess the viability of our approach, we have developed a
                     fault tolerant matrix-matrix multiplication subroutine and
                     we propose some models to predict its running time. Our
                     parallel fault-tolerant matrix-matrix multiplication scores
                     1.4 TFLOPS on 484 processors (cluster
                     {\tt jacquard.nersc.gov}) and returns a correct result while
                     one process failure has happened. This represents 65\% of
                     the machine peak efficiency and less than 12\% overhead with
                     respect to the fastest failure-free implementation. We
                     predict (and have observed) that, as we increase the
                     processor count, the overhead of the fault tolerance drops
                     significantly.},
  acknowledgement = ack-nhfb,
  bibdate         = {Fri Apr 24 12:25:43 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@techreport{Dongarra:2008:PLB,
  author          = {Jack J. Dongarra and Julien Langou},
  title           = {The Problem with the {Linpack} Benchmark Matrix Generator},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  type            = {LAPACK Working Note},
  number          = {206},
  ucdenvernumber  = {UCD-CCM-271},
  day             = {12},
  month           = jun,
  year            = {2008},
  note            = {Version 1; version 2 is dated 18 September 2008.},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn206.pdf},
  abstract        = {We characterize the matrix sizes for which the Linpack
                     Benchmark 1.0 matrix generator constructs a matrix with
                     identical columns.},
  acknowledgement = ack-nhfb,
  bibdate         = {Fri Apr 24 12:25:43 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@techreport{Baboulin:2008:UDT,
  author          = {Marc Baboulin and Serge Gratton},
  title           = {Using dual techniques to derive componentwise and mixed
                     condition numbers for a linear functional of a linear least
                     squares solution},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  type            = {LAPACK Working Note},
  number          = {207},
  note            = {UT-CS-08-622},
  month           = aug,
  year            = {2008},
  mrclass         = {65F35},
  url             = {http://www.netlib.org/lapack/lawnspdf/lawn207.pdf},
  keywords        = {adjoint operator; componentwise perturbations; condition
                     number; Dual norm; linear least squares},
  abstract        = {We prove duality results for adjoint operators and product
                     norms in the framework of Euclidean spaces. We show how
                     these results can be used to derive condition numbers
                     especially when perturbations on data are measured
                     componentwise relatively to the original data. We apply this
                     technique to obtain formulas for componentwise and mixed
                     condition numbers for a linear functional of a linear least
                     squares solution. These expressions are closed when
                     perturbations of the solution are measured using a
                     componentwise norm or the infinity norm and we get an upper
                     bound for the Euclidean norm.},
  acknowledgement = ack-nhfb,
  bibdate         = {Fri Apr 24 12:25:43 2009},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
}
@TechReport{Ltaief:2008:PBH,
  author =       "Hatem Ltaief and Jakub Kurzak and Jack J. Dongarra",
  title =        "Parallel Block {Hessenberg} Reduction using
                 Algorithms-By-Tiles for Multicore Architectures
                 Revisited",
  type =         "LAPACK Working Note",
  number =       "208",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        aug,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn208.pdf",
  abstract =     "The objective of this paper is to extend and redesign
                 the block matrix reduction applied for the family of
                 two-sided factorizations, introduced by Dongarra et al.
                 [9], to the context of multicore architectures using
                 algorithms-by-tiles. In particular, the Block
                 Hessenberg Reduction is very often used as a
                 pre-processing step in solving dense linear algebra
                 problems, such as the standard eigenvalue problem.
                 Although expensive, orthogonal transformations are
                 commonly used for this reduction because they guarantee
                 stability, as opposed to Gaussian Elimination. Two
                 versions of the Block Hessenberg Reduction are
                 presented in this paper, the first one with Householder
                 reflectors and the second one with Givens rotations. A
                 short investigation on variants of Fast Givens
                 Rotations is also mentioned. Furthermore, in the last
                 Top500 list from June 2008, 98\% of the fastest
                 parallel systems in the world are based on multicores.
                 The emerging petascale systems consisting of hundreds
                 of thousands of cores have exacerbated the problem even
                 more and it becomes judicious to efficiently integrate
                 existing or new numerical linear algebra algorithms
                 suitable for such hardware. By exploiting the concepts
                 of algorithms-by-tiles in the multicore environment
                 (i.e., high level of parallelism with fine granularity
                 and high performance data representation combined with
                 a dynamic data driven execution), the Block Hessenberg
                 Reduction presented here achieves 72\% of the DGEMM
                 peak on a $ 12000 \times 12000 $ matrix with 16 Intel
                 Tigerton 2.4 GHz processors.",
  acknowledgement = ack-nhfb,
  note =         "UT-CS-08-624",
}
@TechReport{Ltaief:2008:PBT,
  author =       "Hatem Ltaief and Jakub Kurzak and Jack J. Dongarra",
  title =        "Parallel Band Two-Sided Matrix Bidiagonalization for
                 Multicore Architectures",
  type =         "LAPACK Working Note",
  number =       "209",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        oct,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn209.pdf",
  abstract =     "The objective of this paper is to extend, in the
                 context of multicore architectures, the concepts of
                 algorithms-by-tiles [Buttari et al., 2007] for
                 Cholesky, LU, QR factorizations to the family of
                 two-sided factorizations. In particular, the bidiagonal
                 reduction of a general, dense matrix is very often used
                 as a pre-processing step for calculating the singular
                 value decomposition. Furthermore, in the last Top500
                 list from June 2008, 98\% of the fastest parallel
                 systems in the world were based on multicores. The
                 manycore trend has increasingly exacerbated the
                 problem, and it becomes critical to efficiently
                 integrate existing or new numerical linear algebra
                 algorithms suitable for such hardware. By exploiting
                 the concept of algorithms-by-tiles in the multicore
                 environment (i.e., high level of parallelism with fine
                 granularity and high performance data representation
                 combined with a dynamic data driven execution), the
                 band bidiagonal reduction presented here achieves 94
                 Gflop/s on a $ 12000 \times 12000 $ matrix with 16
                 Intel Tigerton 2.4 GHz processors.",
  acknowledgement = ack-nhfb,
  note =         "UT-CS-08-631",
}
@TechReport{Tomov:2008:TDL,
  author =       "Stanimire Tomov and Jack J. Dongarra and Marc
                 Baboulin",
  title =        "Towards Dense Linear Algebra for Hybrid {GPU}
                 Accelerated Manycore Systems",
  type =         "LAPACK Working Note",
  number =       "210",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        oct,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn210.pdf",
  abstract =     "If multicore is a disruptive technology, try to
                 imagine hybrid multicore systems enhanced with
                 accelerators! This is happening today as accelerators,
                 in particular Graphics Processing Units (GPUs), are
                 steadily making their way into the high performance
                 computing (HPC) world. We highlight the trends leading
                 to the idea of hybrid manycore/GPU systems, and we
                 present a set of techniques that can be used to
                 efficiently program them. The presentation is in the
                 context of Dense Linear Algebra (DLA), a major building
                 block for many scientific computing applications. We
                 motivate the need for new algorithms that would split
                 the computation in a way that would fully exploit the
                 power that each of the hybrid components offers. As the
                 area of hybrid multicore/GPU computing is still in its
                 infancy, we also argue for its importance in view of
                 what future architectures may look like. We therefore
                 envision the need for a DLA library similar to LAPACK
                 but for hybrid manycore/GPU systems. We illustrate the
                 main ideas with an LU-factorization algorithm where
                 particular techniques are used to reduce the amount of
                 pivoting, resulting in an algorithm achieving up to 388
                 GFlop/s for single and up to 99.4 GFlop/s for double
                 precision factorization on a hybrid Intel Xeon (2x4
                 cores @ 2.33 GHz) --- NVIDIA GeForce GTX 280 (240
                 cores @ 1.30 GHz) system.",
  acknowledgement = ack-nhfb,
  keywords =     "dense linear algebra; graphics processing units;
                 hybrid computing; LU factorization; multicore
                 processors; parallel algorithms",
  note =         "UT-CS-08-632",
}
@TechReport{Gustavson:2008:LCK,
  author =       "Fred G. Gustavson and Jerzy Wa{\'s}niewski and Jack J.
                 Dongarra",
  title =        "Level-3 {Cholesky} kernel subroutine of a fully
                 portable High Performance minimal storage hybrid format
                 {Cholesky} algorithm",
  type =         "LAPACK Working Note",
  number =       "211",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        dec,
  year =         "2008",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn211.pdf",
  acknowledgement = ack-nhfb,
  note =         "UT-CS-08-634",
}
@TechReport{Li:2009:NAT,
  author =       "Yinan Li and Jack J. Dongarra and Stanimire Tomov",
  title =        "A Note on Auto-tuning {GEMM} for {GPUs}",
  type =         "LAPACK Working Note",
  number =       "212",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        jan,
  year =         "2009",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn212.pdf",
  abstract =     "The development of high performance dense linear
                 algebra (DLA) critically depends on highly optimized
                 BLAS, and especially on the matrix multiplication
                 routine (GEMM). This is especially true for Graphics
                 Processing Units (GPUs), as evidenced by recently
                 published results on DLA for GPUs that rely on highly
                 optimized GEMM [13, 11]. However, the current best GEMM
                 performance, e.g. of up to 375 GFlop/s in single
                 precision and of up to 75 GFlop/s in double precision
                 arithmetic on NVIDIA's GTX 280, is difficult to
                 achieve. The development involves extensive GPU
                 knowledge and even backward engineering to understand
                 some undocumented insides about the architecture that
                 have been of key importance in the development [12]. In
                 this paper, we describe some GPU GEMM auto-tuning
                 optimization techniques that allow us to keep up with
                 changing hardware by rapidly reusing, rather than
                 reinventing, the existing ideas. Auto-tuning, as we
                 show in this paper, is a very practical solution where
                 in addition to getting an easy portability, we can
                 often get substantial speedups even on current GPUs
                 (e.g. up to 27\% in certain cases for both single and
                 double precision GEMMs on the GTX 280).",
  acknowledgement = ack-nhfb,
  keywords =     "Auto-tuning; dense linear algebra; GPUs; matrix
                 multiply",
  note =         "UT-CS-09-635",
}
@TechReport{Kurzak:2009:SLA,
  author =       "Jakub Kurzak and Hatem Ltaief and Jack J. Dongarra and
                 Rosa M. Badia",
  title =        "Scheduling Linear Algebra Operations on Multicore
                 Processors",
  type =         "LAPACK Working Note",
  number =       "213",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        feb,
  year =         "2009",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn213.pdf",
  abstract =     "State-of-the-art dense linear algebra software, such
                 as the LAPACK and ScaLAPACK libraries, suffer
                 performance losses on multicore processors due to their
                 inability to fully exploit thread-level parallelism. At
                 the same time the coarse-grain dataflow model gains
                 popularity as a paradigm for programming multicore
                 architectures. This work looks at implementing classic
                 dense linear algebra workloads, Cholesky factorization,
                 QR factorization and LU factorization, using dynamic
                 data-driven execution. Two emerging approaches to
                 implementing coarse-grain dataflow are examined, the
                 model of nested parallelism, represented by the Cilk
                 framework, and the model of parallelism expressed
                 through an arbitrary Direct Acyclic Graph, represented
                 by the SMP Superscalar framework. Performance and
                 coding effort are analyzed and compared against code
                 manually parallelized at the thread level.",
  acknowledgement = ack-nhfb,
  keywords =     "Cholesky; factorization; linear algebra; LU;
                 multicore; QR; scheduling; task graph",
  note =         "UT-CS-09-636",
}
@TechReport{Kurzak:2009:STS,
  author =       "Jakub Kurzak and Hatem Ltaief and Jack J. Dongarra",
  title =        "Scheduling Two-sided Transformations using
                 Algorithms-by-Tiles on Multicore Architectures",
  type =         "LAPACK Working Note",
  number =       "214",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        feb,
  year =         "2009",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn214.pdf",
  abstract =     "The objective of this paper is to describe, in the
                 context of multicore architectures, different scheduler
                 implementations for the two-sided linear algebra
                 transformations, in particular the Hessenberg and
                 Bidiagonal reductions which are the first steps for the
                 standard eigenvalue problems and the singular value
                 decompositions respectively. State-of-the-art dense
                 linear algebra software, such as the LAPACK and
                 ScaLAPACK libraries, suffer performance losses on
                 multicore processors due to their inability to fully
                 exploit thread-level parallelism. At the same time the
                 coarse-grain dataflow model gains popularity as a
                 paradigm for programming multicore architectures. By
                 using the concepts of algorithms-by-tiles [Buttari et
                 al., 2007] along with efficient mechanisms for
                 data-driven execution, these two-sided reductions
                 achieve high performance computing. The main drawback
                 of the algorithms-by-tiles approach for two-sided
                 transformations is that the full reduction can not be
                 obtained in one stage. Other methods have to be
                 considered to further reduce the band matrices to the
                 required forms.",
  acknowledgement = ack-nhfb,
  note =         "UT-CS-09-637",
}
@TechReport{Ballard:2009:COP,
  author =       "Grey Ballard and James W. Demmel and Olga Holtz and
                 Oded Schwartz",
  title =        "Communication-optimal Parallel and Sequential
                 {Cholesky} decomposition",
  type =         "LAPACK Working Note",
  number =       "215",
  institution =  inst-UCB-EECS,
  address =      inst-UCB-EECS:adr,
  day =          "13",
  month =        feb,
  year =         "2009",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn215.pdf",
  abstract =     "Numerical algorithms have two kinds of costs:
                 arithmetic and communication, by which we mean either
                 moving data between levels of a memory hierarchy (in
                 the sequential case) or over a network connecting
                 processors (in the parallel case). Communication costs
                 often dominate arithmetic costs, so it is of interest
                 to design algorithms minimizing communication. In this
                 paper we first extend known lower bounds on the
                 communication cost (both for bandwidth and for latency)
                 of conventional ({$ O(n^3) $}) matrix multiplication to
                 Cholesky factorization, which is used for solving dense
                 symmetric positive definite linear systems. Second, we
                 compare the cost of various Cholesky decomposition
                 implementations to this lower bound, and draw the
                 following conclusions:\par
                 \begin{itemize} \item ``Naive'' sequential algorithms
                 for Cholesky attain neither the bandwidth nor latency
                 lower bounds. \item The sequential blocked algorithm in
                 LAPACK (with the right block size), as well as various
                 recursive algorithms [AP00, GJ01, AGW01, ST04], and one
                 based on work of Toledo [Tol97], can attain the
                 bandwidth lower bound. \item The LAPACK algorithm can
                 also attain the latency bound if used with blocked data
                 structures rather than column-wise or row-wise matrix
                 data structures, though the Toledo algorithm cannot.
                 \item The recursive sequential algorithm due to [AP00]
                 attains the bandwidth and latency lower bounds at every
                 level of a multi-level memory hierarchy, in a
                 `cache-oblivious' way. \item The parallel
                 implementation of Cholesky in the ScaLAPACK library
                 (again with the right block-size) attains both the
                 bandwidth and latency lower bounds to within a
                 polylogarithmic factor. \end{itemize}
                 Combined with prior results in [DGHL08a, DGHL08b,
                 DGX08] this gives a complete set of
                 communication-optimal algorithms for {$ O(n^3) $}
                 implementations of three basic factorizations of dense
                 linear algebra: LU with pivoting, QR and Cholesky. But
                 it goes beyond this prior work on sequential LU and QR
                 by optimizing communication for any number of levels of
                 memory hierarchy.",
  acknowledgement = ack-nhfb,
  ucbnumber =    "UCB/EECS-2009-29",
}
@TechReport{Granat:2009:NPQ,
  author =       "Robert Granat and Bo K{\aa}gstr{\"o}m and Daniel
                 Kressner",
  title =        "A novel parallel {$ Q R $} algorithm for hybrid
                 distributed memory {HPC} systems",
  type =         "LAPACK Working Note",
  number =       "216",
  institution =  "Department of Computing Science and HPC2N, Ume{\aa}
                 University",
  address =      "S-901 Ume{\aa}, Sweden",
  month =        apr,
  year =         "2009",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn216.pdf",
  abstract =     "A novel variant of the parallel QR algorithm for
                 solving dense nonsymmetric eigenvalue problems on
                 hybrid distributed high performance computing (HPC)
                 systems is presented. For this purpose, we introduce
                 the concept of multi-window bulge chain chasing and
                 parallelize aggressive early deflation. The
                 multi-window approach ensures that most computations
                 when chasing chains of bulges are performed in level 3
                 BLAS operations, while the aim of aggressive early
                 deflation is to speed up the convergence of the QR
                 algorithm. Mixed MPI-OpenMP coding techniques are
                 utilized for porting the codes to distributed memory
                 platforms with multithreaded nodes, such as multicore
                 processors. Numerous numerical experiments confirm the
                 superior performance of our parallel QR algorithm in
                 comparison with the existing ScaLAPACK code, leading to
                 an implementation that is one to two orders of
                 magnitude faster for sufficiently large problems,
                 including a number of examples from applications.",
  acknowledgement = ack-nhfb,
  keywords =     "aggressive early deflation; bulge chasing; eigenvalue
                 problem; hybrid distributed memory systems; level 3
                 performance; multishift; nonsymmetric QR algorithm;
                 parallel algorithms; parallel computations",
  note =         "UMINF-09.06",
}
@TechReport{Agullo:2009:CSO,
  author          = {Emmanuel Agullo and Bilel Hadri and
                     Hatem Ltaief and Jack Dongarra},
  title           = {Comparative Study of One-Sided Factorizations
                     with Multiple Software Packages on Multi-Core
                     Hardware},
  type            = {LAPACK Working Note},
  number          = {217},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {28},
  month           = apr,
  year            = {2009},
  note            = {UT-CS-09-640.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn217.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Ballard:2009:MCL,
  author          = {Grey Ballard and James Demmel and
                     Olga Holtz and Oded Schwartz},
  title           = {Minimizing Communication in Linear Algebra},
  type            = {LAPACK Working Note},
  number          = {218},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  day             = {15},
  month           = may,
  year            = {2009},
  note            = {UCB/EECS-2009-62},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn218.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Tomov:2009:ARU,
  author          = {Stanimire Tomov and Jack Dongarra},
  title           = {Accelerating the reduction to upper
                     {Hessenberg} form through hybrid {GPU}-based
                     computing},
  type            = {LAPACK Working Note},
  number          = {219},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {24},
  month           = may,
  year            = {2009},
  note            = {UT-CS-09-642.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn219.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Kurzak:2009:FDS,
  author          = {Jakub Kurzak and Jack Dongarra},
  title           = {Fully Dynamic Scheduler for Numerical
                     Computing on Multicore Processors},
  type            = {LAPACK Working Note},
  number          = {220},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {4},
  month           = jun,
  year            = {2009},
  note            = {UT-CS-09-643.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn220.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Song:2009:DTS,
  author          = {Fengguang Song and Asim YarKhan and
                     Jack Dongarra},
  title           = {Dynamic Task Scheduling for Linear Algebra
                     Algorithms on Distributed-Memory Multicore
                     Systems},
  type            = {LAPACK Working Note},
  number          = {221},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {13},
  month           = apr,
  year            = {2009},
  note            = {UT-CS-09-638.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn221.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Hadri:2009:EPT,
  author          = {Bilel Hadri and Hatem Ltaief and
                     Emmanuel Agullo and Jack Dongarra},
  title           = {Enhancing Parallelism of Tile {$ Q R $}
                     Factorization for Multicore Architectures},
  type            = {LAPACK Working Note},
  number          = {222},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {4},
  month           = sep,
  year            = {2009},
  note            = {UT-CS-09-645.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn222.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Ltaief:2009:SHP,
  author          = {Hatem Ltaief and Stanimire Tomov and
                     Rajib Nath and Peng Du and Jack Dongarra},
  title           = {A Scalable High Performant {Cholesky}
                     Factorization for Multicore with {GPU}
                     Accelerators},
  type            = {LAPACK Working Note},
  number          = {223},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {25},
  month           = nov,
  year            = {2009},
  note            = {UT-CS-09-646.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn223.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Agullo:2010:QFT,
  author =       "Emmanuel Agullo and Camille Coti and Jack Dongarra and
                 Thomas Herault and Julien Langou",
  title =        "{$ Q R $} Factorization of Tall and Skinny Matrices in
                 a Grid Computing Environment",
  type =         "LAPACK Working Note",
  number =       "224",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  day =          "6",
  month =        apr,
  year =         "2010",
  bibdate =      "Wed Aug 24 12:36:41 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-10-651. Published in the Proceedings of IPDPS
                 2010: 24th IEEE International Parallel and Distributed
                 Processing Symposium, Atlanta, GA, April 2010.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn224.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Tomov:2010:DLA,
  author =       "Stanimire Tomov and Rajib Nath and Hatem Ltaief and
                 Jack Dongarra",
  title =        "Dense Linear Algebra Solvers for Multicore with {GPU}
                 Accelerators",
  type =         "LAPACK Working Note",
  number =       "225",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  day =          "18",
  month =        apr,
  year =         "2010",
  bibdate =      "Wed Aug 24 12:36:41 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-09-649. Published in the Proceedings of IPDPS
                 2010: 24th IEEE International Parallel and Distributed
                 Processing Symposium, Atlanta, GA, April 2010.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn225.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Grigori:2010:CCO,
  author          = {Laura Grigori and James W. Demmel and
                     Hua Xiang},
  title           = {{CALU}: a communication optimal
                     {$ L U $} factorization algorithm},
  type            = {LAPACK Working Note},
  number          = {226},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  day             = {15},
  month           = mar,
  year            = {2010},
  note            = {UCB/EECS-2010-29. Submitted to SIAM Journal on
                     Matrix Analysis and Applications (SIMAX).},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn226.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Nath:2010:IMG,
  author          = {Rajib Nath and Stanimire Tomov and
                     Jack Dongarra},
  title           = {An Improved {MAGMA GEMM} for {Fermi GPUs}},
  type            = {LAPACK Working Note},
  number          = {227},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {29},
  month           = jul,
  year            = {2010},
  note            = {UT-CS-10-655.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn227.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Du:2010:COT,
  author          = {Peng Du and Rick Weber and Piotr Luszczek and
                     Stanimire Tomov and Gregory Peterson and
                     Jack Dongarra},
  title           = {From {CUDA} to {OpenCL}: Towards a
                     Performance-portable Solution for
                     Multi-platform {GPU} Programming},
  type            = {LAPACK Working Note},
  number          = {228},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {6},
  month           = sep,
  year            = {2010},
  note            = {UT-CS-10-656.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn228.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Kurzak:2010:ITF,
  author          = {Jakub Kurzak and Rajib Nath and Peng Du and
                     Jack Dongarra},
  title           = {An Implementation of the Tile {$ Q R $}
                     Factorization for a {GPU} and Multiple {CPUs}},
  type            = {LAPACK Working Note},
  number          = {229},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {15},
  month           = sep,
  year            = {2010},
  note            = {UT-CS-10-657. Submitted to PARA'10},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn229.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Agullo:2010:FCB,
  author          = {Emmanuel Agullo and Cedric Augonnet and
                     Jack Dongarra and Hatem Ltaief and
                     Raymond Namyst and Samuel Thibault and
                     Stanimire Tomov},
  title           = {Faster, Cheaper, Better --- a Hybridization
                     Methodology to Develop Linear Algebra Software
                     for {GPUs}},
  type            = {LAPACK Working Note},
  number          = {230},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {15},
  month           = sep,
  year            = {2010},
  note            = {UT-CS-10-658. To appear in GPU Computing GEMs,
                     vol. 2},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn230.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Bosilca:2010:DGD,
  author          = {G. Bosilca and A. Bouteiller and A. Danalis and
                     T. Herault and P. Lemarinier and J. Dongarra},
  title           = {{DAGuE}: {A} generic distributed {DAG} engine
                     for high performance computing},
  type            = {LAPACK Working Note},
  number          = {231},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {15},
  month           = sep,
  year            = {2010},
  note            = {UT-CS-10-659.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn231.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Bosilca:2010:DMT,
  author =       "G. Bosilca and A. Bouteiller and A. Danalis and M.
                 Faverge and H. Haidar and T. Herault and J. Kurzak and
                 J. Langou and P. Lemarinier and H. Ltaief and P.
                 Luszczek and A. YarKhan and J. Dongarra",
  title =        "Distributed-Memory Task Execution and Dependence
                 Tracking within {DAGuE} and the {DPLASMA Project}",
  type =         "LAPACK Working Note",
  number =       "232",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  day =          "15",
  month =        sep,
  year =         "2010",
  bibdate =      "Wed Aug 24 12:36:41 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-10-660.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn232.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Agullo:2010:FMN,
  author =       "Emmanuel Agullo and Cedric Augonnet and Jack Dongarra
                 and Mathieu Faverge and Hatem Ltaief and Samuel
                 Thibault and Stanimire Tomov",
  title =        "{$ Q R $} Factorization on a Multicore Node Enhanced
                 with Multiple {GPU} Accelerators",
  type =         "LAPACK Working Note",
  number =       "233",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        oct,
  year =         "2010",
  bibdate =      "Wed Aug 24 12:36:41 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-10-XXX, published in Proceedings of IPDPS
                 2011.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn233.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Dongarra:2010:RTT,
  author          = {Jack Dongarra and Piotr Luszczek},
  title           = {Reducing the time to tune parallel dense
                     linear algebra routines with partial execution
                     and performance modelling},
  type            = {LAPACK Working Note},
  number          = {235},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {8},
  month           = oct,
  year            = {2010},
  note            = {UT-CS-10-661.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn235.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Baboulin:2010:CCT,
  author          = {Marc Baboulin and Serge Gratton},
  title           = {A contribution to the conditioning of the
                     total least squares problem},
  type            = {LAPACK Working Note},
  number          = {236},
  institution     = inst-INRIA,
  address         = inst-INRIA:adr,
  day             = {5},
  month           = nov,
  year            = {2010},
  note            = {INRIA report.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn236.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Ballard:2010:MCE,
  author          = {Grey Ballard and James Demmel and
                     Ioana Dumitriu},
  title           = {Minimizing Communication for Eigenproblems
                     and the Singular Value Decomposition},
  type            = {LAPACK Working Note},
  number          = {237},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  day             = {13},
  month           = nov,
  year            = {2010},
  note            = {UCB/EECS-2010-136.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn237.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Solomonik:2011:COPa,
  author          = {Edgar Solomonik and James Demmel},
  title           = {Communication-optimal parallel {$ 2.5 $D}
                     matrix multiplication and {$ L U $}
                     factorization algorithms},
  type            = {LAPACK Working Note},
  number          = {238},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  day             = {9},
  month           = feb,
  year            = {2011},
  note            = {UCB/EECS-2011-10.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn238.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Ballard:2011:CBH,
  author          = {Grey Ballard and James Demmel and
                     Andrew Gearhart},
  title           = {Communication bounds for heterogeneous
                     architectures},
  type            = {LAPACK Working Note},
  number          = {239},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  day             = {11},
  month           = feb,
  year            = {2011},
  note            = {UCB/EECS-2011-13.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn239.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Anderson:2011:CAD,
  author          = {Michael Anderson and Grey Ballard and
                     James Demmel and Kurt Keutzer},
  title           = {Communication-Avoiding {$ Q R $}
                     Decomposition for {GPUs}},
  type            = {LAPACK Working Note},
  number          = {240},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  day             = {18},
  month           = feb,
  year            = {2011},
  note            = {Update of UCB/EECS-2010-131. To appear in
                     IPDPS'11.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn240.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Song:2011:STC,
  author          = {Fengguang Song and Hatem Ltaief and
                     Bilel Hadri and Jack Dongarra},
  title           = {Scalable Tile Communication-Avoiding
                     {$ Q R $} Factorization on Multicore Cluster
                     Systems},
  type            = {LAPACK Working Note},
  number          = {241},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {4},
  month           = mar,
  year            = {2011},
  note            = {UT-CS-10-653. Published at SC'10},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn241.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Agullo:2011:FEA,
  author          = {Emmanuel Agullo and Jack Dongarra and
                     Rajib Nath and Stanimire Tomov},
  title           = {A Fully Empirical Autotuned Dense {$ Q R $}
                     Factorization For Multicore Architectures},
  type            = {LAPACK Working Note},
  number          = {242},
  institution     = inst-INRIA,
  address         = inst-INRIA:adr,
  day             = {9},
  month           = mar,
  year            = {2011},
  note            = {INRIA-7526.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn242.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Haidar:2011:ADS,
  author          = {Azzam Haidar and Hatem Ltaief and
                     Asim YarKhan and Jack Dongarra},
  title           = {Analysis of Dynamically Scheduled Tile
                     Algorithms for Dense Linear Algebra on
                     Multicore Architectures},
  type            = {LAPACK Working Note},
  number          = {243},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {10},
  month           = mar,
  year            = {2011},
  note            = {UT-CS-11-666. Submitted at Concurrency and
                     Computations.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn243.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Luszczek:2011:TST,
  author          = {Piotr Luszczek and Hatem Ltaief and
                     Jack Dongarra},
  title           = {Two-Stage Tridiagonal Reduction for Dense
                     Symmetric Matrices using Tile Algorithms on
                     Multicore Architectures},
  type            = {LAPACK Working Note},
  number          = {244},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {18},
  month           = apr,
  year            = {2011},
  note            = {UT-CS-11-670.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn244.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Kurzak:2011:AGF,
  author =       "Jakub Kurzak and Stanimire Tomov and Jack Dongarra",
  title =        "Autotuning {GEMMs} for {Fermi}",
  type =         "LAPACK Working Note",
  number =       "245",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  day =          "18",
  month =        apr,
  year =         "2011",
  bibdate =      "Wed Aug 24 12:36:41 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-11-671. Submitted at SC11, November 12--18,
                 2011, Seattle, Washington, USA.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn245.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Baboulin:2011:ALS,
  author          = {Marc Baboulin and Jack Dongarra and
                     Julien Herrmann and Stanimire Tomov},
  title           = {Accelerating linear system solutions using
                     randomization techniques},
  type            = {LAPACK Working Note},
  number          = {246},
  institution     = inst-INRIA,
  address         = inst-INRIA:adr,
  day             = {15},
  month           = may,
  year            = {2011},
  note            = {INRIA RR-7616.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn246.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Ltaief:2011:HPB,
  author          = {Hatem Ltaief and Piotr Luszczek and
                     Jack Dongarra},
  title           = {High Performance Bidiagonal Reduction using
                     Tile Algorithms on Homogeneous Multicore
                     Architectures},
  type            = {LAPACK Working Note},
  number          = {247},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {18},
  month           = may,
  year            = {2011},
  note            = {UT-CS-11-673. Submitted at TOMS.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn247.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Solomonik:2011:COPb,
  author          = {Edgar Solomonik and James Demmel},
  title           = {Communication-optimal parallel {$ 2.5 $D}
                     matrix multiplication and {$ L U $}
                     factorization algorithms},
  type            = {LAPACK Working Note},
  number          = {248},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  day             = {7},
  month           = jun,
  year            = {2011},
  note            = {UCB/EECS-2011-72.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn248.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Gustavson:2011:LCF,
  author =       "Fred G. Gustavson and Jerzy Wa{\'s}niewski and Jack J.
                 Dongarra and Jos{\'e} R. Herrero and Julien Langou",
  title =        "Level-3 {Cholesky} Factorization Routines as Part of
                 Many {Cholesky} Algorithms",
  type =         "LAPACK Working Note",
  number =       "249",
  institution =  "????",
  address =      "????",
  year =         "2011",
  bibdate =      "Wed Aug 24 12:36:41 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "DTU/IMM-Technical-Report-2011-11, submitted at TOMS.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn249.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Song:2011:ESM,
  author          = {Fengguang Song and Stanimire Tomov and
                     Jack Dongarra},
  title           = {Efficient Support for Matrix Computations on
                     Heterogeneous Multi-core and Multi-{GPU}
                     Architectures},
  type            = {LAPACK Working Note},
  number          = {250},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {16},
  month           = jun,
  year            = {2011},
  note            = {UT-CS-11-668.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn250.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Ltaief:2011:PHP,
  author          = {Hatem Ltaief and Piotr Luszczek and
                     Jack Dongarra},
  title           = {Profiling High Performance Dense Linear
                     Algebra Algorithms on Multicore Architectures
                     for Power and Energy Efficiency},
  type            = {LAPACK Working Note},
  number          = {251},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {21},
  month           = jun,
  year            = {2011},
  note            = {UT-CS-11-674.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn251.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Du:2011:SER,
  author          = {Peng Du and Piotr Luszczek and
                     Stanimire Tomov and Jack Dongarra},
  title           = {Soft Error Resilient {$ Q R $} Factorization
                     for Hybrid System},
  type            = {LAPACK Working Note},
  number          = {252},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {1},
  month           = jul,
  year            = {2011},
  note            = {UT-CS-11-675.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn252.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Du:2011:ABF,
  author          = {Peng Du and Aurelien Bouteiller and
                     George Bosilca and Thomas Herault and
                     Jack Dongarra},
  title           = {Algorithm-based Fault Tolerance for Dense
                     Matrix Factorizations},
  type            = {LAPACK Working Note},
  number          = {253},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  day             = {5},
  month           = aug,
  year            = {2011},
  note            = {UT-CS-11-676.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn253.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Haidar:2011:PRC,
  author =       "Azzam Haidar and Hatem Ltaief and Jack Dongarra",
  title =        "Parallel Reduction to Condensed Forms for Symmetric
                 Eigenvalue Problems using Aggregated Fine-Grained and
                 Memory-Aware Kernels",
  type =         "LAPACK Working Note",
  number =       "254",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  day =          "5",
  month =        aug,
  year =         "2011",
  bibdate =      "Wed Aug 24 12:36:41 MDT 2011",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-11-677.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn254.pdf",
  acknowledgement = ack-nhfb,
}
@TechReport{Solomonik:2011:ICP,
  author          = {Edgar Solomonik and Abhinav Bhatele and
                     James Demmel},
  title           = {Improving communication performance in dense
                     linear algebra via topology aware collectives},
  type            = {LAPACK Working Note},
  number          = {255},
  institution     = inst-UCB-EECS,
  address         = inst-UCB-EECS:adr,
  day             = {15},
  month           = aug,
  year            = {2011},
  note            = {UCB/EECS-2011-92.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn255.pdf},
  bibdate         = {Wed Aug 24 12:36:41 MDT 2011},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Du:2011:HPL,
  author          = {Peng Du and Piotr Luszczek and Jack Dongarra},
  title           = {High Performance Linear System Solver with
                     Resilience to Multiple Soft Errors},
  type            = {LAPACK Working Note},
  number          = {256},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {2011},
  note            = {UT-CS-11-683.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn256.pdf},
  bibdate         = {Sun May 5 11:20:19 2013},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Dongarra:2011:HFA,
  author          = {Jack Dongarra and Mathieu Faverge and
                     Thomas Herault and Julien Langou and
                     Yves Robert},
  title           = {Hierarchical {$ Q R $} factorization
                     algorithms for multi-core cluster systems},
  type            = {LAPACK Working Note},
  number          = {257},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = oct,
  year            = {2011},
  note            = {UT-CS-11-684.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn257.pdf},
  bibdate         = {Sun May 5 11:20:19 2013},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@TechReport{Anzt:2011:BAR,
  author          = {Hartwig Anzt and Stanimire Tomov and
                     Jack Dongarra and Vincent Heuveline},
  title           = {A Block-Asynchronous Relaxation Method for
                     Graphics Processing Units},
  type            = {LAPACK Working Note},
  number          = {258},
  institution     = inst-UTK-CS,
  address         = inst-UTK-CS:adr,
  month           = dec,
  year            = {2011},
  note            = {UT-CS-11-687.},
  URL             = {http://www.netlib.org/lapack/lawnspdf/lawn258.pdf},
  bibdate         = {Sun May 5 11:20:19 2013},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  acknowledgement = ack-nhfb,
}
@techreport{Dongarra:2011:ANA,
  author = {Jack Dongarra and Mathieu Faverge and Hatem Ltaief and Piotr
    Luszczek},
  title = {Achieving Numerical Accuracy and High Performance using Recursive
    Tile {$ L U $} Factorization},
  type = {LAPACK Working Note},
  number = 259,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = dec,
  year = 2011,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-CS-11-688.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn259.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Anzt:2011:GAA,
  author = {Hartwig Anzt and Piotr Luszczek and Jack Dongarra and Vincent
    Heuveline},
  title = {{GPU}-Accelerated Asynchronous Error Correction for Mixed
    Precision Iterative Refinement},
  type = {LAPACK Working Note},
  number = 260,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = dec,
  year = 2011,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-CS-11-690.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn260.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Baboulin:2011:PTS,
  author = {Marc Baboulin and Dulceneia Becker and Jack Dongarra},
  title = {A parallel tiled solver for dense symmetric indefinite systems on
    multicore architectures},
  type = {LAPACK Working Note},
  number = 261,
  institution = inst-INRIA,
  address = inst-INRIA:adr,
  month = dec,
  year = 2011,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {INRIA-7762.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn261.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Bougeret:2011:URR,
  author = {Marin Bougeret and Henri Casanova and Yves Robert and
    Fr{\'e}d{\'e}ric Vivien and Dounia Zaidouni},
  title = {Using replication for resilience on exascale systems},
  type = {LAPACK Working Note},
  number = 262,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = dec,
  year = 2011,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-CS-11-691.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn262.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Khabou:2012:FPR,
  author = {Amal Khabou and James W. Demmel and Laura Grigori and Ming Gu},
  title = {{$ L U $} factorization with panel rank revealing pivoting and
    its communication avoiding version},
  type = {LAPACK Working Note},
  number = 263,
  institution = inst-UCB-EECS,
  address = inst-UCB-EECS:adr,
  month = jan,
  year = 2012,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UCB/EECS-2012-XX.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn263.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Bosilca:2012:DLA,
  author = {George Bosilca and Aurelien Bouteiller and Anthony Danalis and
    Thomas Herault and Piotr Luszczek and Jack J. Dongarra},
  title = {Dense Linear Algebra on Distributed Heterogeneous Hardware with a
    Symbolic {DAG} Approach},
  type = {LAPACK Working Note},
  number = 264,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = jan,
  year = 2012,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn264.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Bougeret:2012:UGR,
  author = {Marin Bougeret and Henri Casanova and Yves Robert and
    Fr{\'e}d{\'e}ric Vivien and Dounia Zaidouni},
  title = {Using group replication for resilience on exascale systems},
  type = {LAPACK Working Note},
  number = 265,
  institution = inst-INRIA,
  address = inst-INRIA:adr,
  month = mar,
  year = 2012,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn265.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Kurzak:2012:FPP,
  author = {Jakub Kurzak and Piotr Luszczek and Mathieu Faverge and Jack
    Dongarra},
  title = {{$ L U $} Factorization with Partial Pivoting for a Multi-{CPU},
    Multi-{GPU} Shared Memory System},
  type = {LAPACK Working Note},
  number = 266,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = apr,
  year = 2012,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn266.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Kurzak:2012:PRA,
  author = {Jakub Kurzak and Piotr Luszczek and Stanimire Tomov and Jack
    Dongarra},
  title = {Preliminary Results of Autotuning {GEMM} Kernels for the {NVIDIA
    Kepler Architecture GeForce GTX 680}},
  type = {LAPACK Working Note},
  number = 267,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = apr,
  year = 2012,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn267.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Robert:2012:CPR,
  author = {Yves Robert and Fr{\'e}d{\'e}ric Vivien and Dounia Zaidouni},
  title = {Combining Process Replication and Checkpointing for Resilience on
    Exascale Systems},
  type = {LAPACK Working Note},
  number = 268,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = jun,
  year = 2012,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-CS-12-696.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn268.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 269.  Frederic Vivien's given name carries its
%%% accents here, matching the spelling used for the same author in the
%%% other entries of this file, so author indexes list him only once.
@TechReport{Bosilca:2012:UMA,
  author =       "George Bosilca and Aurelien Bouteiller and Elisabeth
                 Brunet and Franck Cappello and Jack Dongarra and Amina
                 Guermouche and Thomas Herault and Yves Robert and
                 Fr{\'e}d{\'e}ric Vivien and Dounia Zaidouni",
  title =        "Unified Model for Assessing Checkpointing Protocols at
                 Extreme-Scale",
  type =         "LAPACK Working Note",
  number =       "269",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        jun,
  year =         "2012",
  bibdate =      "Sun May 5 11:20:19 2013",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-12-697.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn269.pdf",
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 270.
%%% NOTE(review): the second author's surname is spelled Hoffman (two f's);
%%% confirm against the title page of lawn270.pdf.
@TechReport{Langou:2012:HLL,
  author =       "Julie Langou and Bill Hoffman and Brad King",
  title =        "How {LAPACK} library enables {Microsoft Visual Studio}
                 support with {CMake} and {LAPACKE}",
  type =         "LAPACK Working Note",
  number =       "270",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        jul,
  year =         "2012",
  bibdate =      "Sun May 5 11:20:19 2013",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-12-698.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn270.pdf",
  acknowledgement = ack-nhfb,
}
@techreport{Karlsson:2012:OPC,
  author = {Lars Karlsson and Daniel Kressner},
  title = {Optimally packed chains of bulges in multishift {$ Q R $}
    algorithms},
  type = {LAPACK Working Note},
  number = 271,
  institution = {Department of Computing Science, Ume{\aa} University and
    EPF},
  address = {Ume{\aa}, Sweden and Lausanne, Switzerland},
  month = aug,
  year = 2012,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn271.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Du:2012:PGC,
  author = {Peng Du and Stanimire Tomov and Jack Dongarra},
  title = {Providing {GPU} Capability to {$ L U $} and {$ Q R $} within the
    {ScaLAPACK} Framework},
  type = {LAPACK Working Note},
  number = 272,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = sep,
  year = 2012,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-CS-12-699.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn272.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Baboulin:2012:ECC,
  author = {Marc Baboulin and Serge Gratton and Remi Lacroix and Alan Laub},
  title = {Efficient computation of condition estimates for linear least
    squares problems},
  type = {LAPACK Working Note},
  number = 273,
  institution = inst-INRIA,
  address = inst-INRIA:adr,
  month = sep,
  year = 2012,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {INRIA-8065.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn273.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Dongarra:2012:RDC,
  author = {Jack Dongarra and Thomas Herault and Yves Robert},
  title = {Revisiting the double checkpointing algorithm},
  type = {LAPACK Working Note},
  number = 274,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = dec,
  year = 2012,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-CS-13-705.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn274.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Cao:2013:CHP,
  author = {Chongxiao Cao and Jack Dongarra and Peng Du and Mark Gates and
    Piotr Luszczek and Stanimire Tomov},
  title = {{clMAGMA}: High Performance Dense Linear Algebra with {OpenCL}},
  type = {LAPACK Working Note},
  number = 275,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = mar,
  year = 2013,
  bibdate = {Sun May 5 11:20:19 2013},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-CS-13-706.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn275.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 276.  The report number in the note field
%%% (UCB/EECS-2013-46) is a UC Berkeley EECS number, so the institution and
%%% address use the Berkeley macros, as Khabou:2012:FPR does.
%%% NOTE(review): confirm the issuing institution against lawn276.pdf.
@TechReport{Demmel:2013:CAR,
  author =       "James W. Demmel and Laura Grigori and Ming Gu and Hua
                 Xiang",
  title =        "Communication Avoiding Rank Revealing {$ Q R $}
                 Factorization With Column Pivoting",
  type =         "LAPACK Working Note",
  number =       "276",
  institution =  inst-UCB-EECS,
  address =      inst-UCB-EECS:adr,
  month =        may,
  year =         "2013",
  bibdate =      "Sat Mar 15 07:08:58 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UCB/EECS-2013-46.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn276.pdf",
  acknowledgement = ack-nhfb,
}
@techreport{Aupy:2013:ISA,
  author = {Guillaume Aupy and Mathieu Faverge and Yves Robert and Jakub
    Kurzak and Piotr Luszczek and Jack Dongarra},
  title = {Implementing a systolic algorithm for {$ Q R $} factorization on
    multicore clusters with {PaRSEC}},
  type = {LAPACK Working Note},
  number = 277,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = may,
  year = 2013,
  bibdate = {Sat Mar 15 07:08:58 2014},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-CS-13-709.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn277.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Aupy:2013:CSE,
  author = {Guillaume Aupy and Anne Benoit and Thomas H{\'e}rault and Yves
    Robert and Fr{\'e}d{\'e}ric Vivien and Dounia Zaidouni},
  title = {On the Combination of Silent Error Detection and Checkpointing},
  type = {LAPACK Working Note},
  number = 278,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = jun,
  year = 2013,
  bibdate = {Sat Mar 15 07:08:58 2014},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-CS-13-710.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn278.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Jia:2013:TER,
  author = {Yulu Jia and Piotr Luszczek and Jack Dongarra},
  title = {Transient Error Resilient {Hessenberg} Reduction on {GPU}-based
    Hybrid Architectures},
  type = {LAPACK Working Note},
  number = 279,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = jun,
  year = 2013,
  bibdate = {Sat Mar 15 07:08:58 2014},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-CS-13-712.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn279.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 280; the note field records the report number
%%% UT-CS-13-715 and ends with a period like the other report-number notes
%%% in this file.
@TechReport{Donfack:2013:AVP,
  author =       "Simplice Donfack and Jack Dongarra and Mathieu Faverge
                 and Mark Gates and Jakub Kurzak and Piotr Luszczek and
                 Ichitaro Yamazaki",
  title =        "On Algorithmic Variants of Parallel {Gaussian}
                 Elimination: Comparison of Implementations in Terms of
                 Performance and Numerical Properties",
  type =         "LAPACK Working Note",
  number =       "280",
  institution =  inst-UTK-CS,
  address =      inst-UTK-CS:adr,
  month =        jul,
  year =         "2013",
  bibdate =      "Sat Mar 15 07:08:58 2014",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "UT-CS-13-715.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn280.pdf",
  acknowledgement = ack-nhfb,
}
@techreport{Aupy:2013:OCP,
  author = {Guillaume Aupy and Anne Benoit and Thomas Herault and Yves
    Robert and Jack Dongarra},
  title = {Optimal Checkpointing Period: Time vs. Energy},
  type = {LAPACK Working Note},
  number = 281,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = oct,
  year = 2013,
  bibdate = {Sat Mar 15 07:08:58 2014},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-EECS-13-718.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn281.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Faverge:2013:DHS,
  author = {Mathieu Faverge and Julien Herrmann and Julien Langou and
    Bradley Lowery and Yves Robert and Jack Dongarra},
  title = {Designing {$ L U $--$ Q R $} hybrid solvers for performance and
    stability},
  type = {LAPACK Working Note},
  number = 282,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = oct,
  year = 2013,
  bibdate = {Sat Mar 15 07:08:58 2014},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-EECS-13-719.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn282.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Haidar:2013:IPS,
  author = {Azzam Haidar and Piotr Luszczek and Jakub Kurzak and Jack
    Dongarra},
  title = {An Improved Parallel Singular Value Algorithm and Its
    Implementation for Multicore Hardware},
  type = {LAPACK Working Note},
  number = 283,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = oct,
  year = 2013,
  bibdate = {Sat Mar 15 07:08:58 2014},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {UT-EECS-13-720.},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn283.pdf},
  acknowledgement = ack-nhfb,
}
@techreport{Kohler:2013:FFB,
  author = {Martin K{\"o}hler and Jens Saak},
  title = {{FlexiBLAS} --- A flexible {BLAS} library with runtime
    exchangeable backends},
  type = {LAPACK Working Note},
  number = 284,
  institution = inst-UTK-CS,
  address = inst-UTK-CS:adr,
  month = {????},
  year = 2013,
  bibdate = {Sat Mar 15 07:08:58 2014},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  url = {http://www.netlib.org/lapack/lawnspdf/lawn284.pdf},
  acknowledgement = ack-nhfb,
}
%%% LAPACK Working Note 285.  The note field identifies this as Inria
%%% Research Report RR-8481, so the institution and address use the INRIA
%%% macros, as the other INRIA-numbered entries in this file do.
%%% NOTE(review): confirm the issuing institution against lawn285.pdf.
@TechReport{Baboulin:2014:URB,
  author =       "Marc Baboulin and Xiaoye S. Li and
                 Fran{\c{c}}ois-Henry Rouet",
  title =        "Using Random Butterfly Transformations to Avoid
                 Pivoting in Sparse Direct Methods",
  type =         "LAPACK Working Note",
  number =       "285",
  institution =  inst-INRIA,
  address =      inst-INRIA:adr,
  month =        feb,
  year =         "2014",
  bibdate =      "Sat Mar 15 07:08:58 2014",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Inria Research Report RR-8481.",
  URL =          "http://www.netlib.org/lapack/lawnspdf/lawn285.pdf",
  acknowledgement = ack-nhfb,
}
@article{Brewer:1988:TAAb,
  author = {Orlie Brewer and Jack Dongarra and Danny Sorensen},
  title = {Tools to aid in the analysis of memory access patterns for
    {FORTRAN} programs},
  journal = j-PARALLEL-COMPUTING,
  volume = 9,
  number = 1,
  pages = {25--35},
  month = dec,
  year = 1988,
  coden = {PACOEJ},
  issn = {0167-8191},
  bibdate = {Sat Mar 22 15:39:54 MST 1997},
  bibsource = {Compendex database;
    http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {See original LAPACK Working note in \cite{Brewer:1988:TAAa}.},
  url = {http://www.netlib.org/utk/people/JackDongarra/PAPERS/Tools-to-Aid-Analysis-of-Memory-Access-Patterns-for-FORTRAN-Programs.pdf},
  abstract = {In order to improve the performance of algorithms implemented
    on high-performance computers, we must consider not only the total
    number of memory references, but also the pattern of memory references.
    We would like our algorithms to observe the principle of locality of
    reference, so that the data can be effectively utilized. This paper
    describes a set of tools that can be used as an aid in the analysis of
    memory access patterns of FORTRAN programs.},
  acknowledgement = ack-nhfb,
  affiliation = {Argonne},
  affiliationaddress = {Argonne, IL, USA},
  classcodes = {C6115 (Programming support); C6110 (Systems analysis and
    programming)},
  classification = {723},
  corpsource = {Div. of Math. and Comput. Sci., Argonne Nat. Lab., IL, USA},
  journalabr = {Parallel Comput},
  keywords = {Computer Programming Languages--FORTRAN; Computer Software;
    Data Storage, Digital; FORTRAN Programs; FORTRAN programs; Linear
    Algebra; Memory Access Patterns; memory access patterns analysis;
    Parallel Processing Computers; parallel programming; Software
    Engineering; software tools; Visualization Tools},
  treatment = {P Practical},
}
@article{Bai:1989:BIHb,
  author = {Z. Bai and J. Demmel},
  title = {On a Block Implementation of {Hessenberg} Multishift {$ Q R $}
    Iteration},
  journal = j-INT-J-HIGH-SPEED-COMPUTING,
  volume = 1,
  number = 1,
  pages = {97--112},
  year = 1989,
  coden = {IHSCEZ},
  issn = {0129-0533},
  bibsource = {ftp://ftp.ira.uka.de/bibliography/Parallel/par.lin.alg.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {See original LAPACK Working note in \cite{Bai:1989:BIHa}.},
}
@article{Dongarra:1989:BRM,
  author = {J. J. Dongarra and D. C. Sorensen and S. J. Hammarling},
  title = {Block reduction of matrices to condensed forms for eigenvalue
    computations},
  journal = j-J-COMPUT-APPL-MATH,
  volume = 27,
  number = {1--2},
  pages = {215--227},
  month = sep,
  year = 1989,
  coden = {JCAMDI},
  issn = {0377-0427 (print), 1879-1778 (electronic)},
  bibdate = {Sat Mar 22 15:39:54 MST 1997},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {See original LAPACK Working note in \cite{Dongarra:1987:BRM}.},
  acknowledgement = ack-nhfb,
  classcodes = {C4140 (Linear algebra); C4240 (Programming and algorithm
    theory)},
  corpsource = {Math. and Comput. Sci. Div., Argonne Nat. Lab., IL, USA},
  keywords = {algorithms; bidiagonal; block algorithms; block reduction of
    matrices; condensed; divide and conquer technique; eigenvalue
    computations; eigenvalues and eigenfunctions; forms; Hessenberg form;
    Householder transformations; linear algebra; matrix-matrix operations;
    parallel},
  treatment = {T Theoretical or Mathematical},
}
@inproceedings{Anderson:1990:LPLb,
  author = {E. Anderson and Z. Bai and C. Bischof and J. Demmel and J.
    Dongarra and J. DuCroz and A. Greenbaum and S. Hammarling and A.
    McKenney and D. Sorensen},
  title = {{LAPACK}: {A} Portable Linear Algebra Library for
    High-Performance Computers},
  crossref = {IEEE:1990:PSN},
  pages = {2--11},
  year = 1990,
  bibdate = {Mon Sep 9 14:47:18 1996},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
    http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {See original LAPACK Working note in \cite{Anderson:1990:LPLa}.},
  abstract = {The goal of the LAPACK project is to design and implement a
    portable linear algebra library for efficient use on a variety of
    high-performance computers. The library is based on the widely used
    LINPACK and EISPACK packages for solving linear equations, eigenvalue
    problems, and linear least-squares problems, but extends their
    functionality in a number of ways. The major methodology for making the
    algorithms run faster is to restructure them to perform block matrix
    operations (e.g., matrix-matrix multiplication) in their inner loops.
    These block operations may be optimized to exploit the memory hierarchy
    of a specific architecture. The LAPACK project is also working on new
    algorithms that yield higher relative accuracy for a variety of linear
    algebra problems.},
  acknowledgement = ack-nhfb,
  affiliation = {Tennessee Univ., Knoxville, TN, USA},
  classification = {C4140 (Linear algebra); C7310 (Mathematics)},
  keywords = {Block matrix operations; Block operations; Eigenvalue
    problems; Functionality; Inner loops; LAPACK; Linear equations; Linear
    least-squares problems; Matrix-matrix multiplication; Memory hierarchy;
    Portable linear algebra library; Relative accuracy},
  page = {1--10},
  thesaurus = {Eigenvalues and eigenfunctions; Matrix algebra; Software
    portability; Subroutines},
}
@article{Barlow:1990:CAE,
  author = {Jesse Barlow and James Demmel},
  title = {Computing Accurate Eigensystems of Scaled Diagonally Dominant
    Matrices},
  journal = j-SIAM-J-NUMER-ANAL,
  volume = 27,
  number = 3,
  pages = {762--791},
  month = jun,
  year = 1990,
  coden = {SJNAAM},
  issn = {0036-1429 (print), 1095-7170 (electronic)},
  mrclass = {65F15},
  mrnumber = {91g:65071},
  mrreviewer = {Alan L. Andrew},
  bibdate = {Fri Oct 16 06:57:22 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lawn.bib; JSTOR database;
    Parallel/par.lin.alg.bib},
  note = {See original LAPACK Working note in \cite{Barlow:1988:CAE}.},
  acknowledgement = ack-nhfb,
}
%%% ACM TOMS Algorithm 679 (Level 3 BLAS model implementation).
%%% The two-word surname ``Du Croz'' is braced so that BibTeX treats it as
%%% a single last name; unbraced, ``Du'' would be parsed as a given name
%%% and the author would be sorted and abbreviated under ``Croz''.
@Article{Dongarra:1990:ASL,
  author =       "Jack J. Dongarra and Jeremy {Du Croz} and Sven
                 Hammarling and Iain Duff",
  title =        "{Algorithm 679}: {A} Set of Level 3 {Basic Linear
                 Algebra Subprograms}: Model Implementation and Test
                 Programs",
  journal =      j-TOMS,
  volume =       "16",
  number =       "1",
  pages =        "18--28",
  month =        mar,
  year =         "1990",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/77626.77627",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  bibdate =      "Sat Aug 27 17:29:49 1994",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See also
                 \cite{Higham:1990:EFM,Demmel:1992:SBA,Dayde:1994:PBI}.",
  URL =          "http://www.acm.org/pubs/citations/journals/toms/1990-16-1/p18-dongarra/",
  acknowledgement = ack-nhfb,
  keywords =     "algorithms; measurement; performance; reliability;
                 verification",
  subject =      "{\bf D.3.2}: Software, PROGRAMMING LANGUAGES, Language
                 Classifications, FORTRAN 8X. {\bf F.2.1}: Theory of
                 Computation, ANALYSIS OF ALGORITHMS AND PROBLEM
                 COMPLEXITY, Numerical Algorithms and Problems,
                 Computations on matrices. {\bf G.1.3}: Mathematics of
                 Computing, NUMERICAL ANALYSIS, Numerical Linear
                 Algebra, Linear systems (direct and iterative methods).
                 {\bf G.4}: Mathematics of Computing, MATHEMATICAL
                 SOFTWARE.",
}
@article{Higham:1990:EFM,
  author = {Nicholas J. Higham},
  title = {Exploiting Fast Matrix Multiplication Within the Level 3 {BLAS}},
  journal = j-TOMS,
  volume = 16,
  number = 4,
  pages = {352--368},
  month = dec,
  year = 1990,
  coden = {ACMSCU},
  doi = {https://doi.org/10.1145/98267.98290},
  issn = {0098-3500 (print), 1557-7295 (electronic)},
  mrclass = {65-04 (65F99)},
  mrnumber = {1 095 133},
  bibdate = {Sun Sep 04 23:21:57 1994},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note = {Describes algorithms based on Strassen's method which are
    asymptotically faster than the standard {$ {N}^3 $} algorithm, and in
    practice, faster for {$ {N} \approx 100 $}, and examines their numerical
    stability. See
    \cite{Dongarra:1990:ASL,Demmel:1992:SBA,Dayde:1994:PBI}.},
  url = {http://www.acm.org/pubs/citations/journals/toms/1990-16-4/p352-higham/},
  abstract = {The Level 3 BLAS (BLAS3) are a set of specifications of
    FORTRAN 77 subprograms for carrying out matrix multiplications and the
    solution of triangular systems with multiple right-hand sides. They are
    intended to provide efficient and portable building blocks for linear
    algebra algorithms on high-performance computers. We describe
    algorithms for the BLAS3 operations that are asymptotically faster than
    the conventional ones. These algorithms are based on Strassen's method
    for fast matrix multiplication, which is now recognized to be a
    practically useful technique once matrix dimensions exceed about 100.
    We pay particular attention to the numerical stability of these ``fast
    BLAS3.'' Error bounds are given and their significance is explained and
    illustrated with the aid of numerical experiments. Our conclusion is
    that the fast BLAS3, although not as strongly stable as conventional
    implementations, are stable enough to merit careful consideration in
    many applications.},
  acknowledgement = ack-nhfb,
  keywords = {algorithms},
  subject = {{\bf G.1.3}: Mathematics of Computing, NUMERICAL ANALYSIS,
    Numerical Linear Algebra. {\bf D.3.2}: Software, PROGRAMMING LANGUAGES,
    Language Classifications, FORTRAN 77.},
}
@article{Deift:1991:BSV,
  author = {Percy Deift and James Demmel and Luen Chau Li and Carlos Tomei},
  title = {The Bidiagonal Singular Value Decomposition and {Hamiltonian}
    Mechanics},
  journal = j-SIAM-J-NUMER-ANAL,
  volume = 28,
  number = 5,
  pages = {1463--1516},
  month = oct,
  year = 1991,
  coden = {SJNAAM},
  issn = {0036-1429 (print), 1095-7170 (electronic)},
  mrclass = {65F15 (58F05)},
  mrnumber = {92i:65071},
  mrreviewer = {T. Y. Li},
  bibdate = {Fri Oct 16 06:57:22 MDT 1998},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/lawn.bib; JSTOR
    database},
  note = {See original LAPACK Working note in \cite{Deift:1989:BSV}.},
  acknowledgement = ack-nhfb,
}
@Article{Dongarra:1991:IRS,
  author          = {J. J. Dongarra and P. Mayes and G. {Radicati di Brozolo}},
  title           = {The {IBM RISC System\slash 6000} and Linear Algebra
                     Operations},
  journal         = j-SUPERCOMPUTER,
  volume          = {8},
  number          = {4},
  pages           = {15--30},
  month           = jul,
  year            = {1991},
  CODEN           = {SPCOEL},
  ISSN            = {0168-7875},
  bibdate         = {Sat Mar 22 15:39:54 MST 1997},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in \cite{Dongarra:1990:IRS}.},
  URL             = {http://www.netlib.org/utk/people/JackDongarra/PAPERS/The-IBM-RISC-System-6000-and-Linear-Algebra-Operations.pdf},
  abstract        = {The paper discusses the IBM RISC System/6000
                     workstation and a set of experiments with blocked
                     algorithms commonly used in solving problems in
                     numerical linear algebra. The authors describe the
                     performance of these algorithms and discuss the
                     techniques used in achieving high performance on such
                     an architecture.},
  acknowledgement = ack-nhfb,
  affiliation     = {Math. Sci. Section, Oak Ridge Nat. Lab., TN, USA},
  classcodes      = {C5420 (Mainframes and minicomputers); C5470
                     (Performance evaluation and testing); C4140 (Linear
                     algebra)},
  classification  = {C4140 (Linear algebra); C5420 (Mainframes and
                     minicomputers); C5470 (Performance evaluation and
                     testing)},
  corpsource      = {Math. Sci. Section, Oak Ridge Nat. Lab., TN, USA},
  keywords        = {blocked algorithms; Blocked algorithms; Floating point
                     performance; floating point performance; IBM computers;
                     IBM RISC System/6000; IBM RISC System/6000 workstation;
                     linear algebra; numerical linear algebra; Numerical
                     linear algebra; performance evaluation; reduced
                     instruction set computing; workstation},
  pubcountry      = {Netherlands},
  thesaurus       = {IBM computers; Linear algebra; Performance evaluation;
                     Reduced instruction set computing},
  treatment       = {P Practical},
}
@Article{Anderson:1992:GFA,
  author =       "E. Anderson and Z. Bai and J. Dongarra",
  title =        "Generalized {$ Q R $} factorization and its
                 applications",
  journal =      j-LINEAR-ALGEBRA-APPL,
  volume =       "162--164",
  pages =        "243--271",
  year =         "1992",
  CODEN =        "LAAPAW",
  ISSN =         "0024-3795 (print), 1873-1856 (electronic)",
  MRclass =      "65F15 (15A23)",
  MRnumber =     "92j:65050",
  bibdate =      "Thu Dec 19 14:07:22 1996",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Directions in matrix theory (Auburn, AL, 1990). See
                 original LAPACK Working note in
                 \cite{Anderson:1991:GQF}.",
  URL =          "http://www.netlib.org/utk/people/JackDongarra/PAPERS/Generalized-QR-Factorization-and-Its-Applications.pdf",
  acknowledgement = ack-nhfb,
}
@Article{Bischof:1992:GIC,
  author          = {Christian H. Bischof and Ping Tak Peter Tang},
  title           = {Generalizing incremental condition estimation},
  journal         = j-J-NUM-LIN-ALG-APPL,
  volume          = {1},
  number          = {2},
  pages           = {149--163},
  year            = {1992},
  CODEN           = {NLAAEM},
  ISSN            = {0129-3281},
  MRclass         = {65F30},
  MRnumber        = {93e:65068},
  bibdate         = {Thu Jan 23 19:03:25 MST 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in \cite{Bischof:1991:GIC}.},
  acknowledgement = ack-nhfb,
}
@InProceedings{Choi:1992:SSLb,
  author          = {J. Choi and J. J. Dongarra and R. Pozo and D. W. Walker},
  title           = {{ScaLAPACK}: a scalable linear algebra library for
                     distributed memory concurrent computers},
  crossref        = {Siegel:1992:FSF},
  pages           = {120--127},
  year            = {1992},
  bibdate         = {Sat Mar 22 15:39:54 MST 1997},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {IEEE catalog number 92CH3185-6. See original LAPACK
                     Working note in \cite{Choi:1992:SSLa}.},
  acknowledgement = ack-nhfb,
  classcodes      = {C7310 (Mathematics); C4140 (Linear algebra); C6110J
                     (Object-oriented programming); C5470 (Performance
                     evaluation and testing); C5440 (Multiprocessor systems
                     and techniques)},
  conflocation    = {McLean, VA, USA; 19-21 Oct. 1992},
  corpsource      = {Oak Ridge Nat. Lab., TN, USA},
  keywords        = {algorithm; computations; computing; distributed;
                     distributed memory systems; distributed memory version;
                     evaluation; Intel Delta multicomputer; Level 3 BLAS;
                     library routines; linear algebra; mathematics; matrix;
                     memory concurrent computers; object-oriented interface;
                     object-oriented programming; performance; right-looking
                     LU factorization; scalable linear algebra library;
                     ScaLAPACK; software package; software packages; square
                     block scattered decomposition},
  sponsororg      = {IEEE; NASA},
  treatment       = {A Application; P Practical},
}
@Article{Croz:1992:SMM,
  author =       "Jeremy J. {Du Croz} and Nicholas J. Higham",
  title =        "Stability of Methods for Matrix Inversion",
  journal =      j-IMA-J-NUMER-ANAL,
  volume =       "12",
  pages =        "1--19",
  year =         "1992",
  CODEN =        "IJNADH",
  ISSN =         "0272-4979 (print), 1464-3642 (electronic)",
  bibdate =      "Sat Dec 23 14:54:28 2000",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/h/higham-nicholas-john.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{DuCroz:1990:SMM}.",
  acknowledgement = ack-njh,
}
@InProceedings{Demmel:1992:DPH,
  author =       "J. Demmel and J. Dongarra and W. Kahan",
  title =        "On Designing Portable High Performance Numerical
                 Libraries",
  crossref =     "Griffiths:1992:NAP",
  pages =        "??--??",
  year =         "1992",
  bibdate =      "Tue Feb 26 10:10:44 2002",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Parallel/par.lin.alg.bib;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Conference held in June 1991. See original LAPACK
                 Working note in \cite{Demmel:1991:DPH}.",
  acknowledgement = ack-nhfb,
}
@Article{Demmel:1992:JMM,
  author          = {James Demmel and Kre{\v{s}}imir Veseli{\'c}},
  title           = {{Jacobi}'s Method is More Accurate than {$ Q R $}},
  journal         = j-SIAM-J-MAT-ANA-APPL,
  volume          = {13},
  number          = {4},
  pages           = {1204--1245},
  month           = oct,
  year            = {1992},
  CODEN           = {SJMAEL},
  ISSN            = {0895-4798 (print), 1095-7162 (electronic)},
  MRclass         = {65F15 (65G05)},
  MRnumber        = {93e:65057},
  bibdate         = {Tue Jan 21 08:54:30 MST 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in \cite{Demmel:1989:JMM}.},
  acknowledgement = ack-nhfb,
}
@Article{Demmel:1992:SBA,
  author          = {James W. Demmel and Nicholas J. Higham},
  title           = {Stability of Block Algorithms with Fast Level-3 {BLAS}},
  journal         = j-TOMS,
  volume          = {18},
  number          = {3},
  pages           = {274--291},
  month           = sep,
  year            = {1992},
  CODEN           = {ACMSCU},
  ISSN            = {0098-3500 (print), 1557-7295 (electronic)},
  bibdate         = {Fri Sep 30 01:27:16 1994},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/duff-iain-s.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See
                     \cite{Dongarra:1990:ASL,Higham:1990:EFM,Dayde:1994:PBI}.
                     See original LAPACK Working note in
                     \cite{Demmel:1990:SBA}.},
  URL             = {http://www.acm.org/pubs/toc/Abstracts/0098-3500/131769.html},
  abstract        = {Block algorithms are becoming increasingly popular in
                     matrix computations. Since their basic unit of data is
                     a submatrix rather than a scalar, they have a higher
                     level of granularity than point algorithms, and this
                     makes them well suited to high-performance computers.
                     The numerical stability of the block algorithms in the
                     new linear algebra program library LAPACK is
                     investigated here. It is shown that these algorithms
                     have backward error analyses in which the backward
                     error bounds are commensurate with the error bounds for
                     the underlying level-3 BLAS (BLAS3). One implication is
                     that the block algorithms are as stable as the
                     corresponding point algorithms when conventional BLAS3
                     are used. A second implication is that the use of BLAS3
                     based on fast matrix multiplication techniques affects
                     the stability only insofar as it increases the constant
                     terms in the normwise backward error bounds. For linear
                     equation solvers employing {\em LU} factorization, it
                     is shown that fixed precision iterative refinement
                     helps to mitigate the effect of the larger error
                     constants. Despite the positive results presented here,
                     not all plausible block algorithms are stable; we
                     illustrate this with the example of {\em LU}
                     factorization with block triangular factors and
                     describe how to check a block algorithm for stability
                     without doing a full error analysis.},
  acknowledgement = ack-nhfb,
  keywords        = {algorithms; performance},
  subject         = {{\bf G.1.3}: Mathematics of Computing, NUMERICAL
                     ANALYSIS, Numerical Linear Algebra. {\bf F.2.1}: Theory
                     of Computation, ANALYSIS OF ALGORITHMS AND PROBLEM
                     COMPLEXITY, Numerical Algorithms and Problems,
                     Computations on matrices.},
}
@InProceedings{Dongarra:1992:LASb,
  author =       "J. Dongarra and R. {van de Geijn} and D. Walker",
  title =        "A look at scalable dense linear algebra libraries",
  crossref =     "IEEE:1992:SHP",
  pages =        "??--??",
  year =         "1992",
  bibdate =      "Sat Mar 22 15:39:54 MST 1997",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Parallel/par.lin.alg.bib;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "IEEE catalog number 92TH0432-5. See original LAPACK
                 Working note in \cite{Dongarra:1992:LASa}.",
  acknowledgement = ack-nhfb,
  classcodes =   "C4140 (Linear algebra); C7310 (Mathematics); C6110P
                 (Parallel programming)",
  conflocation = "Williamsburg, VA, USA; 26-29 April 1992",
  corpsource =   "Dept. of Comput. Sci., Tennessee Univ., TN, USA",
  keywords =     "14 GFLOPS; applications; concurrent computers; Delta
                 system; dense matrix problems; distributed memory;
                 double precision; Intel Touchstone; linear algebra; LU
                 factorization; mathematics computing; object-oriented;
                 object-oriented interface; parallel implementation;
                 parallel programming; portable; programming; scalable
                 dense linear algebra libraries; software portability;
                 square block scattered decomposition; subroutines; user
                 interfaces",
  sponsororg =   "IEEE",
  treatment =    "P Practical",
}
@Article{Dongarra:1992:NCC,
  author          = {Jack J. Dongarra and Sven Hammarling and James H. Wilkinson},
  title           = {Numerical Considerations in Computing Invariant
                     Subspaces},
  journal         = j-SIAM-J-MAT-ANA-APPL,
  volume          = {13},
  number          = {1},
  pages           = {145--161},
  month           = jan,
  year            = {1992},
  CODEN           = {SJMAEL},
  ISSN            = {0895-4798 (print), 1095-7162 (electronic)},
  MRclass         = {65F15},
  MRnumber        = {93a:65049},
  MRreviewer      = {Colette Lebaud},
  bibdate         = {Tue Jan 21 08:54:30 MST 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in \cite{Dongarra:1990:NCC}.},
  acknowledgement = ack-nhfb,
}
@Article{Dongarra:1992:RCFb,
  author          = {Jack J. Dongarra and Robert A. {van de Geijn}},
  title           = {Reduction to condensed form for the eigenvalue problem
                     on distributed memory architectures},
  journal         = j-PARALLEL-COMPUTING,
  volume          = {18},
  number          = {9},
  pages           = {973--982},
  month           = sep,
  year            = {1992},
  CODEN           = {PACOEJ},
  ISSN            = {0167-8191},
  MRclass         = {65Y05 (65F15)},
  MRnumber        = {1 190 458},
  bibdate         = {Thu Sep 16 09:30:12 1999},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in \cite{Dongarra:1991:RCF}.},
  URL             = {http://www.netlib.org/utk/people/JackDongarra/PAPERS/Reduction-to-Condensed-Form-for-the-Eigenvalue-Problem-on-Distributed-Memory.pdf},
  abstract        = {The authors describe a parallel implementation for the
                     reduction of general and symmetric matrices to
                     Hessenberg and tridiagonal form, respectively. The
                     methods are based on LAPACK sequential codes and use a
                     panel-wrapped mapping of matrices to nodes. Results
                     from experiments on the Intel Touchstone Delta are
                     given.},
  acknowledgement = ack-nhfb,
  affiliation     = {Dept. of Comput. Sci., Tennessee Univ., Knoxville, TN,
                     USA},
  classcodes      = {C7310 (Mathematics); C5220P (Parallel architecture);
                     C4140 (Linear algebra)},
  classification  = {C4140 (Linear algebra); C5220P (Parallel
                     architecture); C7310 (Mathematics)},
  corpsource      = {Dept. of Comput. Sci., Tennessee Univ., Knoxville, TN,
                     USA},
  keywords        = {architectures; distributed memory; Distributed memory
                     architectures; distributed memory systems; Eigenvalue
                     problem; eigenvalue problem; eigenvalues and
                     eigenfunctions; Hessenberg form; Intel Touchstone
                     Delta; LAPACK sequential codes; linear algebra;
                     mapping; mathematics computing; panel-wrapped;
                     Panel-wrapped mapping; parallel; Parallel
                     implementation; parallel implementation; Symmetric
                     matrices; symmetric matrices; Tridiagonal form;
                     tridiagonal form},
  pubcountry      = {Netherlands},
  thesaurus       = {Distributed memory systems; Eigenvalues and
                     eigenfunctions; Linear algebra; Mathematics computing;
                     Parallel architectures},
  treatment       = {P Practical},
}
@Article{Anderson:1993:PLP,
  author          = {E. C. Anderson and J. Dongarra},
  title           = {Performance of {LAPACK}: a portable library of
                     numerical linear algebra routines},
  journal         = j-PROC-IEEE,
  volume          = {81},
  number          = {8},
  pages           = {1094--1102},
  month           = aug,
  year            = {1993},
  CODEN           = {IEEPAD},
  ISSN            = {0018-9219},
  bibdate         = {Sat Mar 22 15:39:54 MST 1997},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in \cite{Anderson:1992:PLP}.},
  URL             = {http://www.netlib.org/utk/people/JackDongarra/PAPERS/Performance-of-LAPACK-A-Portable-Library.pdf},
  acknowledgement = ack-nhfb,
  classcodes      = {C7310 (Mathematics); C4140 (Linear algebra); C5440
                     (Multiprocessor systems and techniques); C6150G
                     (Diagnostic, testing, debugging and evaluating
                     systems)},
  corpsource      = {Cray Res. Center, Eagan, MN, USA},
  keywords        = {algebra routines; computers; evaluation; LAPACK
                     project; library; linear algebra; mathematics
                     computing; numerical linear; numerical linear algebra;
                     parallel; parallel processors; performance; performance
                     tuning; portability; portable library; program testing;
                     shared memory systems; shared-memory vector; software},
  treatment       = {P Practical},
}
@Article{Bai:1993:CGS,
  author =       "Zhaojun Bai and James W. Demmel",
  title =        "Computing the generalized singular value
                 decomposition",
  journal =      j-SIAM-J-SCI-COMP,
  volume =       "14",
  number =       "6",
  pages =        "1464--1486",
  month =        nov,
  year =         "1993",
  CODEN =        "SJOCE3",
  ISSN =         "1064-8275 (print), 1095-7197 (electronic)",
  ISSN-L =       "1064-8275",
  MRclass =      "65F30",
  MRnumber =     "94h:65043",
  bibdate =      "Tue Apr 29 18:15:07 MDT 1997",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Bai:1992:CGS}.",
  acknowledgement = ack-nhfb,
}
@Article{Bai:1993:SDB,
  author =       "Zhaojun Bai and James W. Demmel",
  title =        "On Swapping Diagonal Blocks in Real {Schur} Form",
  journal =      j-LINEAR-ALGEBRA-APPL,
  volume =       "186",
  pages =        "73--95",
  year =         "1993",
  CODEN =        "LAAPAW",
  ISSN =         "0024-3795 (print), 1873-1856 (electronic)",
  MRclass =      "15A18",
  MRnumber =     "94d:15006",
  bibdate =      "Wed Jan 22 17:57:24 MST 1997",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Parallel/par.lin.alg.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Bai:1992:SDB}.",
  acknowledgement = ack-nhfb,
}
@Article{Demmel:1993:CAS,
  author =       "James W. Demmel and William Gragg",
  title =        "On Computing Accurate Singular Values and Eigenvalues
                 of Matrices With Acyclic Graphs",
  journal =      j-LINEAR-ALGEBRA-APPL,
  volume =       "185",
  pages =        "203--217",
  month =        may,
  year =         "1993",
  CODEN =        "LAAPAW",
  ISSN =         "0024-3795 (print), 1873-1856 (electronic)",
  MRclass =      "65F30 (15A18)",
  MRnumber =     "94h:65044",
  bibdate =      "Wed Jan 22 17:57:24 MST 1997",
  bibsource =    "ftp://ftp.ira.uka.de/pub/bibliography/Parallel/par.lin.alg.bib;
                 ftp://ftp.ira.uka.de/pub/bibliography/Theory/Matrix.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Demmel:1992:CAS}.",
  acknowledgement = ack-nhfb,
  keywords =     "nla; la; pert; svd; eig; arrowhead matrix; acyclic
                 graph",
}
@Article{Demmel:1993:IEB,
  author          = {James W. Demmel and Nicholas J. Higham},
  title           = {Improved Error Bounds for Underdetermined System
                     Solvers},
  journal         = j-SIAM-J-MAT-ANA-APPL,
  volume          = {14},
  number          = {1},
  pages           = {1--14},
  month           = jan,
  year            = {1993},
  CODEN           = {SJMAEL},
  ISSN            = {0895-4798 (print), 1095-7162 (electronic)},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/h/higham-nicholas-john.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in \cite{Demmel:1990:IEB}.},
  acknowledgement = ack-njh,
  mynote          = {Also LAPACK Working Note \#23.},
}
@InCollection{Demmel:1993:PNLb,
  author    = {J. Demmel and M. Heath and H. {van der Vorst}},
  booktitle = {Acta Numerica 1993},
  title     = {Parallel Numerical Linear Algebra},
  publisher = pub-CAMBRIDGE,
  address   = pub-CAMBRIDGE:adr,
  pages     = {111--198},
  year      = {1993},
  bibdate   = {Thu Jun 8 12:55:05 MDT 1995},
  bibsource = {http://www.math.utah.edu/pub/bibnet/authors/v/vandervorst-henk-a.bib;
               http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note      = {See original LAPACK Working note in \cite{Demmel:1993:PNLa}.},
}
@InProceedings{Demmel:1993:TPN,
  author =       "James W. Demmel",
  editor =       "Marc S. Moonen and Gene H. Golub and Bart L. {De
                 Moor}",
  booktitle =    "Linear Algebra for Large Scale and Real-Time
                 Applications",
  title =        "Trading Off Parallelism and Numerical Stability",
  volume =       "232",
  publisher =    pub-KLUWER,
  address =      pub-KLUWER:adr,
  pages =        "49--68",
  year =         "1993",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Demmel:1992:TPN}.",
  series =       "NATO ASI Series E",
}
@InProceedings{Dongarra:1993:TDB,
  author =       "J. J. Dongarra and R. A. {van de Geijn} and R. Clint
                 Whaley",
  title =        "Two Dimensional Basic Linear Algebra Communication
                 Subprograms",
  crossref =     "Sincovec:1993:SCP",
  pages =        "347--352",
  year =         "1993",
  bibdate =      "Fri Mar 1 10:04:10 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Dongarra:1991:TDB}.",
  acknowledgement = ack-nhfb,
}
@Article{Higham:1993:PTB,
  author          = {Nicholas J. Higham},
  title           = {Perturbation theory and backward error analysis for
                     {$ A X - X B = C $}},
  journal         = j-BIT,
  volume          = {33},
  number          = {1},
  pages           = {124--136},
  year            = {1993},
  CODEN           = {BITTEL, NBITAB},
  ISSN            = {0006-3835 (print), 1572-9125 (electronic)},
  MRclass         = {65F05 (65G05)},
  MRnumber        = {96a:65036},
  bibdate         = {Fri Nov 13 07:00:34 MST 1998},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/h/higham-nicholas-john.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in \cite{Higham:1992:PTB}.},
  URL             = {http://www.mai.liu.se/BIT/contents/bit33.html},
  acknowledgement = ack-njh # " and " # ack-nhfb,
}
@InProceedings{Choi:1994:DPD,
  author          = {J. Choi and J. J. Dongarra and D. W. Walker},
  title           = {The design of a parallel, dense linear algebra
                     software library: reduction to {Hessenberg},
                     tridiagonal, and bidiagonal form},
  crossref        = {Dongarra:1994:PSW},
  pages           = {98--111},
  year            = {1994},
  bibdate         = {Sat Mar 22 15:39:54 MST 1997},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Choi:1995:DPDa} and \cite{Choi:1995:DPDb}.},
  acknowledgement = ack-nhfb,
  classcodes      = {C7310 (Mathematics computing); C6110B (Software
                     engineering techniques); C5440 (Multiprocessing
                     systems); C4140 (Linear algebra); C6110P (Parallel
                     programming)},
  conflocation    = {Townsend, TN, USA; 25-27 May 1994},
  conftitle       = {Proceedings of the Second Workshop on Environments and
                     Tools for Parallel Scientific Computing},
  corpsource      = {Dept. of Comput. Sci., Tennessee Univ., Knoxville, TN,
                     USA},
  keywords        = {algebra; Basic Linear Algebra Communication
                     Subprograms; bidiagonal form; distributed Level 3 BLAS
                     routines; distributed memory concurrent computers;
                     distributed memory systems; Hessenberg; higher level;
                     library routines; mathematics computing; matrix; panel
                     reduction phase; Parallel Block BLAS; parallel dense
                     linear algebra software library; parallel programming;
                     PB-BLAS; reduction algorithms; ScaLAPACK; sequential
                     BLAS; software engineering considerations; software
                     libraries; tridiagonal},
  treatment       = {P Practical},
}
@InProceedings{Choi:1994:PMT,
  author =       "Jaeyoung Choi and J. J. Dongarra and D. W. Walker",
  title =        "Parallel matrix transpose algorithms on distributed
                 memory concurrent computers",
  crossref =     "IEEE:1994:PSP",
  pages =        "245--252",
  year =         "1994",
  bibdate =      "Sat Mar 22 15:39:54 MST 1997",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Choi:1993:PMT}.",
  acknowledgement = ack-nhfb,
  classcodes =   "C7310 (Mathematics); C4240P (Parallel programming and
                 algorithm theory); C4140 (Linear algebra); C5440
                 (Multiprocessor systems and techniques)",
  conflocation = "Mississippi State, MS, USA; 6-8 Oct. 1993",
  conftitle =    "Proceedings of Scalable Parallel Libraries
                 Conference",
  corpsource =   "Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA",
  keywords =     "algebra; block scattered data distribution; computer;
                 concurrent computers; distributed memory; distributed
                 memory systems; Intel Touchstone Delta; mathematics
                 computing; matrix; matrix multiplication routine;
                 parallel algorithms; parallel matrix transpose
                 algorithms; point-to-point communication; PUMMA
                 package; synchronisation; transposed matrices",
  sponsororg =   "Mississippi State Univ.; Nat. Sci. Found.",
  treatment =    "A Application; P Practical",
}
@Article{Choi:1994:PPU,
  author =       "Jaeyoung Choi and Jack J. Dongarra and David W.
                 Walker",
  title =        "{PUMMA}: {Parallel Universal Matrix Multiplication
                 Algorithms} on distributed memory concurrent
                 computers",
  journal =      j-CPE,
  volume =       "6",
  number =       "7",
  pages =        "543--570",
  month =        oct,
  year =         "1994",
  CODEN =        "CPEXEI",
  ISSN =         "1040-3108",
  bibdate =      "Tue Feb 26 09:30:21 2002",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Choi:1993:PPU}.",
  URL =          "http://www.netlib.org/utk/people/JackDongarra/PAPERS/PUMMA-Parallel-Universal-Matrix-Multiplication-Algorithms.pdf",
  abstract =     "The paper describes Parallel Universal Matrix
                 Multiplication Algorithms (PUMMA) on distributed memory
                 concurrent computers. The PUMMA package includes not
                 only the non-transposed matrix multiplication routine
                 {$ C = A \cdot B $}, but also transposed multiplication
                 routines {$ C = A^T \cdot B $}, {$ C = A \cdot B^T $},
                 and {$ C = A^T \cdot B^T $}, for a block cyclic data
                 distribution. The routines perform efficiently for a
                 wide range of processor configurations and block sizes.
                 The PUMMA together provide the same functionality as
                 the Level 3 BLAS routine xGEMM. Details of the parallel
                 implementation of the routines are given, and results
                 are presented for runs on the Intel Touchstone Delta
                 computer.",
  acknowledgement = ack-nhfb,
  affiliation =  "Oak Ridge Natl Lab",
  affiliationaddress = "Oak Ridge, TN, USA",
  classcodes =   "C7310 (Mathematics); C5440 (Multiprocessor systems and
                 techniques); C4240P (Parallel programming and algorithm
                 theory); C4140 (Linear algebra)",
  classification = "722.4; 723.1; 921.1",
  corpsource =   "Math. Sci. Sect., Oak Ridge Nat. Lab., TN, USA",
  journalabr =   "Concurrency Pract Exper",
  keywords =     "algebra; Algorithms; block cyclic data distribution;
                 block sizes; Computer architecture; configurations;
                 Distributed memory concurrent computers; distributed
                 memory concurrent computers; distributed memory
                 systems; Intel Touchstone Delta Computer; level 3 BLAS
                 routine xGEMM; Mathematical operators; mathematics
                 computing; matrix; Matrix algebra; matrix
                 multiplication routine; Multiprogramming;
                 nontransposed; parallel algorithms; Parallel processing
                 systems; Parallel Universal Matrix Multiplication
                 Algorithm (PUMMA); parallel universal matrix
                 multiplication algorithms; processor; PUMMA; routines;
                 transposed multiplication",
  treatment =    "A Application; P Practical",
}
@Article{Dayde:1994:PBI,
  author =       "Michel J. Dayd{\'e} and Iain S. Duff and Antoine
                 Petitet",
  title =        "A Parallel Block Implementation of Level-3 {BLAS} for
                 {MIMD} Vector Processors",
  journal =      j-TOMS,
  volume =       "20",
  number =       "2",
  pages =        "178--193",
  month =        jun,
  year =         "1994",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/178365.174413",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  bibdate =      "Fri Sep 09 13:52:29 1994",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See
                 \cite{Dongarra:1990:ASL,Higham:1990:EFM,Demmel:1992:SBA}.",
  URL =          "http://www.acm.org/pubs/citations/journals/toms/1994-20-2/p178-dayde/",
  acknowledgement = ack-nhfb,
  keywords =     "algorithms; Level-3 BLAS; matrix-matrix kernels;
                 measurement; parallelization; performance;
                 vectorization",
  subject =      "F.2.1 [Analysis of Algorithms and Problem Complexity]:
                 Numerical Algorithms and Problems--computations on
                 matrices; G.1.0 [Numerical Analysis]:
                 General--numerical algorithms; G.1.3 [Numerical
                 Analysis]: Numerical Linear Algebra--linear systems
                 (direct and iterative methods); G.4 [Mathematics of
                 Computing]: Mathematical Software--certification and
                 testing; efficiency; portability; reliability and
                 robustness; verification",
}
@article{Demmel:1994:FNA,
  author          = {James W. Demmel and Xiaoye Li},
  title           = {Faster Numerical Algorithms via Exception Handling},
  journal         = j-IEEE-TRANS-COMPUT,
  volume          = {43},
  number          = {8},
  pages           = {983--992},
  month           = aug,
  year            = {1994},
  coden           = {ITCOB4},
  issn            = {0018-9340 (print), 1557-9956 (electronic)},
  issn-l          = {0018-9340},
  bibdate         = {Mon May 20 06:16:49 MDT 2002},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib; OCLC
                     Proceedings database},
  note            = {This is an expanded version of
                     \cite{Demmel:1993:FNA}.},
  url             = {http://www.cs.berkeley.edu/~xiaoye/ieee.ps.gz},
  acknowledgement = ack-nhfb,
  remark          = {Selected revised and extended papers from ARITH'11
                     \cite{Swartzlander:1993:PSC}.},
}
@inproceedings{Dongarra:1994:SMLb,
  author          = {J. Dongarra and A. Lumsdaine and X. Niu and R. Pozo
                     and K. Remington},
  title           = {A Sparse Matrix Library in {C++} For High Performance
                     Architectures},
  crossref        = {Anonymous:1994:OON},
  pages           = {214--218},
  year            = {1994},
  bibdate         = {Thu Sep 16 09:48:36 MDT 1999},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Dongarra:1994:SMLa}.},
  url             = {http://www.netlib.org/netlib/lapack/lawns/lawn74.ps;
                     http://www.netlib.org/netlib/lapack/lawnspdf/lawn74.pdf},
  acknowledgement = ack-nhfb,
}
@article{vandeGeijn:1994:GCO,
  author          = {R. A. {van de Geijn}},
  title           = {On Global Combine Operations},
  journal         = j-J-PAR-DIST-COMP,
  volume          = {22},
  number          = {2},
  pages           = {324--328},
  month           = aug,
  year            = {1994},
  coden           = {JPDCER},
  doi             = {https://doi.org/10.1006/jpdc.1994.1091},
  issn            = {0743-7315 (print), 1096-0848 (electronic)},
  bibdate         = {Thu Mar 9 09:18:55 MST 2000},
  bibsource       = {http://www.idealibrary.com/servlet/useragent?func=showAllIssues&curIssueID=jpdc;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{vandeGeijn:1991:GCO}.},
  url             = {http://www.idealibrary.com/links/doi/10.1006/jpdc.1994.1091/production;
                     http://www.idealibrary.com/links/doi/10.1006/jpdc.1994.1091/production/pdf},
  acknowledgement = ack-nhfb,
  classification  = {C4230M (Multiprocessor interconnection); C4240P
                     (Parallel programming and algorithm theory); C5220P
                     (Parallel architecture); C5440 (Multiprocessor systems
                     and techniques)},
  corpsource      = {Dept. of Comput. Sci., Texas Univ., Austin, TX, USA},
  keywords        = {algorithms; distributed memory multiple instruction
                     multiple data; distributed memory systems; global
                     combine operations; hybrid strategy; hypercube
                     networks; Intel iPSC/860; multicomputers; parallel},
  treatment       = {P Practical},
}
@article{Bai:1995:TLAb,
  author          = {Z. Bai and D. Day and J. Demmel and J. Dongarra},
  title           = {Templates for Linear Algebra Problems},
  journal         = j-LECT-NOTES-COMP-SCI,
  volume          = {1000},
  pages           = {115--??},
  year            = {1995},
  coden           = {LNCSD9},
  issn            = {0302-9743 (print), 1611-3349 (electronic)},
  bibdate         = {Sat May 11 13:45:32 MDT 1996},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Bai:1995:TLAa}.},
  url             = {http://www.netlib.org/utk/papers/etemplates.ps;
                     http://www.netlib.org/utk/papers/etemplates/paper.html},
  acknowledgement = ack-nhfb,
}
@article{Choi:1995:DPDb,
  author          = {Jaeyoung Choi and Jack J. Dongarra and David W.
                     Walker},
  title           = {The design of a parallel dense linear algebra software
                     library: reduction to {Hessenberg}, tridiagonal, and
                     bidiagonal form},
  journal         = j-NUMER-ALGORITHMS,
  volume          = {10},
  number          = {3--4},
  pages           = {379--399},
  month           = oct,
  year            = {1995},
  coden           = {NUALEG},
  issn            = {1017-1398 (print), 1572-9265 (electronic)},
  mrclass         = {65-04 (65Y10)},
  mrnumber        = {1 355 739},
  bibdate         = {Sat Mar 22 15:39:54 MST 1997},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Choi:1995:DPDa,Choi:1994:DPD}.},
  acknowledgement = ack-nhfb,
  classcodes      = {B0290H (Linear algebra); C7310 (Mathematics
                     computing); C4140 (Linear algebra); C6110B (Software
                     engineering techniques); C6115 (Programming support)},
  corpsource      = {Dept. of Comput. Sci., Tennessee Univ., Knoxville, TN,
                     USA},
  keywords        = {Basic Linear Algebra Communication; bidiagonal; BLACS;
                     computers; computing; dense; distributed memory
                     concurrent; eigenproblems; eigenvalues and
                     eigenfunctions; form; Hessenberg form; LAPACK; linear
                     algebra; linear algebra computations; mathematics;
                     matrices; matrix reduction algorithms; parallel BLAS;
                     parallel dense linear algebra software library;
                     routine; ScaLAPACK; sequencing BLAS; software
                     engineering; software libraries; Subprograms;
                     tridiagonal form},
  treatment       = {A Application; P Practical},
}
@article{Demmel:1995:CSB,
  author          = {James W. Demmel and Inderjit Dhillon and Huan Ren},
  title           = {On the Correctness of Some Bisection-Like Parallel
                     Eigenvalue Algorithms in Floating Point Arithmetic},
  journal         = j-ETNA,
  volume          = {3},
  pages           = {116--149},
  year            = {1995},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Demmel:1994:CPB}.},
}
@article{Heath:1995:CPN,
  author          = {Michael T. Heath and Padma Raghavan},
  title           = {A {Cartesian} Parallel Nested Dissection Algorithm},
  journal         = j-SIAM-J-MAT-ANA-APPL,
  volume          = {16},
  number          = {1},
  pages           = {235--253},
  month           = jan,
  year            = {1995},
  coden           = {SJMAEL},
  issn            = {0895-4798 (print), 1095-7162 (electronic)},
  mrclass         = {65F05 (65F50 65Y05)},
  mrnumber        = {95m:65046},
  mrreviewer      = {Ming Kui Chen},
  bibdate         = {Fri Dec 4 12:14:09 MST 1998},
  bibsource       = {http://epubs.siam.org/sam-bin/dbq/toc/SIMAX/16/1;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Heath:1992:CPN}.},
  url             = {http://epubs.siam.org/sam-bin/dbq/article/23827},
  acknowledgement = ack-nhfb,
}
@InProceedings{Plank:1995:ADC,
  author =       "James S. Plank and Youngbae Kim and Jack J. Dongarra",
  title =        "Algorithm-Based Diskless Checkpointing for
                 Fault-Tolerant Matrix Operations",
  crossref =     "IEEE:1995:DPT",
  pages =        "351--360",
  year =         "1995",
  bibdate =      "Mon Aug 26 07:58:57 1996",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Plank:1994:ABD}.",
  URL =          "http://www.cs.utk.edu/~plank/plank/papers/FTCS25.1995.html;
                 http://www.netlib.org/utk/papers/fault.ps;
                 http://www.netlib.org/utk/people/JackDongarra/pdf/fault.pdf",
  abstract =     "This paper is an exploration of diskless
                 check-pointing for distributed scientific computations.
                 With the widespread use of the `Network Of Workstation'
                 (NOW) platform for distributed computing, long-running
                 scientific computations need to tolerate the changing
                 and often faulty nature of NOW environments. We present
                 high-performance implementations of several algorithms
                 for distributed scientific computing, including
                 Cholesky factorization, LU factorization, QR
                 factorization, and Preconditioned Conjugate Gradient.
                 These implementations are able to run on PVM networks
                 of at least N processors, and can complete with low
                 overhead as long as any N processors remain functional.
                 We discuss the details of how the algorithms are tuned
                 for fault-tolerance, and present the performance
                 results on a PVM network of SUN workstations, and on
                 the IBM SP2.",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ of Tennessee",
  affiliationaddress = "TN, USA",
  classcodes =   "C6150N (Distributed systems software); C6110B
                 (Software engineering techniques); C4140 (Linear
                 algebra); C7300 (Natural sciences computing); C4130
                 (Interpolation and function approximation)",
  classification = "722.2; 722.4; 723.1",
  conference =   "Proceedings of the 25th International Symposium on
                 Fault-Tolerant Computing",
  conflocation = "Pasadena, CA, USA; 27-30 June 1995",
  conftitle =    "Twenty-Fifth International Symposium on Fault-Tolerant
                 Computing. Digest of Papers",
  corpsource =   "Dept. of Comput. Sci., Tennessee Univ., TN, USA",
  journalabr =   "Dig Pap Int Symp Fault Tolerant Comput",
  keywords =     "algebra; Algorithm based diskless checkpointing;
                 algorithm-based diskless checkpointing; Algorithms;
                 Cholesky; Cholesky factorization; computations;
                 Computer networks; Computer workstations; conjugate
                 gradient methods; Distributed computer systems;
                 Distributed scientific computations; distributed
                 scientific computations; factorization; fault tolerant;
                 Fault tolerant computer systems; Fault tolerant matrix
                 operations; fault-tolerance; high-performance
                 implementations; IBM SP2; local area networks;
                 long-running scientific; low overhead; LU
                 factorization; matrix; matrix operations; natural
                 sciences computing; Network of workstation (NOW)
                 platform; Parallel processing systems; performance;
                 preconditioned conjugate gradient; Preconditioned
                 conjugate gradient; processors; PVM networks; QR
                 factorization; software fault; subroutines; SUN;
                 tolerance; workstation network platform; workstations",
  meetingaddress = "Pasadena, CA, USA",
  meetingdate =  "Jun 27--30 1995",
  meetingdate2 = "06/27--30/95",
  sponsor =      "IEEE",
  sponsororg =   "IEEE Comput. Soc. Tech. Committee on Fault-Tolerant
                 Comput.; LAAS-CNRS, France; Univ. Illinois at
                 Urbana-Champaign; Univ. California at Los Angeles; Jet
                 Propulsion Lab.; IFIP WG 10.4",
  treatment =    "T Theoretical or Mathematical",
}
@article{Raghavan:1995:DSG,
  author          = {Padma Raghavan},
  title           = {Distributed sparse {Gaussian} elimination and
                     orthogonal factorization},
  journal         = j-SIAM-J-SCI-COMP,
  volume          = {16},
  number          = {6},
  pages           = {1462--1477},
  month           = nov,
  year            = {1995},
  coden           = {SJOCE3},
  issn            = {1064-8275 (print), 1095-7197 (electronic)},
  issn-l          = {1064-8275},
  mrclass         = {65F50 (65F05 65F20)},
  mrnumber        = {96g:65046},
  mrreviewer      = {Zahari Zlatev},
  bibdate         = {Tue Apr 29 18:25:50 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Raghavan:1993:DSG}.},
  acknowledgement = ack-nhfb,
}
@Article{Barrett:1996:ABI,
  author =       "R. Barrett and M. Berry and J. Dongarra and V.
                 Eijkhout and C. Romine",
  title =        "Algorithmic bombardment for the iterative solution of
                 linear systems: a poly-iterative approach",
  journal =      j-J-COMPUT-APPL-MATH,
  volume =       "74",
  number =       "1--2",
  pages =        "91--109",
  day =          "5",
  month =        "????",
  year =         "1996",
  CODEN =        "JCAMDI",
  ISSN =         "0377-0427 (print), 1879-1778 (electronic)",
  MRclass =      "65F10 (65N22 65Y05)",
  MRnumber =     "97j:65052",
  bibdate =      "Sat Mar 22 15:39:54 MST 1997",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Barrett:1994:ABI}.",
  URL =          "http://www.netlib.org/utk/people/JackDongarra/PAPERS/bombard.ps;
                 http://www.netlib.org/utk/people/JackDongarra/pdf/bombard.pdf",
  acknowledgement = ack-nhfb,
  classcodes =   "B0290H (Linear algebra); B0290F (Interpolation and
                 function approximation); C4140 (Linear algebra); C4130
                 (Interpolation and function approximation); C4240P
                 (Parallel programming and algorithm theory)",
  conflocation = "Austin, TX, USA; April 1995",
  conftitle =    "TICAM Symposium. Texas Institute for Computational and
                 Applied Mathematics",
  corpsource =   "Distributed Comput. Group, Los Alamos Nat. Lab., NM,
                 USA",
  keywords =     "algorithmic bombardment; convergence; cost;
                 environment; global communications; indefinite;
                 iterative methods; iterative solution; linear systems;
                 matrix; matrix algebra; matrix properties; nonsymmetric
                 matrix; parallel algorithms; parallel environment;
                 poly-iterative approach; sequential computing",
  treatment =    "T Theoretical or Mathematical",
}
@inproceedings{Blackford:1996:PEDb,
  author          = {L. S. Blackford and A. Cleary and J. Demmel and I.
                     Dhillon and J. Dongarra and S. Hammarling and A.
                     Petitet and H. Ren and K. Stanley and R. C. Whaley},
  title           = {Practical experience in the dangers of heterogeneous
                     computing},
  crossref        = {Wasniewski:1996:APC},
  pages           = {57--64},
  year            = {1996},
  bibdate         = {Tue Feb 26 08:49:09 2002},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Blackford:1996:PEDa}.},
  url             = {http://www.netlib.org/utk/papers/practical-hetro/paper.html;
                     http://www.netlib.org/utk/papers/practical-hetro/paper.ps;
                     http://www.netlib.org/utk/people/JackDongarra/pdf/prac-het.pdf},
  acknowledgement = ack-nhfb,
  classcodes      = {C7310 (Mathematics computing); C6110B (Software
                     engineering techniques); C6110P (Parallel programming);
                     C6150N (Distributed systems software); C6115
                     (Programming support)},
  conflocation    = {Lyngby, Denmark; 18-21 Aug. 1996},
  conftitle       = {Applied Parallel Computing. Industrial Computation and
                     Optimization. Third International Workshop, PARA'96.
                     Proceedings},
  corpsource      = {Tennessee Univ., Knoxville, TN, USA},
  keywords        = {distributed memory systems; floating point arithmetic;
                     heterogeneous computing; libraries; mathematics
                     computing; numerical library software; parallel
                     algorithms; ScaLAPACK; software; software portability;
                     software reliability; software robustness},
  treatment       = {A Application; G General Review},
}
@inproceedings{Blackford:1996:SPL,
  author          = {Laura Susan Blackford and J. Choi and A. Cleary and A.
                     Petitet and R. C. Whaley and J. Demmel and I. Dhillon
                     and K. Stanley and J. Dongarra and S. Hammarling and G.
                     Henry and D. Walker},
  title           = {{ScaLAPACK}: {A} Portable Linear Algebra Library for
                     Distributed Memory Computers --- Design Issues and
                     Performance},
  crossref        = {ACM:1996:SCP},
  pages           = {??--??},
  year            = {1996},
  bibdate         = {Mon Mar 23 12:31:18 1998},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Choi:1995:SPL}.},
  url             = {http://www.netlib.org/utk/papers/sc96-scalapack/paper.html;
                     http://www.netlib.org/utk/papers/sc96-scalapack/paper.ps;
                     http://www.netlib.org/utk/people/JackDongarra/pdf/scala96.pdf;
                     http://www.supercomp.org/sc96/proceedings/SC96PROC/DONGARRA/INDEX.HTM},
  acknowledgement = ack-nhfb,
}
@Article{Choi:1996:DIS,
  author =       "Jaeyoung Choi and J. J. Dongarra and L. S. Ostrouchov
                 and A. P. Petitet and D. W. Walker and R. C.
                 Whaley",
  title =        "Design and implementation of the {ScaLAPACK LU}, {$ Q
                 R $}, and {Cholesky} factorization routines",
  journal =      j-SCI-PROG,
  volume =       "5",
  number =       "3",
  pages =        "173--184",
  month =        "Fall",
  year =         "1996",
  CODEN =        "SCIPEV",
  ISSN =         "1058-9244 (print), 1875-919X (electronic)",
  bibdate =      "Sat Mar 22 15:39:54 MST 1997",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Choi:1994:DIS}.",
  URL =          "http://www.netlib.org/netlib/lapack/lawns/lawn80.ps;
                 http://www.netlib.org/netlib/lapack/lawnspdf/lawn80.pdf;
                 http://www.netlib.org/utk/papers/factor/ftcover.html",
  acknowledgement = ack-nhfb,
  classcodes =   "C4140 (Linear algebra); C6110B (Software engineering
                 techniques); C6115 (Programming support); C5440
                 (Multiprocessing systems); C6150N (Distributed systems
                 software); C6110P (Parallel programming)",
  corpsource =   "Dept. of Comput. Sci., Tennessee Univ., Knoxville, TN,
                 USA",
  keywords =     "BLACS; BLAS; block; communication; cyclic data
                 distribution; de facto standard kernels; dense linear
                 equation system; distributed memory systems;
                 engineering; factorization routine; Intel; Intel
                 Touchstone Delta; Intel Paragon System; iPSC/860;
                 linear algebra; matrix; message passing; operations;
                 parallel implementations; parallel machines; parallel
                 programming; parallelized sequential LAPACK; PBLAS;
                 performance; performance evaluation; scalability;
                 ScaLAPACK Cholesky factorization routine; ScaLAPACK
                 library; ScaLAPACK LU factorization routine; ScaLAPACK
                 QR; software; software libraries; software packages;
                 vector operations",
  treatment =    "P Practical",
}
@inproceedings{Choi:1996:PSP,
  author          = {Jaeyoung Choi and J. Dongarra and S. Ostrouchov and A.
                     Petitet and D. Walker and R. C. Whaley},
  title           = {A proposal for a set of {Parallel Basic Linear Algebra
                     Subprograms}},
  crossref        = {Dongarra:1996:APC},
  pages           = {107--114},
  year            = {1996},
  bibdate         = {Sat Mar 22 15:39:54 MST 1997},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Choi:1995:PSP}.},
  acknowledgement = ack-nhfb,
  classcodes      = {C7310 (Mathematics computing); C6110P (Parallel
                     programming); C4140 (Linear algebra)},
  conflocation    = {Lyngby, Denmark; 21-24 Aug. 1995},
  conftitle       = {Applied Parallel Computing. Computations in Physics,
                     Chemistry and Engineering Science},
  corpsource      = {Sch. of Comput., Soongsil Univ., Seoul, South Korea},
  keywords        = {basic linear algebra; distributed memory; linear
                     algebra; linear algebra subprograms; parallel; parallel
                     programming; PBLAS; software libraries},
  treatment       = {T Theoretical or Mathematical},
}
@inproceedings{Dongarra:1996:PFI,
  author          = {J. J. Dongarra and J. {Du Croz} and S. Hammarling and
                     J. Wa{\'s}niewski and A. Zemla},
  title           = {A proposal for a {Fortran 90} interface for {LAPACK}},
  crossref        = {Dongarra:1996:APC},
  pages           = {158--165},
  year            = {1996},
  bibdate         = {Sat Mar 22 15:39:54 MST 1997},
  bibsource       = {http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Dongarra:1995:PFI}.},
  acknowledgement = ack-nhfb,
  classcodes      = {C7310 (Mathematics computing); C4140 (Linear algebra);
                     C6140D (High level languages); C6180 (User
                     interfaces)},
  conflocation    = {Lyngby, Denmark; 21-24 Aug. 1995},
  conftitle       = {Applied Parallel Computing. Computations in Physics,
                     Chemistry and Engineering Science},
  corpsource      = {Dept. of Comput. Sci., Tennessee Univ., Knoxville, TN,
                     USA},
  keywords        = {FORTRAN; Fortran 90 interface; LAPACK; LAPACK code;
                     linear algebra; mathematics computing; packages;
                     software; user interfaces; user-interface},
  treatment       = {P Practical; T Theoretical or Mathematical},
}
@article{Henry:1996:PAU,
  author          = {Greg Henry and Robert van de Geijn},
  title           = {Parallelizing the {$ Q R $} Algorithm for the
                     Unsymmetric Algebraic Eigenvalue Problem: Myths and
                     Reality},
  journal         = j-SIAM-J-SCI-COMP,
  volume          = {17},
  number          = {4},
  pages           = {870--883},
  month           = jul,
  year            = {1996},
  coden           = {SJOCE3},
  issn            = {1064-8275 (print), 1095-7197 (electronic)},
  issn-l          = {1064-8275},
  mrclass         = {65F15 (15A18)},
  mrnumber        = {97b:65044},
  bibdate         = {Tue Apr 29 18:25:50 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Henry:1994:PQA}.},
  acknowledgement = ack-nhfb,
}
@article{Kaagstrom:1996:CES,
  author          = {Bo K{\aa}gstr{\"o}m and Peter Poromaa},
  title           = {Computing eigenspaces with specified eigenvalues of a
                     regular matrix pair {$ ({A}, {B}) $} and condition
                     estimation: theory, algorithms and software},
  journal         = j-NUMER-ALGORITHMS,
  volume          = {12},
  number          = {3--4},
  pages           = {369--407},
  month           = jul,
  year            = {1996},
  coden           = {NUALEG},
  issn            = {1017-1398 (print), 1572-9265 (electronic)},
  mrclass         = {65Fxx},
  mrnumber        = {1 402 856},
  bibdate         = {Tue Apr 29 08:56:05 MDT 1997},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Kaagstrom:1994:CES}.},
  acknowledgement = ack-nhfb,
  classification  = {B0290H (Linear algebra); C4140 (Linear algebra)},
  corpsource      = {Dept. of Comput. Sci., Umea Univ., Sweden},
  keywords        = {condition estimation; deflating sub-spaces;
                     eigenspaces; eigenvalues; eigenvalues and
                     eigenfunctions; error bounds; matrix algebra; numerical
                     stability; reciprocal values; regular matrix pair;
                     specified eigenvalues},
  pubcountry      = {Switzerland},
  treatment       = {T Theoretical or Mathematical},
}
@article{Kaagstrom:1996:LSA,
  author          = {Bo K{\aa}gstr{\"o}m and Peter Poromaa},
  title           = {{LAPACK-style} algorithms and software for solving the
                     generalized {Sylvester} equation and estimating the
                     separation between regular matrix pairs},
  journal         = j-TOMS,
  volume          = {22},
  number          = {1},
  pages           = {78--103},
  month           = mar,
  year            = {1996},
  coden           = {ACMSCU},
  doi             = {https://doi.org/10.1145/225545.225552},
  issn            = {0098-3500 (print), 1557-7295 (electronic)},
  mrclass         = {65-04 (65F30)},
  mrnumber        = {1 383 186},
  bibdate         = {Sat Aug 31 16:07:02 MDT 1996},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Kaagstrom:1994:LSA}.},
  url             = {http://www.acm.org/pubs/citations/journals/toms/1996-22-1/p78-kagstrom/},
  abstract        = {Robust and fast software to solve the generalized
                     Sylvester equation ({$ A R - L B = C, D R - L E = F $})
                     for unknowns {$R$} and {$L$} is presented. This special
                     linear system of equations, and its transpose, arises
                     in computing error bounds for computed eigenvalues and
                     eigenspaces of the generalized eigenvalue problem {$ S
                     - \lambda T $}, in computing deflating subspaces of the
                     same problem, and in computing certain decompositions
                     of transfer matrices arising in control theory. Our
                     contributions are twofold. First, we reorganize the
                     standard algorithm for this problem to use Level 3 BLAS
                     operations, like matrix multiplication, in its inner
                     loop. This speeds up the algorithm by a factor of 9 on
                     an IBM RS6000. Second, we develop and compare several
                     condition estimation algorithms, which inexpensively
                     but accurately estimate the sensitivity of the solution
                     of this linear system.},
  acknowledgement = ack-nhfb,
  keywords        = {algorithms},
  subject         = {{\bf G.4}: Mathematics of Computing, MATHEMATICAL
                     SOFTWARE, Algorithm analysis. {\bf F.2.1}: Theory of
                     Computation, ANALYSIS OF ALGORITHMS AND PROBLEM
                     COMPLEXITY, Numerical Algorithms and Problems,
                     Computations on matrices. {\bf G.1.3}: Mathematics of
                     Computing, NUMERICAL ANALYSIS, Numerical Linear
                     Algebra, Linear systems (direct and iterative methods).
                     {\bf G.4}: Mathematics of Computing, MATHEMATICAL
                     SOFTWARE, Reliability and robustness. {\bf G.1.3}:
                     Mathematics of Computing, NUMERICAL ANALYSIS, Numerical
                     Linear Algebra, Conditioning. {\bf G.1.3}: Mathematics
                     of Computing, NUMERICAL ANALYSIS, Numerical Linear
                     Algebra, Eigenvalues. {\bf G.4}: Mathematics of
                     Computing, MATHEMATICAL SOFTWARE, Efficiency. {\bf
                     F.2.1}: Theory of Computation, ANALYSIS OF ALGORITHMS
                     AND PROBLEM COMPLEXITY, Numerical Algorithms and
                     Problems, Computations on matrices. {\bf G.1.3}:
                     Mathematics of Computing, NUMERICAL ANALYSIS, Numerical
                     Linear Algebra, Matrix inversion.},
}
@article{Lehoucq:1996:CEU,
  author          = {R. B. Lehoucq},
  title           = {The Computation of Elementary Unitary Matrices},
  journal         = j-TOMS,
  volume          = {22},
  number          = {4},
  pages           = {393--400},
  month           = dec,
  year            = {1996},
  coden           = {ACMSCU},
  issn            = {0098-3500 (print), 1557-7295 (electronic)},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/lawn.bib},
  note            = {See original LAPACK Working note in
                     \cite{Lehoucq:1995:CEU}.},
  abstract        = {The construction of elementary unitary matrices that
                     transform a complex vector to a multiple of $ e_1 $,
                     the first column of the identity matrix, is studied. We
                     present four variants and their software
                     implementation, including a discussion on the {LAPACK}
                     subroutine {CLARFG}. Comparisons are also given.},
  accepted        = {June 1996},
  acknowledgement = ack-rfb,
  keywords        = {algorithms},
  subject         = {{\bf F.2}: Theory of Computation, ANALYSIS OF
                     ALGORITHMS AND PROBLEM COMPLEXITY, Numerical Algorithms
                     and Problems, Computations on matrices. {\bf G.1.3}:
                     Mathematics of Computing, NUMERICAL ANALYSIS, Numerical
                     Linear Algebra. {\bf G.4}: Mathematics of Computing,
                     MATHEMATICAL SOFTWARE, Algorithm analysis.},
}
@article{Bai:1997:SDN,
  author          = {Z. Bai and J. Demmel and J. Dongarra and A. Petitet
                     and H. Robinson and K. Stanley},
  title           = {The Spectral Decomposition of Nonsymmetric Matrices on
                     Distributed Memory Parallel Computers},
  journal         = j-SIAM-J-SCI-COMP,
  volume          = {18},
  number          = {5},
  pages           = {1446--1461},
  month           = sep,
  year            = {1997},
  coden           = {SJOCE3},
  issn            = {1064-8275 (print), 1095-7197 (electronic)},
  issn-l          = {1064-8275},
  mrclass         = {65F05 (65F30 65Y05)},
  mrnumber        = {98d:65027},
  bibdate         = {Tue Feb 26 10:04:07 2002},
  bibsource       = {http://epubs.siam.org/sam-bin/dbq/toc/SISC/18/5;
                     http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                     http://www.math.utah.edu/pub/tex/bib/lawn.bib;
                     http://www.math.utah.edu/pub/tex/bib/siamjscicomput.bib},
  note            = {See original LAPACK Working note in
                     \cite{Bai:1995:SDN}.},
  url             = {http://epubs.siam.org/sam-bin/dbq/article/28136;
                     http://www.netlib.org/utk/papers/sign/sign.html;
                     http://www.netlib.org/utk/papers/sign/sign.ps;
                     http://www.netlib.org/utk/people/JackDongarra/pdf/sign.pdf},
  acknowledgement = ack-nhfb,
}
@Article{Blackford:1997:PEN,
  author =       "L. S. Blackford and A. Cleary and A. Petitet and R. C.
                 Whaley and J. Demmel and I. Dhillon and H. Ren and K.
                 Stanley and J. Dongarra and S. Hammarling",
  title =        "Practical Experience in the Numerical Dangers of
                 Heterogeneous Computing",
  journal =      j-TOMS,
  volume =       "23",
  number =       "2",
  pages =        "133--147",
  month =        jun,
  year =         "1997",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/264029.264030",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  bibdate =      "Tue Feb 26 10:10:44 2002",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Blackford:1996:PEDa} and
                 \cite{Blackford:1996:PEDb}.",
  URL =          "http://www.acm.org/pubs/citations/journals/toms/1997-23-2/p133-blackford/",
  abstract =     "Special challenges exist in writing reliable numerical
                 library software for heterogeneous computing
                 environments. Although a lot of software for
                 distributed-memory parallel computers has been written,
                 porting this software to a network of workstations
                 requires careful consideration. The symptoms of
                 heterogeneous computing failures can range from
                 erroneous results without warning to deadlock. Some of
                 the problems are straightforward to solve, but for
                 others the solutions are not so obvious, or incur an
                 unacceptable overhead. Making software robust on
                 heterogeneous systems often requires additional
                 communication. We describe and illustrate the problems
                 encountered during the development of ScaLAPACK and the
                 NAG Numerical PVM Library. Where possible, we suggest
                 ways to avoid potential pitfalls, or if that is not
                 possible, we recommend that the software not be used on
                 heterogeneous networks.",
  acknowledgement = ack-rfb # " and " # ack-kr,
  keywords =     "distributed-memory systems, floating-point arithmetic,
                 heterogeneous processor networks, message passing,
                 numerical software, reliability",
  subject =      "{\bf D.1.3} Software, PROGRAMMING TECHNIQUES,
                 Concurrent Programming, Distributed programming. {\bf
                 G.1.0} Mathematics of Computing, NUMERICAL ANALYSIS,
                 General, Computer arithmetic. {\bf G.1.0} Mathematics
                 of Computing, NUMERICAL ANALYSIS, General, Parallel
                 algorithms.",
}
@Article{Dongarra:1997:KCPb,
  author =       "Jack J. Dongarra and Sven Hammarling and David W.
                 Walker",
  title =        "Key concepts for parallel out-of-core {$ L U $}
                 factorization",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "23",
  number =       "1--2",
  pages =        "49--70",
  day =          "16",
  month =        apr,
  year =         "1997",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191",
  bibdate =      "Tue Oct 21 15:14:48 MDT 1997",
  bibsource =    "Compendex database;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Dongarra:1996:KCP}.",
  acknowledgement = ack-nhfb,
  affiliation =  "Univ of Tennessee",
  affiliationaddress = "Knoxville, TN, USA",
  classification = "714.2; 722.1; 722.4; 723; 723.1; 921",
  conference =   "Proceedings of the 1996 International Workshop on
                 Environments and Tools for Parallel Scientific
                 Computing",
  journalabr =   "Parallel Comput",
  keywords =     "Algorithms; Computer architecture; Input output
                 programs; lu factorization; Microprocessor chips;
                 Parallel processing systems; Percolation (computer
                 storage); Storage allocation (computer)",
  meetingaddress = "Faverges de la Tour, Fr",
  meetingdate =  "Aug 22--23 1996",
  meetingdate2 = "08/22--23/96",
}
@Article{Higham:1997:IRL,
  author =       "Nicholas J. Higham",
  title =        "Iterative refinement for linear systems and {LAPACK}",
  journal =      j-IMA-J-NUMER-ANAL,
  volume =       "17",
  number =       "4",
  pages =        "495--509",
  month =        oct,
  year =         "1997",
  CODEN =        "IJNADH",
  ISSN =         "0272-4979 (print), 1464-3642 (electronic)",
  MRclass =      "65F30",
  MRnumber =     "98e:65036",
  bibdate =      "Sat Dec 23 17:06:35 MST 2000",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib;
                 http://www3.oup.co.uk/imanum/hdb/Volume_17/Issue_04/;
                 MathSciNet database",
  note =         "Preprint published as Numerical Analysis Report 277,
                 Manchester Centre for Computational Mathematics,
                 Manchester, England, and as LAPACK Working Note 104.
                 See original LAPACK Working note in
                 \cite{Higham:1995:IRL}.",
  URL =          "http://www3.oup.co.uk/imanum/hdb/Volume_17/Issue_04/170495.sgm.abs.html",
  acknowledgement = ack-nhfb,
}
@Article{Higham:1997:SDP,
  author =       "Nicholas J. Higham",
  title =        "Stability of the Diagonal Pivoting Method with Partial
                 Pivoting",
  journal =      j-SIAM-J-MAT-ANA-APPL,
  volume =       "18",
  number =       "1",
  pages =        "52--65",
  month =        jan,
  year =         "1997",
  CODEN =        "SJMAEL",
  ISSN =         "0895-4798 (print), 1095-7162 (electronic)",
  bibdate =      "Sun Mar 2 11:16:54 GMT 1997",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/h/higham-nicholas-john.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Higham:1995:SDP}.",
  acknowledgement = ack-njh,
}
@Article{Li:1997:RPB,
  author =       "Ren-Cang Li",
  title =        "Relative perturbation bounds for the unitary polar
                 factor",
  journal =      j-BIT-NUM-MATH,
  volume =       "37",
  number =       "1",
  pages =        "67--75",
  month =        mar,
  year =         "1997",
  CODEN =        "BITTEL, NBITAB",
  ISSN =         "0006-3835 (print), 1572-9125 (electronic)",
  MRclass =      "15A18 (15A23 65F35)",
  MRnumber =     "97k:15026",
  MRreviewer =   "Roy Mathias",
  bibdate =      "Fri Nov 13 07:00:34 MST 1998",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Li:1994:RPB}.",
  URL =          "http://www.mai.liu.se/BIT/contents/bit37.html",
  acknowledgement = ack-nhfb,
}
@Article{vandeGeijn:1997:SSU,
  author =       "R. A. van de Geijn and J. Watts",
  title =        "{SUMMA}: scalable universal matrix multiplication
                 algorithm",
  journal =      j-CPE,
  volume =       "9",
  number =       "4",
  pages =        "255--274",
  month =        apr,
  year =         "1997",
  CODEN =        "CPEXEI",
  ISSN =         "1040-3108",
  bibdate =      "Tue Sep 7 06:06:30 MDT 1999",
  bibsource =    "http://www.interscience.wiley.com/jpages/1040-3108/;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  note =         "See original LAPACK Working note in
                 \cite{vandeGeijn:1995:SSU}.",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=13861;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=13861&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
}
@Article{Choi:1998:NPM,
  author =       "Jaeyoung Choi",
  title =        "A new parallel matrix multiplication algorithm on
                 distributed-memory concurrent computers",
  journal =      j-CPE,
  volume =       "10",
  number =       "8",
  pages =        "655--670",
  month =        jul,
  year =         "1998",
  CODEN =        "CPEXEI",
  ISSN =         "1040-3108",
  bibdate =      "Tue Sep 7 06:06:42 MDT 1999",
  bibsource =    "http://www.interscience.wiley.com/jpages/1040-3108/;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  note =         "See original LAPACK Working note in
                 \cite{Choi:1997:NPM}.",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract?ID=10008698;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=10008698&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
}
@InProceedings{Desprez:1998:SBA,
  author =       "F. Desprez and J. Dongarra and A. Petitet and C.
                 Randriamaro",
  title =        "Scheduling Block-Cyclic Array Redistribution",
  crossref =     "DHollander:1998:PCF",
  pages =        "227--234",
  year =         "1998",
  bibdate =      "Thu Sep 16 09:48:36 MDT 1999",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Desprez:1997:SBC} and \cite{Desprez:1998:SBC}.",
  acknowledgement = ack-nhfb,
}
@Article{Desprez:1998:SBC,
  author =       "F. Desprez and J. Dongarra and A. Petitet and C.
                 Randriamaro and Y. Robert",
  title =        "Scheduling Block-Cyclic Array Redistribution",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "9",
  number =       "2",
  pages =        "192--??",
  month =        feb,
  year =         "1998",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  bibdate =      "Fri Nov 6 12:31:15 MST 1998",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Desprez:1997:SBC}.",
  URL =          "http://dlib.computer.org/td/books/td1998/pdf/l0192.pdf;
                 http://www.computer.org/tpds/td1998/l0192abs.htm",
  acknowledgement = ack-nhfb,
}
@Article{Dongarra:1998:HPL,
  author =       "J. Dongarra and J. Wa{\'s}niewski",
  title =        "High Performance Linear Algebra Package {LAPACK90}",
  journal =      j-LECT-NOTES-COMP-SCI,
  volume =       "1388",
  pages =        "387--391",
  year =         "1998",
  CODEN =        "LNCSD9",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  bibdate =      "Sat Oct 10 14:40:24 MDT 1998",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib;
                 http://www.math.utah.edu/pub/tex/bib/lncs1998a.bib",
  note =         "See original LAPACK Working note in
                 \cite{Wasniewski:1998:HPL}.",
  acknowledgement = ack-nhfb,
}
@Article{Kaagstrom:1998:GBL,
  author =       "Bo K{\aa}gstr{\"o}m and Per Ling and Charles {Van
                 Loan}",
  title =        "{GEMM-based} level 3 {BLAS}: high-performance model
                 implementations and performance evaluation benchmark",
  journal =      j-TOMS,
  volume =       "24",
  number =       "3",
  pages =        "268--302",
  month =        sep,
  year =         "1998",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/292395.292412",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  bibdate =      "Mon Feb 8 17:51:43 MST 1999",
  bibsource =    "http://www.acm.org/pubs/contents/journals/toms/1998-24/;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Kaagstrom:1995:GBLa}.",
  URL =          "http://www.acm.org:80/pubs/citations/journals/toms/1998-24-3/p268-kagstrom/",
  abstract =     "The level 3 Basic Linear Algebra Subprograms (BLAS)
                 are designed to perform various matrix multiply and
                 triangular system solving computations. Due to the
                 complex hardware organization of advanced computer
                 architectures the development of optimal level 3 BLAS
                 code is costly and time consuming. However, it is
                 possible to develop a portable and high-performance
                 level 3 BLAS library mainly relying on a highly
                 optimized GEMM, the routine for the general matrix
                 multiply and add operation. With suitable partitioning,
                 all the other level 3 BLAS can be defined in terms of
                 GEMM and a small amount of level 1 and level 2
                 computations. Our contribution is twofold. First, the
                 model implementations in Fortran 77 of the GEMM-based
                 level 3 BLAS are structured to reduce effectively data
                 traffic in a memory hierarchy. Second, the GEMM-based
                 level 3 BLAS performance evaluation benchmark is a tool
                 for evaluating and comparing different implementations
                 of the level 3 BLAS with the GEMM-based model
                 implementations.",
  acknowledgement = ack-nhfb,
  keywords =     "algorithms; measurement; performance",
  subject =      "{\bf G.1.3} Mathematics of Computing, NUMERICAL
                 ANALYSIS, Numerical Linear Algebra, Linear systems
                 (direct and iterative methods). {\bf D.3.2} Software,
                 PROGRAMMING LANGUAGES, Language Classifications,
                 FORTRAN 77. {\bf F.2.1} Theory of Computation, ANALYSIS
                 OF ALGORITHMS AND PROBLEM COMPLEXITY, Numerical
                 Algorithms and Problems, Computations on matrices. {\bf
                 G.4} Mathematics of Computing, MATHEMATICAL SOFTWARE,
                 Certification and testing. {\bf G.4} Mathematics of
                 Computing, MATHEMATICAL SOFTWARE, Efficiency. {\bf G.4}
                 Mathematics of Computing, MATHEMATICAL SOFTWARE,
                 Portability**. {\bf G.4} Mathematics of Computing,
                 MATHEMATICAL SOFTWARE, Reliability and robustness. {\bf
                 G.4} Mathematics of Computing, MATHEMATICAL SOFTWARE,
                 Verification**.",
}
@Article{Li:1998:RPT,
  author =       "Ren-Cang Li",
  title =        "Relative Perturbation Theory: {I}. Eigenvalue and
                 Singular Value Variations",
  journal =      j-SIAM-J-MAT-ANA-APPL,
  volume =       "19",
  number =       "4",
  pages =        "956--982",
  month =        oct,
  year =         "1998",
  CODEN =        "SJMAEL",
  ISSN =         "0895-4798 (print), 1095-7162 (electronic)",
  bibdate =      "Fri Dec 4 12:14:09 MST 1998",
  bibsource =    "http://epubs.siam.org/sam-bin/dbq/toc/SIMAX/19/4;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Li:1994:RPTa}.",
  URL =          "http://epubs.siam.org/sam-bin/dbq/article/29849",
  acknowledgement = ack-nhfb,
}
@InProceedings{Whaley:1998:ATL,
  author =       "R. Clint Whaley and Jack J. Dongarra",
  title =        "{Automatically Tuned Linear Algebra Software}
                 ({ATLAS})",
  crossref =     "ACM:1998:SHP",
  year =         "1998",
  bibdate =      "Wed Mar 06 06:37:44 2002",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Best Paper Award for Systems. See original LAPACK
                 Working note in \cite{Whaley:1997:ATL}.",
  URL =          "http://www.netlib.org/utk/people/JackDongarra/PAPERS/atlas-sc98.ps;
                 http://www.supercomp.org/sc98/TechPapers/sc98_FullAbstracts/Whaley814/INDEX.HTM",
  acknowledgement = ack-nhfb,
}
@Article{Arbenz:1999:CPSc,
  author =       "P. Arbenz and A. Cleary and J. Dongarra and M.
                 Hegland",
  title =        "A Comparison of Parallel Solvers for Diagonally
                 Dominant and General Narrow-Banded Linear Systems",
  journal =      j-PARALLEL-DIST-COMP-PRACT,
  volume =       "2",
  number =       "4",
  pages =        "??--??",
  month =        "????",
  year =         "1999",
  CODEN =        "????",
  ISSN =         "1097-2803",
  bibdate =      "Fri Dec 19 08:14:14 MST 2003",
  bibsource =    "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no4.html;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Arbenz:1999:CPSa}.",
  URL =          "http://www.cs.okstate.edu/~pdcp/vols/vol02/vol02no4abs.html#arbenz",
  acknowledgement = ack-nhfb,
}
@Article{Demmel:1999:APS,
  author =       "James W. Demmel and John R. Gilbert and Xiaoye S. Li",
  title =        "An Asynchronous Parallel Supernodal Algorithm for
                 Sparse {Gaussian} Elimination",
  journal =      j-SIAM-J-MAT-ANA-APPL,
  volume =       "20",
  number =       "4",
  pages =        "915--952",
  month =        oct,
  year =         "1999",
  CODEN =        "SJMAEL",
  ISSN =         "0895-4798 (print), 1095-7162 (electronic)",
  bibdate =      "Sat Jan 22 14:39:14 MST 2000",
  bibsource =    "http://epubs.siam.org/sam-bin/dbq/toc/SIMAX/20/4;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Demmel:1997:APS}.",
  URL =          "http://epubs.siam.org/sam-bin/dbq/article/31768",
  acknowledgement = ack-nhfb,
}
@Article{Demmel:1999:CSV,
  author =       "James Demmel and others",
  title =        "Computing the singular value decomposition with high
                 relative accuracy",
  journal =      j-LINEAR-ALGEBRA-APPL,
  volume =       "299",
  number =       "1--3",
  pages =        "21--80",
  day =          "15",
  month =        sep,
  year =         "1999",
  CODEN =        "LAAPAW",
  ISSN =         "0024-3795 (print), 1873-1856 (electronic)",
  bibdate =      "Wed Nov 01 08:18:32 2000",
  bibsource =    "http://www.elsevier.com/locate/laa;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Demmel:1997:CSV}.",
  URL =          "http://www.elsevier.nl/gej-ng/10/30/19/112/21/22/abstract.html;
                 http://www.elsevier.nl/gej-ng/10/30/19/112/21/22/article.pdf",
  acknowledgement = ack-nhfb,
}
@Article{Demmel:1999:SAS,
  author =       "James W. Demmel and Stanley C. Eisenstat and John R.
                 Gilbert and Xiaoye S. Li and Joseph W. H. Liu",
  title =        "A Supernodal Approach to Sparse Partial Pivoting",
  journal =      j-SIAM-J-MAT-ANA-APPL,
  volume =       "20",
  number =       "3",
  pages =        "720--755",
  month =        jul,
  year =         "1999",
  CODEN =        "SJMAEL",
  ISSN =         "0895-4798 (print), 1095-7162 (electronic)",
  bibdate =      "Sat Jan 22 14:39:12 MST 2000",
  bibsource =    "http://epubs.siam.org/sam-bin/dbq/toc/SIMAX/20/3;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Demmel:1995:SAS}.",
  URL =          "http://epubs.siam.org/sam-bin/dbq/article/29176",
  acknowledgement = ack-nhfb,
}
@Article{Li:1999:RPT,
  author =       "Ren-Cang Li",
  title =        "Relative Perturbation Theory: {II}. Eigenspace and
                 Singular Subspace Variations",
  journal =      j-SIAM-J-MAT-ANA-APPL,
  volume =       "20",
  number =       "2",
  pages =        "471--492",
  month =        apr,
  year =         "1999",
  CODEN =        "SJMAEL",
  ISSN =         "0895-4798 (print), 1095-7162 (electronic)",
  bibdate =      "Fri Dec 4 12:14:09 MST 1998",
  bibsource =    "http://epubs.siam.org/sam-bin/dbq/toc/SIMAX/20/2;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Li:1994:RPTb}.",
  URL =          "http://epubs.siam.org/sam-bin/dbq/article/29850",
  acknowledgement = ack-nhfb,
}
@Article{Petitet:1999:ARM,
  author =       "A. P. Petitet and J. J. Dongarra",
  title =        "Algorithmic Redistribution Methods for Block-Cyclic
                 Decompositions",
  journal =      j-IEEE-TRANS-PAR-DIST-SYS,
  volume =       "10",
  number =       "12",
  pages =        "1201--??",
  month =        dec,
  year =         "1999",
  CODEN =        "ITDSEO",
  ISSN =         "1045-9219 (print), 1558-2183 (electronic)",
  bibdate =      "Thu Oct 12 18:48:32 MDT 2000",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Petitet:1997:ARM}.",
  URL =          "http://dlib.computer.org/td/books/td1999/pdf/l1201.pdf;
                 http://www.computer.org/tpds/td1999/l1201abs.htm;
                 http://www.netlib.org/utk/people/JackDongarra/PAPERS/alg-dist.ps;
                 http://www.netlib.org/utk/people/JackDongarra/pdf/alg-dist.pdf",
  acknowledgement = ack-nhfb,
}
@InProceedings{Petitet:1999:NLA,
  author =       "A. Petitet and H. Casanova and R. Whaley and J.
                 Dongarra and Y. Robert",
  booktitle =    "SIAM Annual Meeting, Atlanta, GA, May 13, 1999",
  title =        "A Numerical Linear Algebra Problem Solving Environment
                 Designer's Perspective",
  publisher =    pub-SIAM,
  address =      pub-SIAM:adr,
  year =         "1999",
  bibdate =      "Tue Feb 26 10:10:44 2002",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Petitet:1998:NLA} and \cite{Petitet:2000:PDS}.",
  URL =          "http://www.netlib.org/utk/people/JackDongarra/PAPERS/la-handbook-chp10.ps",
  acknowledgement = ack-nhfb,
}
@Article{DAzevedo:2000:DIP,
  author =       "Eduardo D'Azevedo and Jack Dongarra",
  title =        "The design and implementation of the parallel
                 out-of-core {ScaLAPACK} {$ L U $}, {$ Q R $}, and
                 {Cholesky} factorization routines",
  journal =      j-CPE,
  volume =       "12",
  number =       "15",
  pages =        "1481--1493",
  month =        "????",
  year =         "2000",
  CODEN =        "CPEXEI",
  DOI =          "https://doi.org/10.1002/1096-9128(20001225)12:15<1481::AID-CPE540>3.0.CO;2-V",
  ISSN =         "1040-3108",
  bibdate =      "Sat Apr 7 06:56:11 MDT 2001",
  bibsource =    "http://www.interscience.wiley.com/jpages/1040-3108;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib;
                 http://www3.interscience.wiley.com/journalfinder.html",
  note =         "See original LAPACK Working note in
                 \cite{Dongarra:1997:DIP}.",
  URL =          "http://www3.interscience.wiley.com/cgi-bin/abstract/76505648/START;
                 http://www3.interscience.wiley.com/cgi-bin/fulltext?ID=76505648&PLACEBO=IE.pdf",
  acknowledgement = ack-nhfb,
}
@Article{Parlett:2000:IAP,
  author =       "Beresford N. Parlett and Osni A. Marques",
  title =        "An implementation of the {$ d q d s $} algorithm
                 (positive case)",
  journal =      j-LINEAR-ALGEBRA-APPL,
  volume =       "309",
  number =       "1--3",
  pages =        "217--259",
  day =          "15",
  month =        apr,
  year =         "2000",
  CODEN =        "LAAPAW",
  ISSN =         "0024-3795 (print), 1873-1856 (electronic)",
  bibdate =      "Mon Oct 9 10:54:41 MDT 2000",
  bibsource =    "http://www.elsevier.com/locate/laa;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Parlett:2002:IDA}.",
  URL =          "http://www.elsevier.nl/gej-ng/10/30/19/126/25/37/abstract.html;
                 http://www.elsevier.nl/gej-ng/10/30/19/126/25/37/article.pdf",
  acknowledgement = ack-nhfb,
}
@InCollection{Petitet:2000:PDS,
  author =       "A. Petitet and H. Casanova and J. Dongarra and Y.
                 Robert and R. Whaley",
  editor =       "Jacek Blazewicz and others",
  booktitle =    "Handbook on Parallel and Distributed Processing",
  title =        "Parallel and Distributed Scientific Computing: {A}
                 Numerical Linear Algebra Problem Solving Environment
                 Designer's Perspective",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  bookpages =    "635",
  pages =        "??--??",
  year =         "2000",
  ISBN =         "3-540-66441-6",
  ISBN-13 =      "978-3-540-66441-3",
  LCCN =         "QA76.58 .H36 2000",
  bibdate =      "Tue Feb 26 10:10:44 2002",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Petitet:1998:NLA} and \cite{Petitet:1999:NLA}.",
  URL =          "http://www.netlib.org/utk/people/JackDongarra/PAPERS/la-handbook.ps",
  acknowledgement = ack-nhfb,
}
@Article{Andersen:2001:RFC,
  author =       "Bjarne S. Andersen and Jerzy Wa{\'s}niewski and Fred
                 G. Gustavson",
  title =        "A recursive formulation of {Cholesky} factorization of
                 a matrix in packed storage",
  journal =      j-TOMS,
  volume =       "27",
  number =       "2",
  pages =        "214--244",
  month =        jun,
  year =         "2001",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/383738.383741",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  bibdate =      "Wed Feb 6 16:43:42 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Andersen:2000:RFC}.",
  abstract =     "A new compact way to store a symmetric or triangular
                 matrix called RPF for Recursive Packed Format is fully
                 described. Novel ways to transform RPF to and from
                 standard packed format are included. A new algorithm,
                 called RPC for Recursive Packed Cholesky, that operates
                 on the RPF format is presented. Algorithm RPC is based
                 on level-3 BLAS and requires variants of algorithms
                 TRSM and SYRK that work on RPF. We call these RP\_TRSM
                 and RP\_SYRK and find that they do most of their work
                 by calling GEMM. It follows that most of the execution
                 time of RPC lies in GEMM. The advantage of this storage
                 scheme compared to traditional packed and full storage
                 is demonstrated. First, the RPC storage format uses the
                 minimal amount of storage for the symmetric or
                 triangular matrix. Second, RPC gives a level-3
                 implementation of Cholesky factorization whereas
                 standard packed implementations are only level 2.
                 Hence, the performance of our RPC implementation is
                 decidedly superior. Third, unlike fixed block size
                 algorithms, RPC, requires no block size tuning
                 parameter. We present performance measurements on
                 several current architectures that demonstrate
                 improvements over the traditional packed routines. Also
                 MSP parallel computations on the IBM SMP computer are
                 made. The graphs that are attached in Section 7 show
                 that the RPC algorithms are superior by a factor
                 between 1.6 and 7.4 for order around 1000, and between
                 1.9 and 10.3 for order around 3000 over the traditional
                 packed algorithms. For some architectures, the RPC
                 performance results are almost the same or even better
                 than the traditional full-storage algorithms results.",
  accepted =     "15 March 2001",
  acknowledgement = ack-nhfb,
}
@Article{Whaley:2001:AEO,
  author =       "R. Clint Whaley and Antoine Petitet and Jack J.
                 Dongarra",
  title =        "Automated empirical optimizations of software and the
                 {ATLAS} project",
  journal =      j-PARALLEL-COMPUTING,
  volume =       "27",
  number =       "1--2",
  pages =        "3--35",
  month =        jan,
  year =         "2001",
  CODEN =        "PACOEJ",
  ISSN =         "0167-8191",
  bibdate =      "Wed Jul 18 06:31:14 MDT 2001",
  bibsource =    "http://www.elsevier.com/locate/issn/01678191;
                 http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Whaley:2000:AEO}.",
  URL =          "http://www.elsevier.nl/gej-ng/10/35/21/47/25/23/abstract.html;
                 http://www.elsevier.nl/gej-ng/10/35/21/47/25/23/article.pdf;
                 http://www.netlib.org/utk/people/JackDongarra/PAPERS/atlas_pub.pdf",
  acknowledgement = ack-nhfb,
}
@Article{Bindel:2002:CGR,
  author =       "David Bindel and James Demmel and William Kahan and
                 Osni Marques",
  title =        "On computing {Givens} rotations reliably and
                 efficiently",
  journal =      j-TOMS,
  volume =       "28",
  number =       "2",
  pages =        "206--238",
  month =        jun,
  year =         "2002",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/567806.567809",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  bibdate =      "Sat Nov 9 11:16:50 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Bindel:2000:CGR}.",
  abstract =     "We consider the efficient and accurate computation of
                 Givens rotations. When $f$ and $g$ are positive real
                 numbers, this simply amounts to computing the values of
                 $ c = f / \sqrt {f^2 + g^2} $, $ s = g / \sqrt {f^2 +
                 g^2} $, and $ r = \sqrt {f^2 + g^2} $. This apparently
                 trivial computation merits closer consideration for the
                 following three reasons. First, while the definitions
                 of $c$, $s$ and $r$ seem obvious in the case of two
                 nonnegative arguments $f$ and $g$, there is enough
                 freedom of choice when one or more of $f$ and $g$ are
                 negative, zero or complex that LAPACK auxiliary
                 routines SLARTG, CLARTG, SLARGV and CLARGV can compute
                 rather different values of $c$, $s$ and $r$ for
                 mathematically identical values of $f$ and $g$. To
                 eliminate this unnecessary ambiguity, the BLAS
                 Technical Forum chose a single consistent definition of
                 Givens rotations that we will justify here. Second,
                 computing accurate values of $c$, $s$ and $r$ as
                 efficiently as possible and reliably despite
                 over/underflow is surprisingly complicated. For complex
                 Givens rotations, the most efficient formulas require
                 only one real square root and one real divide (as well
                 as several much cheaper additions and multiplications),
                 but a reliable implementation using only working
                 precision has a number of cases. On a Sun Ultra-10, the
                 new implementation is slightly faster than the previous
                 LAPACK implementation in the most common case, and 2.7
                 to 4.6 times faster than the corresponding vendor,
                 reference or ATLAS routines. It is also more reliable;
                 all previous codes occasionally suffer from large
                 inaccuracies due to over/underflow. For real Givens
                 rotations there are also improvements in speed and
                 accuracy, though not as striking. Third, the design
                 process that led to this reliable implementation is
                 quite systematic, and could be applied to the design of
                 similarly reliable subroutines.",
  acknowledgement = ack-nhfb,
}
%%% Journal article (SIAM Journal on Scientific Computing 24(1), 2002);
%%% the note field points back to the original LAPACK Working Note,
%%% cited as Henry:1997:PIN.
@Article{Henry:2002:PIN,
  author =       "Greg Henry and David Watkins and Jack Dongarra",
  title =        "A Parallel Implementation of the Nonsymmetric {$ Q R
                 $} Algorithm for Distributed Memory Architectures",
  journal =      j-SIAM-J-SCI-COMP,
  volume =       "24",
  number =       "1",
  pages =        "284--311",
  month =        jan,
  year =         "2002",
  CODEN =        "SJOCE3",
  DOI =          "https://doi.org/10.1137/S1064827597325165",
  ISSN =         "1064-8275 (print), 1095-7197 (electronic)",
  ISSN-L =       "1064-8275",
  bibdate =      "Tue Oct 22 18:24:38 MDT 2002",
  bibsource =    "http://epubs.siam.org/sam-bin/dbq/toc/SISC/24/1;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Henry:1997:PIN}.",
  URL =          "http://epubs.siam.org/sam-bin/dbq/article/32516",
  acknowledgement = ack-nhfb,
  fjournal =     "SIAM Journal on Scientific Computing",
}
%%% Journal article (ACM TOMS 28(2), 2002); the note field points back to
%%% the original LAPACK Working Note, cited as Li:2000:DIT.
%%% ISSN-L, fjournal and journal-URL mirror the other TOMS entry in this
%%% file (Demmel:2009:EPI) so that both entries carry the same journal
%%% metadata.
@Article{Li:2002:DIT,
  author =       "Xiaoye S. Li and James W. Demmel and David H. Bailey
                 and Greg Henry and Yozo Hida and Jimmy Iskandar and
                 William Kahan and Suh Y. Kang and Anil Kapur and
                 Michael C. Martin and Brandon J. Thompson and Teresa
                 Tung and Daniel J. Yoo",
  title =        "Design, implementation and testing of extended and
                 mixed precision {BLAS}",
  journal =      j-TOMS,
  volume =       "28",
  number =       "2",
  pages =        "152--205",
  month =        jun,
  year =         "2002",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/567806.567808",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Sat Nov 9 11:16:50 MST 2002",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Li:2000:DIT}.",
  abstract =     "This paper describes the design rationale, a C
                 implementation, and conformance testing of a subset of
                 the new Standard for the BLAS (Basic Linear Algebra
                 Subroutines): Extended and Mixed Precision BLAS.
                 Permitting higher internal precision and mixed
                 input\slash output types and precisions allows us to
                 implement some algorithms that are simpler, more
                 accurate, and sometimes faster than possible without
                 these features. The new BLAS are challenging to
                 implement and test because there are many more
                 subroutines than in the existing Standard, and because
                 we must be able to assess whether a higher precision is
                 used for internal computations than is used for either
                 input or output variables. We have therefore developed
                 an automated process of generating and systematically
                 testing these routines. Our methodology is applicable
                 to languages besides C. In particular, our algorithms
                 used in the testing code will be valuable to all other
                 BLAS implementors. Our extra precision routines achieve
                 excellent performance---close to half of the machine
                 peak Megaflop rate even for the Level 2 BLAS, when the
                 data access is stride one.",
  acknowledgement = ack-nhfb,
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
}
%%% Journal article (volume 17, number 2, Summer 2003); the note field
%%% points back to the original LAPACK Working Note, cited as
%%% Dongarra:2002:SAN.
%%% The URL field lists only locations of the paper itself; the address
%%% of this bibliography file is recorded in bibsource, not in URL.
@Article{Dongarra:2003:SANb,
  author =       "Jack Dongarra and Victor Eijkhout",
  title =        "Self-Adapting Numerical Software for Next Generation
                 Applications",
  journal =      j-IJHPCA,
  volume =       "17",
  number =       "2",
  pages =        "125--131",
  month =        "Summer",
  year =         "2003",
  CODEN =        "IHPCFL",
  ISSN =         "1094-3420 (print), 1741-2846 (electronic)",
  ISSN-L =       "1094-3420",
  bibdate =      "Fri Nov 28 06:52:13 2003",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Dongarra:2002:SAN}.",
  URL =          "http://www.netlib.org/netlib/utk/people/JackDongarra/PAPERS/sans-position.pdf;
                 http://www.netlib.org/utk/people/JackDongarra/PAPERS/sans-ijhpca.pdf",
  acknowledgement = ack-nhfb,
}
%%% Journal article (volume 25, number 3, July 2004); the note field
%%% points back to the original LAPACK Working Note, cited as
%%% Dhillon:2002:OER.
@Article{Dhillon:2004:OER,
  author =       "Inderjit S. Dhillon and Beresford N. Parlett",
  title =        "Orthogonal Eigenvectors and Relative Gaps",
  journal =      j-SIAM-J-MAT-ANA-APPL,
  volume =       "25",
  number =       "3",
  pages =        "858--899",
  month =        jul,
  year =         "2004",
  CODEN =        "SJMAEL",
  ISSN =         "0895-4798 (print), 1095-7162 (electronic)",
  bibdate =      "Sat Apr 16 10:32:32 MDT 2005",
  bibsource =    "http://epubs.siam.org/sam-bin/dbq/toc/SIMAX/25/3;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "See original LAPACK Working note in
                 \cite{Dhillon:2002:OER}.",
  URL =          "http://epubs.siam.org/sam-bin/dbq/article/37011",
  acknowledgement = ack-nhfb,
}
%%% Journal publication of LAWN 186 (cited as Demmel:2007:FLAa), per the
%%% remark field.  The DOI field holds a single resolver URL; a DOI is
%%% unique per article, so it must not be repeated in a ";"-list.
@Article{Demmel:2007:FLAb,
  author =       "James Demmel and Ioana Dumitriu and Olga Holtz",
  title =        "Fast linear algebra is stable",
  journal =      j-NUM-MATH,
  volume =       "108",
  number =       "1",
  pages =        "59--91",
  month =        nov,
  year =         "2007",
  CODEN =        "NUMMA7",
  DOI =          "https://doi.org/10.1007/s00211-007-0114-x",
  ISSN =         "0029-599X (print), 0945-3245 (electronic)",
  bibdate =      "Tue Jul 8 09:49:13 MDT 2008",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  abstract =     "In Demmel et al. (Numer. Math. 106(2), 199--224, 2007)
                 we showed that a large class of fast recursive matrix
                 multiplication algorithms is stable in a normwise
                 sense, and that in fact if multiplication of $n$-by-$n$
                 matrices can be done by any algorithm in {$ O(n^{\omega
                 + \eta }) $} operations for any $ \eta > 0 $, then it
                 can be done stably in {$ O(n^{\omega + \eta }) $}
                 operations for any $ \eta > 0 $. Here we extend this
                 result to show that essentially all standard linear
                 algebra operations, including LU decomposition, QR
                 decomposition, linear equation solving, matrix
                 inversion, solving least squares problems,
                 (generalized) eigenvalue problems and the singular
                 value decomposition can also be done stably (in a
                 normwise sense) in {$ O(n^{\omega + \eta }) $}
                 operations.",
  acknowledgement = ack-nhfb,
  remark =       "Journal publication of LAWN 186
                 \cite{Demmel:2007:FLAa}.",
}
%%% Journal publication of LAWN 190 (cited as Buttari:2007:PTQ), per the
%%% remark field.  The month uses the standard lowercase three-letter
%%% macro, matching every other entry in this file.
@Article{Buttari:2008:PTF,
  author =       "Alfredo Buttari and Julien Langou and Jakub Kurzak and
                 Jack Dongarra",
  title =        "Parallel Tiled {$ Q R $} Factorization for Multicore
                 Architectures",
  journal =      j-CCPE,
  volume =       "20",
  number =       "13",
  pages =        "1573--1590",
  month =        sep,
  year =         "2008",
  CODEN =        "CCPEBO",
  DOI =          "https://doi.org/10.1002/cpe.1301",
  ISSN =         "1532-0626 (print), 1532-0634 (electronic)",
  bibdate =      "Fri Apr 24 12:25:43 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  acknowledgement = ack-nhfb,
  remark =       "Journal publication of LAWN 190
                 \cite{Buttari:2007:PTQ}.",
}
%%% Journal publication of LAWN 188 (cited as Demmel:2007:EPI), per the
%%% remark field.  Pages are given in articleno:page form ("28:1--28:32"),
%%% with the article number repeated in the articleno field.
@Article{Demmel:2009:EPI,
  author =       "James Demmel and Yozo Hida and E. Jason Riedy and
                 Xiaoye S. Li",
  title =        "Extra-Precise Iterative Refinement for Overdetermined
                 Least Squares Problems",
  journal =      j-TOMS,
  volume =       "35",
  number =       "4",
  pages =        "28:1--28:32",
  month =        feb,
  year =         "2009",
  CODEN =        "ACMSCU",
  DOI =          "https://doi.org/10.1145/1462173.1462177",
  ISSN =         "0098-3500 (print), 1557-7295 (electronic)",
  ISSN-L =       "0098-3500",
  bibdate =      "Fri Feb 13 18:09:40 MST 2009",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib;
                 http://www.math.utah.edu/pub/tex/bib/toms.bib",
  abstract =     "We present the algorithm, error bounds, and numerical
                 results for extra-precise iterative refinement applied
                 to overdetermined linear least squares (LLS) problems.
                 We apply our linear system refinement algorithm to
                 Bj{\"o}rck's augmented linear system formulation of an
                 LLS problem. Our algorithm reduces the forward normwise
                 and componentwise errors to $ O(\epsilon_w) $, where $
                 \epsilon_w $ is the working precision, unless the
                 system is too ill conditioned. In contrast to linear
                 systems, we provide two separate error bounds for the
                 solution $x$ and the residual $r$. The refinement
                 algorithm requires only limited use of extra precision
                 and adds only $ O(m n)$ work to the $ O(m n^2)$ cost of
                 QR factorization for problems of size $ m \times n$.
                 The extra precision calculation is facilitated by the
                 new extended-precision BLAS standard in a portable way,
                 and the refinement algorithm will be included in a
                 future release of LAPACK and can be extended to the
                 other types of least squares problems.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Mathematical Software (TOMS)",
  journal-URL =  "http://dl.acm.org/pub.cfm?id=J782",
  keywords =     "BLAS; floating-point arithmetic; LAPACK; Linear
                 algebra",
  remark =       "Journal publication of LAWN 188
                 \cite{Demmel:2007:EPI}.",
}
%%% Proceedings volume for CONPAR 90-VAPP IV (Zurich, September 1990).
%%% title and booktitle carry the same text.  The series is given via the
%%% ser-LNCS macro, as in the other Springer proceedings entries of this
%%% file that have a series field, instead of a hand-typed series name.
@Proceedings{Burkhart:1990:CVI,
  editor =       "H. (Helmar) Burkhart",
  booktitle =    "{CONPAR 90-VAPP IV}: {Joint} International Conference
                 on Vector and Parallel Processing, {Zurich,
                 Switzerland, September 10--13, 1990}: proceedings",
  title =        "{CONPAR 90-VAPP IV}: {Joint} International Conference
                 on Vector and Parallel Processing, {Zurich,
                 Switzerland, September 10--13, 1990}: proceedings",
  volume =       "457",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xi + 900",
  year =         "1990",
  ISBN =         "3-540-53065-7 (Berlin), 0-387-53065-7 (New York)",
  ISBN-13 =      "978-3-540-53065-7 (Berlin), 978-0-387-53065-9 (New
                 York)",
  LCCN =         "QA76.58 .J65 1990",
  bibdate =      "Sat Apr 23 06:53:59 MDT 2005",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib;
                 z3950.loc.gov:7090/Voyager",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  subject =      "Parallel processing (Electronic computers);
                 Congresses; Vector processing (Computer science);
                 Congresses",
}
%%% Proceedings volume for Supercomputing '90 (New York, November 1990).
%%% The corporate editor is brace-protected so it is treated as a single
%%% name; title and booktitle carry the same text.
@Proceedings{IEEE:1990:PSN,
  editor =       "{IEEE}",
  booktitle =    "Proceedings, Supercomputing '90: November 12--16,
                 1990, New York Hilton at Rockefeller Center, New York,
                 New York",
  title =        "Proceedings, Supercomputing '90: November 12--16,
                 1990, New York Hilton at Rockefeller Center, New York,
                 New York",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xxv + 982",
  year =         "1990",
  ISBN =         "0-8186-2056-0 (paperback) (IEEE Computer Society),
                 0-89791-412-0 (paperback) (ACM)",
  ISBN-13 =      "978-0-8186-2056-0 (paperback) (IEEE Computer Society),
                 978-0-89791-412-3 (paperback) (ACM)",
  LCCN =         "QA 76.88 S87 1990",
  bibdate =      "Wed Aug 28 06:48:31 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib;
                 University of California MELVYL catalog",
  note =         "ACM order number 415903. IEEE Computer Society Press
                 order number 2056. IEEE catalog number 90CH2916-5.",
  acknowledgement = ack-nhfb,
  classification = "C5440 (Multiprocessor systems and techniques); C5470
                 (Performance evaluation and testing); C6110 (Systems
                 analysis and programming); C7000 (Computer
                 applications)",
  keywords =     "biological applications; computer applications;
                 computer chess; innovative architectures; linear
                 algebra algorithms; memory; networking computing;
                 parallel languages; parallel processing; particle
                 transport; partitioning; performance evaluation;
                 performance visualizations; pipeline processing;
                 program analysis; program restructuring; scheduling;
                 supercomputers --- congresses; vector algorithms",
}
%%% Proceedings volume of the 14th Dundee Conference on numerical
%%% analysis (June 1991), published 1992 as volume 260 of the series
%%% named in the series field.  title and booktitle carry the same text.
@Proceedings{Griffiths:1992:NAP,
  editor =       "D. F. Griffiths and G. A. Watson",
  booktitle =    "Numerical analysis, 1991: proceedings of the 14th
                 Dundee Conference, June 1991",
  title =        "Numerical analysis, 1991: proceedings of the 14th
                 Dundee Conference, June 1991",
  volume =       "260",
  publisher =    pub-LONGMAN,
  address =      pub-LONGMAN:adr,
  pages =        "292",
  year =         "1992",
  ISBN =         "0-582-08908-5",
  ISBN-13 =      "978-0-582-08908-2",
  LCCN =         "QA297.D85 1991",
  bibdate =      "Mon Jan 15 11:24:40 1996",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  series =       "Pitman Res. Notes Math. Ser.",
  acknowledgement = ack-nhfb,
}
%%% Proceedings volume for SHPCC-92 (Williamsburg, Virginia, April 1992).
%%% The key field holds the conference acronym; presumably it serves as
%%% the sort/label fallback for the corporate editor -- confirm against
%%% the bibliography style in use.
@Proceedings{IEEE:1992:SHP,
  editor =       "{IEEE}",
  key =          "SHPCC-92",
  booktitle =    "Scalable High Performance Computing Conference,
                 SHPCC-92, April 26--29, 1992, Williamsburg, Virginia",
  title =        "Scalable High Performance Computing Conference,
                 {SHPCC}-92, April 26--29, 1992, Williamsburg,
                 Virginia",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xiii + 448",
  year =         "1992",
  ISBN =         "0-8186-2775-1",
  ISBN-13 =      "978-0-8186-2775-0",
  LCCN =         "QA76.76.A65 S33 1992",
  bibdate =      "Fri Dec 30 11:18:38 1994",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  acknowledgement = ack-nhfb,
}
%%% Proceedings volume for Frontiers '92 (McLean, Virginia, October
%%% 1992).  ISBNs are listed per binding (hardback, microfiche); title
%%% and booktitle carry the same text.
@Proceedings{Siegel:1992:FSF,
  editor =       "H. J. Siegel",
  booktitle =    "The Fourth Symposium on the Frontiers of Massively
                 Parallel Computation: Frontiers '92 / October 19--21,
                 1992, McLean, Virginia",
  title =        "The Fourth Symposium on the Frontiers of Massively
                 Parallel Computation: Frontiers '92 / October 19--21,
                 1992, McLean, Virginia",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xviii + 592",
  year =         "1992",
  ISBN =         "0-8186-2772-7 (hardback), 0-8186-2771-9 (microfiche)",
  ISBN-13 =      "978-0-8186-2772-9 (hardback), 978-0-8186-2771-2
                 (microfiche)",
  LCCN =         "QA76.58 .S95 1992",
  bibdate =      "Mon Jan 15 11:06:11 1996",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  acknowledgement = ack-nhfb,
}
%%% Proceedings volume for the 11th Symposium on Computer Arithmetic
%%% (ARITH-11, per the keywords field), Windsor, Ontario, 1993.
%%% The first editor's surname and "Jr." suffix are braced together so
%%% they are parsed as one last-name unit.
@Proceedings{Swartzlander:1993:PSC,
  editor =       "Earl {Swartzlander, Jr.} and Mary Jane Irwin and
                 Graham Jullien",
  booktitle =    "Proceedings: 11th Symposium on Computer Arithmetic,
                 June 29--July 2, 1993, Windsor, Ontario",
  title =        "Proceedings: 11th Symposium on Computer Arithmetic,
                 June 29--July 2, 1993, Windsor, Ontario",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xii + 284",
  year =         "1993",
  ISBN =         "0-7803-1401-8 (softbound), 0-8186-3862-1 (casebound),
                 0-8186-3861-3 (microfiche)",
  ISBN-13 =      "978-0-7803-1401-6 (softbound), 978-0-8186-3862-6
                 (casebound), 978-0-8186-3861-9 (microfiche)",
  ISSN =         "0018-9340 (print), 1557-9956 (electronic)",
  ISSN-L =       "0018-9340",
  LCCN =         "QA 76.9 C62 S95 1993",
  bibdate =      "Thu Sep 01 22:58:49 1994",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "IEEE Transactions on Computers {\bf 43(8)}, 1994",
  acknowledgement = ack-nhfb,
  keywords =     "ARITH-11",
}
%%% Proceedings of the 6th SIAM Conference on Parallel Processing for
%%% Scientific Computing (Norfolk, VA, 1993); two volumes, per the note
%%% field.  The title brace-protects the acronyms; booktitle does not.
@Proceedings{Sincovec:1993:SCP,
  editor =       "Richard F. Sincovec",
  booktitle =    "SIAM Conference on Parallel Processing for Scientific
                 Computing (6th: 1993: Norfolk, VA, USA)",
  title =        "{SIAM} Conference on Parallel Processing for
                 Scientific Computing (6th: 1993: Norfolk, {VA},
                 {USA})",
  publisher =    pub-SIAM,
  address =      pub-SIAM:adr,
  pages =        "xix + 1041 + iv",
  year =         "1993",
  ISBN =         "0-89871-315-3",
  ISBN-13 =      "978-0-89871-315-2",
  LCCN =         "QA 76.58 S55 1993",
  bibdate =      "Wed Aug 14 10:36:11 1996",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "Two volumes.",
  acknowledgement = ack-nhfb,
  sponsor =      "Society for Industrial and Applied Mathematics.",
}
%%% Proceedings volume for the 2nd annual Object Oriented Numerics
%%% conference (Sunriver, OR, April 1994).  No editor is known
%%% ("Anonymous") and the page count is the "????" placeholder.
@Proceedings{Anonymous:1994:OON,
  editor =       "Anonymous",
  booktitle =    "{Object oriented numerics: Annual conference: 2nd ---
                 April 1994, Sunriver, OR}",
  title =        "{Object oriented numerics: Annual conference: 2nd ---
                 April 1994, Sunriver, OR}",
  publisher =    "RWS",
  address =      "Corvallis, OR",
  pages =        "????",
  year =         "1994",
  bibdate =      "Thu Sep 16 09:48:36 MDT 1999",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  acknowledgement = ack-nhfb,
}
%%% Proceedings volume of the Second Workshop on Environments and Tools
%%% for Parallel Scientific Computing (Townsend, TN, May 1994).
%%% title and booktitle are fully brace-protected and carry the same text.
@Proceedings{Dongarra:1994:PSW,
  editor =       "J. J. Dongarra and B. Tourancheau",
  booktitle =    "{Proceedings of the Second Workshop on Environments
                 and Tools for Parallel Scientific Computing, Townsend,
                 TN, USA, May 25--27, 1994}",
  title =        "{Proceedings of the Second Workshop on Environments
                 and Tools for Parallel Scientific Computing, Townsend,
                 TN, USA, May 25--27, 1994}",
  publisher =    pub-SIAM,
  address =      pub-SIAM:adr,
  pages =        "x + 292",
  year =         "1994",
  ISBN =         "0-89871-343-9",
  ISBN-13 =      "978-0-89871-343-5",
  LCCN =         "QA76.58.I568 1994",
  bibdate =      "Sat May 11 12:16:44 MDT 1996",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  series =       "Proceedings of the Workshop on Environments and Tools
                 for Parallel Scientific Computing",
  acknowledgement = ack-nhfb,
}
%%% Proceedings volume of the Scalable Parallel Libraries Conference.
%%% The conference was held October 6--8, 1993; the volume's year field
%%% is 1994, which is why the citation key carries 1994.
@Proceedings{IEEE:1994:PSP,
  editor =       "{IEEE}",
  booktitle =    "Proceedings of the Scalable Parallel Libraries
                 Conference, October 6--8, 1993, Mississippi State,
                 Mississippi",
  title =        "Proceedings of the Scalable Parallel Libraries
                 Conference, October 6--8, 1993, Mississippi State,
                 Mississippi",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "vii + 261",
  year =         "1994",
  ISBN =         "0-8186-4980-1 (paper), 0-8186-4981-X (microfiche)",
  ISBN-13 =      "978-0-8186-4980-6 (paper), 978-0-8186-4981-3
                 (microfiche)",
  LCCN =         "QA76.58 .S34 1993",
  bibdate =      "Sat Mar 22 18:40:38 1997",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  acknowledgement = ack-nhfb,
}
%%% Proceedings volume (digest of papers) of the 25th International
%%% Symposium on Fault-Tolerant Computing (Pasadena, June 1995).
%%% In the sponsor field the compound "Fault-Tolerant" is kept on one
%%% line: BibTeX folds a line break into a space, so breaking after the
%%% hyphen would yield "Fault- Tolerant".
@Proceedings{IEEE:1995:DPT,
  editor =       "{IEEE}",
  booktitle =    "Digest of papers / the Twenty-fifth International
                 Symposium on Fault-Tolerant Computing, June 27--30,
                 1995, Pasadena, California",
  title =        "Digest of papers / the Twenty-fifth International
                 Symposium on Fault-Tolerant Computing, June 27--30,
                 1995, Pasadena, California",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  pages =        "xxiii + 547",
  year =         "1995",
  CODEN =        "DPFTDL",
  ISBN =         "0-8186-7079-7, 0-8186-7145-9",
  ISBN-13 =      "978-0-8186-7079-4, 978-0-8186-7145-6",
  ISSN =         "0731-3071",
  LCCN =         "QA 76.9 F38 I57 1995",
  bibdate =      "Fri Mar 1 10:04:10 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "IEEE catalog number 95CH35823.",
  series =       "FTCS 25th",
  acknowledgement = ack-nhfb,
  sponsor =      "IEEE. Computer Society. Technical Committee on
                 Fault-Tolerant Computing.",
}
%%% Proceedings volume for Supercomputing '96 (Pittsburgh, PA, November
%%% 1996), jointly published: publisher and address are built by
%%% concatenating the ACM and IEEE macros with " and ".
%%% LCCN is in Library of Congress class QA76.88, the same class as the
%%% Supercomputing '90 entry in this file (IEEE:1990:PSN).
@Proceedings{ACM:1996:SCP,
  editor =       "{ACM}",
  booktitle =    "{Supercomputing '96 Conference Proceedings: November
                 17--22, Pittsburgh, PA}",
  title =        "{Supercomputing '96 Conference Proceedings: November
                 17--22, Pittsburgh, PA}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "1996",
  ISBN =         "0-89791-854-1",
  ISBN-13 =      "978-0-89791-854-1",
  LCCN =         "QA76.88 .S8573 1996",
  bibdate =      "Mon Mar 23 12:30:13 1998",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  note =         "ACM Order Number: 415962, IEEE Computer Society Press
                 Order Number: RS00126.",
  URL =          "http://www.supercomp.org/sc96/proceedings/",
  acknowledgement = ack-nhfb,
}
%%% Proceedings volume for PARA '95 (Lyngby, Denmark, August 1995),
%%% published 1996 as volume 1041 of the series given by ser-LNCS.
%%% keywords is a ";"-separated list of subject headings; individual
%%% items carry no trailing period.
@Proceedings{Dongarra:1996:APC,
  editor =       "J. J. Dongarra and Kaj Madsen and Jerzy
                 Wa{\'s}niewski",
  booktitle =    "{Applied parallel computing: computations in physics,
                 chemistry, and engineering science: second
                 international workshop, PARA '95, Lyngby, Denmark,
                 August 21--24, 1995: proceedings}",
  title =        "{Applied parallel computing: computations in physics,
                 chemistry, and engineering science: second
                 international workshop, PARA '95, Lyngby, Denmark,
                 August 21--24, 1995: proceedings}",
  volume =       "1041",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "562",
  year =         "1996",
  CODEN =        "LNCSD9",
  ISBN =         "3-540-60902-4",
  ISBN-13 =      "978-3-540-60902-5",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58 .P35 1995",
  MRclass =      "65-06",
  MRnumber =     "1 320 056",
  bibdate =      "Thu Dec 19 14:25:58 1996",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  keywords =     "Chemistry -- Data processing -- Congresses;
                 Engineering -- Data processing -- Congresses; Parallel
                 processing (Electronic computers) -- Congresses;
                 Physics -- Data processing -- Congresses",
}
%%% Proceedings volume for PARA 96 (Lyngby, Denmark, August 1996),
%%% volume 1184 of the series given by ser-LNCS.  The accented letter in
%%% the first editor's surname is written as a braced accent command.
@Proceedings{Wasniewski:1996:APC,
  editor =       "Jerzy Wa{\'s}niewski and J. Dongarra and K. Madsen and
                 D. Olesen",
  booktitle =    "Applied parallel computing: industrial-strength
                 computation and optimization: Third International
                 Workshop, {PARA} 96, Lyngby, Denmark, August 18--21,
                 1996: proceedings",
  title =        "Applied parallel computing: industrial-strength
                 computation and optimization: Third International
                 Workshop, {PARA} 96, Lyngby, Denmark, August 18--21,
                 1996: proceedings",
  volume =       "1184",
  publisher =    pub-SV,
  address =      pub-SV:adr,
  pages =        "xiii + 722",
  year =         "1996",
  ISBN =         "3-540-62095-8 (softcover)",
  ISBN-13 =      "978-3-540-62095-2 (softcover)",
  ISSN =         "0302-9743 (print), 1611-3349 (electronic)",
  LCCN =         "QA76.58 .P35 1996",
  bibdate =      "Sat Dec 21 16:06:37 MST 1996",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  series =       ser-LNCS,
  acknowledgement = ack-nhfb,
  keywords =     "Parallel processing (Electronic computers) --
                 Congresses.",
}
%%% Proceedings volume for SC'98 (Orlando, Florida, November 1998),
%%% jointly published: publisher and address concatenate the ACM and IEEE
%%% macros.  pages, ISBN, ISBN-13 and LCCN hold the "????" placeholder,
%%% i.e. the values are not yet recorded.
@Proceedings{ACM:1998:SHP,
  editor =       "{ACM}",
  booktitle =    "{SC'98: High Performance Networking and Computing:
                 Proceedings of the 1998 ACM\slash IEEE SC98 Conference:
                 Orange County Convention Center, Orlando, Florida, USA,
                 November 7--13, 1998}",
  title =        "{SC'98: High Performance Networking and Computing:
                 Proceedings of the 1998 ACM\slash IEEE SC98 Conference:
                 Orange County Convention Center, Orlando, Florida, USA,
                 November 7--13, 1998}",
  publisher =    pub-ACM # " and " # pub-IEEE,
  address =      pub-ACM:adr # " and " # pub-IEEE:adr,
  pages =        "????",
  year =         "1998",
  ISBN =         "????",
  ISBN-13 =      "????",
  LCCN =         "????",
  bibdate =      "Wed Oct 07 08:51:34 1998",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  URL =          "http://www.supercomp.org/sc98/papers/",
  acknowledgement = ack-nhfb,
}
%%% Proceedings volume of papers from ParCo97 (Bonn, September 1997),
%%% published 1998 as volume 12 of Advances in Parallel Computing.
%%% The editor list ends in the literal "and others", which BibTeX
%%% styles render as "et al.".
@Proceedings{DHollander:1998:PCF,
  editor =       "E. D'Hollander and others",
  booktitle =    "{Parallel computing: fundamentals, applications, and
                 new directions: Papers from ParCo97, held in Bonn,
                 Germany, Sept. 19--22, 1997}",
  title =        "{Parallel computing: fundamentals, applications, and
                 new directions: Papers from ParCo97, held in Bonn,
                 Germany, Sept. 19--22, 1997}",
  volume =       "12",
  publisher =    pub-ELSEVIER,
  address =      pub-ELSEVIER:adr,
  pages =        "xx + 748",
  year =         "1998",
  ISBN =         "0-444-82882-6",
  ISBN-13 =      "978-0-444-82882-8",
  LCCN =         "QA76.58.P3795 1997",
  bibdate =      "Thu Sep 16 09:48:36 MDT 1999",
  bibsource =    "http://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 http://www.math.utah.edu/pub/tex/bib/lawn.bib",
  series =       "Advances in Parallel Computing",
  acknowledgement = ack-nhfb,
}