%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.02",
%%%     date            = "13 May 2011",
%%%     time            = "18:06:48 MDT",
%%%     filename        = "supercomputing2003.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "46413 2188 12588 123939",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "BibTeX, bibliography, SC2003, Supercomputing
%%%                        2003",
%%%     supported       = "yes",
%%%     docstring       = "This is a complete bibliography of papers
%%%                        published in the proceedings of
%%%                        Supercomputing '2003.
%%%
%%%                        The conference World-Wide Web site is
%%%
%%%                            http://www.sc-conference.org/sc2003/
%%%
%%%                        The organizers of this conference series
%%%                        maintain a World-Wide Web site at
%%%
%%%                            http://www.supercomp.org/
%%%
%%%                        where pointers to Web pages for the
%%%                        conferences from 1988 to date may be found.
%%%
%%%                        At version 1.02, the year coverage looked
%%%                        like this:
%%%
%%%                             2003 (  61)
%%%
%%%                             InProceedings:   60
%%%                             Proceedings:      1
%%%
%%%                             Total entries:   61
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        order of PDF file numbers.
%%%
%%%                        The on-line electronic proceedings do not
%%%                        contain sequential page numbers, although
%%%                        there is an ISBN assigned for the
%%%                        proceedings.  A pagecount field is given with
%%%                        each entry, extracted from the PDF file: some
%%%                        of the articles lack page numbers altogether,
%%%                        others number pages 1, 2, 3, ...
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================

@Preamble{
"\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:

@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================

@String{pub-ACM                 = "ACM Press"}

@String{pub-ACM:adr             = "New York, NY 10036, USA"}

@String{pub-IEEE                = "IEEE Computer Society Press"}

@String{pub-IEEE:adr            = "1109 Spring Street, Suite 300,
Silver Spring, MD 20910, USA"}

%%% ====================================================================
%%% Bibliography entries.

@InProceedings{Tang:2003:EDL,
author =       "Hong Tang and Tao Yang",
title =        "An Efficient Data Location Protocol for
Self-organizing Storage Clusters",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10686#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap299.pdf",
abstract =     "Component additions and failures are common for
large-scale storage clusters in production
environments. To improve availability and
manageability, we investigate and compare data location
schemes for a large self-organizing storage cluster
of storage nodes. We further present an efficient
location scheme that differentiates between small and
large file blocks for reduced management overhead
compared to uniform strategies. In our protocol, small
blocks, which are typically in large quantities, are
placed through consistent hashing. Large blocks, much
fewer in practice, are placed through a usage-based
policy, and their locations are tracked by Bloom
filters. The proposed scheme results in improved
storage utilization even with non-uniform cluster
nodes. To achieve high scalability and fault
resilience, this protocol is fully distributed, relies
only on soft states, and supports data replication. We
demonstrate the effectiveness and efficiency of this
protocol through trace-driven simulation.",
acknowledgement = ack-nhfb,
}

@InProceedings{Wu:2003:HHS,
author =       "Changxun Wu and Randal Burns",
title =        "Handling Heterogeneity in Shared-Disk File Systems",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10686#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap132.pdf",
abstract =     "We develop and evaluate a system for load management
in shared-disk file systems built on clusters of
heterogeneous computers. The system generalizes load
balancing and server provisioning. It balances file
server nodes. It also responds to changing server
resources that arise from failure and recovery and
dynamically adding or removing servers. The system is
adaptive and self-managing. It operates without any
a-priori knowledge of workload properties or the
capabilities of the servers. Rather, it continuously
adapts to the observed load, using adaptive
non-uniform (ANU) randomization. ANU randomization
realizes the scalability and metadata reduction
benefits of hash-based, randomized placement
techniques. It also avoids hashing's drawbacks: load
skew, inability to cope with heterogeneity, and lack of
tunability. Simulation results show that our
load-management algorithm performs comparably to a
prescient algorithm.",
acknowledgement = ack-nhfb,
}

@InProceedings{Nagaraja:2003:QIA,
author =       "Kiran Nagaraja and Neeraj Krishnan and Ricardo
Bianchini and Richard P. Martin and Thu D. Nguyen",
title =        "Quantifying and Improving the Availability of
High-Performance Cluster-Based {Internet} Services",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10686#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap213.pdf",
abstract =     "Cluster-based servers can substantially increase
performance when nodes cooperate to globally manage
resources. However, in this paper we show that
cooperation results in a substantial availability loss,
in the absence of high-availability mechanisms.
Specifically, we show that a sophisticated
cluster-based Web server, which gains a factor of 3 in
performance through cooperation, increases service
unavailability by a factor of 10 over a non-cooperative
version. We then show how to augment this Web server
with software components embodying a small set of
high-availability techniques to regain the lost
availability. Among other interesting observations, we
show that the application of multiple high-availability
techniques, each implemented independently in its own
subsystem, can lead to inconsistent recovery actions.
We also show that a novel technique called Fault Model
Enforcement can be used to resolve such
inconsistencies. Augmenting the server with these
techniques led to a final expected availability of
close to 99.99\%.",
acknowledgement = ack-nhfb,
}

@InProceedings{Roth:2003:MSB,
author =       "Philip C. Roth and Dorian C. Arnold and Barton P.
Miller",
title =        "{MRNet}: {A} Software-Based Multicast\slash Reduction
Network for Scalable Tools",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10687#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap192.pdf",
abstract =     "We present MRNet, a software-based multicast/reduction
network for building scalable performance and system
administration tools. MRNet supports multiple
simultaneous, asynchronous collective communication
operations. MRNet is flexible, allowing tool builders
to tailor its process network topology to suit their
tool's requirements and the underlying system's
capabilities. MRNet is extensible, allowing tool
builders to incorporate custom data reductions to
augment its collection of built-in reductions. We
evaluated MRNet in a simple test tool and also
integrated it into an existing, real-world performance
tool with up to 512 tool back-ends. In the real-world
tool, we used MRNet not only for multicast and simple
data reductions but also with custom histogram and
clock skew detection reductions. In our experiments,
the MRNet-based tools showed significantly better
performance than the tools without MRNet for average
message latency and throughput, overall tool start-up
latency, and performance data processing throughput.",
acknowledgement = ack-nhfb,
keywords =     "aggregation; scalability; tools; multicast;
reduction",
}

@InProceedings{Miller:2003:TDP,
author =       "Barton Miller and Ana Cort{\'e}s and Miquel Senar and
Miron Livny",
title =        "The {Tool Daemon Protocol}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10687#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap187.pdf",
abstract =     "Run-time tools are crucial to program development. In
our desktop computer environments, we take for granted
the availability of tools for operations such as
debugging, profiling, tracing, checkpointing, and
visualization. When programs move into distributed or
Grid environments, it is difficult to find such tools.
This difficulty is caused by the complex interactions
necessary between application program, operating system
and layers of job scheduling and process management
software. As a result, each run-time tool must be
individually ported to run under a particular job
management system; for $m$ tools and $n$ environments,
the problem becomes an $m \times n$ effort, rather than
the hoped-for $m + n$ effort. Variations in underlying
operating systems can make this problem even worse. The
consequence of this situation is a paucity of tools in
distributed and Grid computing environments. In
response to the problem, we have analyzed a variety of
job scheduling environments and run-time tools to
better understand their interactions. From this
analysis, we isolated what we believe are the essential
interactions between the runtime tool, job scheduler
and resource manager, and application program. We are
proposing a standard interface, called the Tool
D{\ae}mon Protocol (TDP) that codifies these
interactions and provides the necessary communication
functions. We have implemented a pilot TDP library and
experimented with Parador, a prototype using the
Paradyn Parallel Performance tools profiling jobs
running under the Condor batch-scheduling
environment.",
acknowledgement = ack-nhfb,
}

@InProceedings{Yang:2003:CSU,
author =       "Lingyun Yang and Jennifer M. Schopf and Ian Foster",
title =        "Conservative Scheduling: Using Predicted Variance to
Improve Scheduling Decisions in Dynamic Environments",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10687#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap236.pdf",
abstract =     "In heterogeneous and dynamic environments, efficient
execution of parallel computations can require mappings
of tasks to processors whose performance is both
irregular (because of heterogeneity) and time-varying
(because of dynamicity). While adaptive domain
decomposition techniques have been used to address
heterogeneous resource capabilities, temporal
variations in those capabilities have seldom been
considered. We propose a conservative scheduling policy
that uses information about expected future variance in
resource capabilities to produce more efficient data
mapping decisions. We first present techniques, based
on time series predictors that we developed in previous
work, for predicting CPU load at some future time
point, average CPU load for some future time interval,
and variation of CPU load over some future time
interval. We then present a family of stochastic
scheduling algorithms that exploit such predictions of
future availability and variability when making data
mapping decisions. Finally, we describe experiments in
which we apply our techniques to an astrophysics
application. The results of these experiments
demonstrate that conservative scheduling can produce
execution times that are both significantly faster and
less variable than other techniques.",
acknowledgement = ack-nhfb,
}

@InProceedings{Ding:2003:CAI,
author =       "Yonghua Ding and Zhiyuan Li",
title =        "A Compiler Analysis of Interprocedural Data
Communication",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10692#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap137.pdf",
abstract =     "This paper presents a compiler analysis for data
communication for the purpose of transforming ordinary
programs into ones that run on distributed systems.
Such transformations have been used for process
migration and computation offloading to improve the
performance of mobile computing devices. In a
client-server distributed environment, the efficiency
of an application can be improved by careful
partitioning of tasks between the server and the
client. Optimal task partitioning depends on the
communication cost. Our compiler analysis, assisted by
a minimum set of user assertions, estimates the amount
of data communication between procedures. The paper
also presents experimental results based on an
implementation in the GCC compiler. The static
estimates for several multimedia programs are compared
against dynamic measurement performed using Shade, a
SUN Microsystem's instruction-level simulator. The
results show a high precision of the static analysis
for most pairs of the procedures.",
acknowledgement = ack-nhfb,
}

@InProceedings{Chauhan:2003:ATD,
author =       "Arun Chauhan and Cheryl McCosh and Ken Kennedy and
Richard Hanson",
title =        "Automatic Type-Driven Library Generation for
Telescoping Languages",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10692#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap296.pdf",
abstract =     "Telescoping languages is a strategy to automatically
generate highly-optimized domain-specific libraries.
The key idea is to create specialized variants of
library procedures through extensive offline
processing. This paper describes a telescoping system,
called ARGen, which generates high-performance Fortran
or C libraries from prototype Matlab code for the
linear algebra library, ARPACK. ARGen uses variable
types to guide procedure specializations on possible
calling contexts.\par

ARGen needs to infer Matlab types in order to speculate
on the possible variants of library procedures, as well
as to generate code. This paper shows that our
type-inference system is powerful enough to generate
all the variants needed for ARPACK automatically from
the Matlab development code. The ideas demonstrated
here provide a basis for building a more general
telescoping system for Matlab.",
acknowledgement = ack-nhfb,
}

@InProceedings{Du:2003:CSE,
author =       "Wei Du and Renato Ferreira and Gagan Agrawal",
title =        "Compiler Support for Exploiting Coarse-Grained
Pipelined Parallelism",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10692#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap133.pdf",
abstract =     "The emergence of grid and a new class of data-driven
applications is making a new form of parallelism
desirable, which we refer to as coarse-grained
pipelined parallelism. This paper reports on a
compilation system developed to exploit this form of
parallelism. We use a dialect of Java that exposes both
pipelined and data parallelism to the compiler. Our
compiler is responsible for selecting a set of
candidate filter boundaries, determining the volume of
communication required if a particular boundary is
chosen, performing the decomposition, and generating
code. We have developed a one-pass algorithm for
determining the required communication between
consecutive filters. We have developed a cost model for
estimating the execution time for a given
decomposition, and a dynamic programming algorithm for
performing the decomposition. Detailed evaluation of
our current compiler using four data-driven
applications demonstrate the feasibility of our
approach.",
acknowledgement = ack-nhfb,
}

@InProceedings{Lu:2003:SRC,
author =       "Dong Lu and Peter August Dinda",
title =        "Synthesizing Realistic Computational Grids",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10690#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap180.pdf",
abstract =     "Realistic workloads are essential in evaluating
middleware for computational grids. One important
component is the raw grid itself: a network topology
graph annotated with the hardware and software
available on each node and link. This paper defines our
requirements for grid generation and presents GridG,
our extensible generator. We describe GridG in two
steps: topology generation and annotation. For topology
generation, we have both model and mechanism. We extend
Tiers, an existing tool from the networking community,
to produce graphs that obey recently discovered power
laws of Internet topology. We also contribute to
network topology theory by illustrating a contradiction
between two laws and proposing a new version of one of
them. For annotation, GridG captures intra- and
inter-host correlations between attributes using
conditional probability rules. We construct a set of
rules, including one based on empirical evidence of OS
concentration in subnets, that produce sensible host
annotations.",
acknowledgement = ack-nhfb,
}

@InProceedings{Liu:2003:TBL,
author =       "Xin Liu and Andrew A. Chien",
title =        "Traffic-based Load Balance for Scalable Network
Emulation",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10690#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap260.pdf",
abstract =     "Load balance is critical to achieving scalability for
large network emulation studies, which are of
compelling interest for emerging Grid, Peer to Peer,
and other distributed applications and middleware.
Achieving load balance in emulation is difficult
because of irregular network structure and
unpredictable network traffic. We formulate load
balance as a graph partitioning problem and apply
classical graph partitioning algorithms to it. The
primary challenge in this approach is how to extract
useful information from the network emulation and
present it to the graph partitioning algorithms in a
way that reflects the load balance requirement in the
original emulation problem. Using a large-scale network
emulation system called MaSSF, we explore three
approaches for partitioning, based on purely static
topology information (TOP), combining topology and
application placement information (PLACE), and
combining topology and application profile data
(PROFILE). These studies show that exploiting static
topology and application placement information can
achieve reasonable load balance, but a profile-based
approach further improves load balance for even large
scale network emulation. In our experiments, PROFILE
improves load balance by 50\% to 66\% and emulation
time is reduced up to 50\% compared to purely static
topology-based approaches.",
acknowledgement = ack-nhfb,
}

@InProceedings{Butt:2003:SOF,
author =       "Ali Raza Butt and Rongmei Zhang and Y. Charlie Hu",
title =        "A Self-Organizing Flock of {Condors}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10690#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap265.pdf",
abstract =     "Condor provides high throughput computing by
leveraging idle cycles on off-the-shelf desktop
machines. It also supports flocking, a mechanism for
sharing resources among Condor pools. Since Condor
pools distributed over a wide area can have dynamically
changing availability and sharing preferences, the
current flocking mechanism based on static
configurations can limit the potential of sharing
resources across Condor pools. This paper presents a
technique for resource discovery in distributed Condor
pools using peer-to-peer mechanisms that are
self-organizing, fault-tolerant, scalable, and
locality-aware. Locality-awareness guarantees that
applications are not shipped across long distances when
nearby resources are available. Measurements using a
synthetic job trace show that self-organized flocking
reduces the maximum job wait time in queue for a
heavily loaded pool by a factor of 10 compared to
without flocking. Simulations of 1000 Condor pools are
also presented and the results confirm that our
technique discovers and utilizes nearby resources in
the physical network.",
acknowledgement = ack-nhfb,
}

@InProceedings{Olson:2003:EEU,
author =       "Ryan M. Olson and Michael W. Schmidt and Mark S.
Gordon and Alistair P. Rendell",
title =        "Enabling the Efficient Use of {SMP} Clusters: The
{GAMESS\slash DDI} Model",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10691#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap263.pdf",
abstract =     "An important advance in cluster computing is the
evolution from single processor clusters to
multiprocessor SMP clusters. Due to the increased
complexity in the memory model on SMP clusters, new
approaches are needed for applications that make use of
distributed-memory paradigms. This paper presents new
communications software developments that are designed
to take advantage of SMP cluster hardware. Although the
specific focus is on the central field of computational
chemistry and materials science, as embodied in the
popular electronic structure package GAMESS (General
Atomic and Molecular Electronic Structure System), the
impact of these new developments will be far broader in
scope. Following a summary of the essential features of
the distributed data interface (DDI) in the current
implementation of GAMESS, the new developments for SMP
clusters are described. The advantages of these new
features are illustrated using timing benchmarks on
several hardware platforms, using a typical
computational chemistry application.",
acknowledgement = ack-nhfb,
}

@InProceedings{Ding:2003:RVB,
author =       "Jin Ding and Jian Huang and Micah Beck and Shaotao Liu
and Terry Moore and Stephen Soltesz",
title =        "Remote Visualization by Browsing Image Based Databases
with Logistical Networking",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10691#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap245.pdf",
abstract =     "The need to provide remote visualization of large
datasets with adequate levels of quality and
interactivity has become a major impediment to
distributed collaboration in Computational Science.
Although Image Based Rendering (IBR) techniques based
on plenoptic functions have some important advantages
over other approaches to this problem, they suffer from
an inability to deal with issues of network latency and
server load, due to the large size of the IBR databases
they generate. Consequently, IBR techniques have been
left largely unexplored for this purpose. In this paper
we describe strategies for addressing these obstacles
using Logistical Networking (LoN), which is a new and
highly scalable approach to deploying storage as a
shared communication resource. Leveraging LoN
technology and infrastructure, we developed a remote
visualization system based on concepts of light field
rendering, an IBR method using a 4-D plenoptic
function. Our system extends existing work on light
fields by employing a modified method of
parameterization and data organization that supports
more efficient prefetching, caching and loss-less
compression. Using this approach, we have been able to
interactively browse multi-gigabyte, high-resolution
light field databases across the wide area network at
30 frames per second.",
acknowledgement = ack-nhfb,
}

@InProceedings{Ma:2003:VVL,
author =       "Kwan-Liu Ma and Aleksander Stompel and Jacobo Bielak
and Omar Ghattas and Eui Joong Kim",
title =        "Visualizing Very Large-Scale Earthquake Simulations",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10691#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap287.pdf",
abstract =     "This paper presents a parallel adaptive rendering
algorithm and its performance for visualizing
time-varying unstructured volume data generated from
large-scale earthquake simulations. The objective is to
visualize 3D seismic wave propagation generated from a
0.5 Hz simulation of the Northridge earthquake, which
is the highest resolution volume visualization of an
earthquake simulation performed to date. This scalable
high-fidelity visualization solution we provide to the
scientists allows them to explore in the temporal,
spatial, and visualization domain of their data at high
resolution. This new high resolution explorability,
likely not presently available to most computational
science groups, will help lead to many new insights.
The performance study we have conducted on a massively
parallel computer operated at the Pittsburgh
Supercomputing Center helps direct our design of a
simulation-time visualization strategy for the
higher-resolution, 1 Hz and 2 Hz, simulations.",
acknowledgement = ack-nhfb,
keywords =     "earthquake modeling; high-performance computing;
massively parallel supercomputing; scientific
visualization; parallel rendering; time-varying data;
unstructured grids; volume rendering; wave
propagation",
}

@InProceedings{Liu:2003:PCM,
author =       "Jiuxing Liu and Balasubramanian Chandrasekaran and
Jiesheng Wu and Weihang Jiang and Sushmitha Kini and
Weikuan Yu and Darius Buntinas and Pete Wyckoff and D.
K. Panda",
title =        "Performance Comparison of {MPI} Implementations over
{InfiniBand}, {Myrinet} and {Quadrics}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10696#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap310.pdf",
abstract =     "In this paper, we present a comprehensive performance
comparison of MPI implementations over InfiniBand,
Myrinet and Quadrics. Our performance evaluation
consists of two major parts. The first part consists of
a set of MPI level micro-benchmarks that characterize
different aspects of MPI implementations. The second
part of the performance evaluation consists of
application level benchmarks. We have used the NAS
Parallel Benchmarks and the sweep3D benchmark. We not
only present the overall performance results, but also
relate application communication characteristics to the
information we acquired from the micro-benchmarks. Our
results show that the three MPI implementations all
have their advantages and disadvantages. For our
cluster, InfiniBand can offer significant performance
improvements for a number of applications compared with
Myrinet and Quadrics when using the PCI-X bus. Even
with just the PCI bus, InfiniBand can still perform
better if the applications are bandwidth-bound.",
acknowledgement = ack-nhfb,
}

@InProceedings{Bouteiller:2003:MVF,
author =       "Aurelien Bouteiller and Franck Cappello and Thomas
Herault and Geraud Krawezik and Pierre Lemarinier and
Frederic Magniette",
title =        "{MPICH-V2}: a Fault Tolerant {MPI} for Volatile Nodes
based on Pessimistic Sender Based Message Logging",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10696#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap209.pdf",
abstract =     "Execution of MPI applications on clusters and Grid
deployments suffering from node and network failures
motivates the use of fault tolerant MPI
implementations. We present MPICH-V2 (the second
protocol of MPICHV project), an automatic fault
tolerant MPI implementation using an innovative
protocol that removes the most limiting factor of the
pessimistic message logging approach: reliable logging
of in transit messages. MPICH-V2 relies on
uncoordinated checkpointing, sender based message
logging and remote reliable logging of message logical
clocks. This paper presents the architecture of
MPICH-V2, its theoretical foundation and the
performance of the implementation. We compare MPICH-V2
to MPICH-V1 and MPICH-P4 evaluating (a) its
point-to-point performance, (b) the performance for the
NAS benchmarks, (c) the application performance when
many faults occur during the execution. Experimental
results demonstrate that MPICH-V2 provides performance
close to MPICH-P4 for applications using large messages
while reducing dramatically the number of reliable
nodes compared to MPICH-V1.",
acknowledgement = ack-nhfb,
}

@InProceedings{Kleban:2003:HDI,
author =       "Stephen D. Kleban and Scott H. Clearwater",
title =        "Hierarchical Dynamics, Interarrival Times, and
Performance",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10696#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap222.pdf",
abstract =     "We report on a model of the distribution of job
submission interarrival times in supercomputers.
Interarrival times are modeled as a consequence of a
complicated set of decisions between users, the queuing
algorithm, and other policies. This cascading hierarchy
of decision-making processes leads to a particular kind
of heavy-tailed distribution. Specifically,
hierarchically constrained systems suggest that fatter
tails are due to more levels coming into play in the
overall decision-making process. The key contribution
of this paper is that heavier tails resulting from more
complex decision-making processes, that is more
hierarchical levels, will lead to overall worse
performance, even when the average interarrival time is
the same. Finally, we offer some suggestions for how to
overcome these issues and the tradeoffs involved.",
acknowledgement = ack-nhfb,
keywords =     "hierarchy; relaxation process; interarrival; ASCI
queueing; dynamics",
}

@InProceedings{Adams:2003:AAM,
author =       "Mark F. Adams and Harun H. Bayraktar and Tony M.
Keaveny and Panayiotis Papadopoulos",
title =        "Applications of Algebraic Multigrid to Large-Scale
Finite Element Analysis of Whole Bone Micro-Mechanics
on the {IBM SP}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10694#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap211.pdf",
abstract =     "Accurate micro-finite element analyses of whole bones
require the solution of large sets of algebraic
equations. Multigrid has proven to be an effective
approach to the design of highly scalable linear
solvers for solid mechanics problems. We present some
of the first applications of scalable linear solvers,
on massively parallel computers, to whole vertebral
body structural analysis. We analyze the performance of
our algebraic multigrid (AMG) methods on problems with
over 237 million degrees of freedom on IBM SP parallel
computers. We demonstrate excellent parallel
scalability, both in the algorithms and the
implementations, and analyze the nodal performance of
the important AMG kernels on the IBM Power3 and Power4
architectures.",
acknowledgement = ack-nhfb,
keywords =     "multigrid; trabecular bone; human vertebral body;
finite element method; massively parallel computing.",
}

@InProceedings{Wang:2003:PMS,
author =       "Kai Wang and Jun Zhang and Chi Shen",
title =        "Parallel Multilevel Sparse Approximate Inverse
Preconditioners in Large Sparse Matrix Computations",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10694#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap109.pdf",
abstract =     "We investigate the use of the multistep successive
preconditioning strategies (MSP) to construct a class
of parallel multilevel sparse approximate inverse (SAI)
preconditioners. We do not use independent set
ordering, but a diagonal dominance based matrix
permutation to build a multilevel structure. The
purpose of introducing multilevel structure into SAI is
to enhance the robustness of SAI for solving difficult
problems. Forward and backward preconditioning
iteration and two Schur complement preconditioning
strategies are proposed to improve the performance and
to reduce the storage cost of the multilevel
preconditioners. One version of the parallel multilevel
SAI preconditioner based on the MSP strategy is
implemented. Numerical experiments for solving a few
sparse matrices on a distributed memory parallel
computer are reported.",
acknowledgement = ack-nhfb,
}

@InProceedings{Qiang:2003:PPC,
author =       "Ji Qiang and Miguel A. Furman and Robert D. Ryne",
title =        "Parallel Particle-In-Cell Simulation of Colliding
Beams in High Energy Accelerators",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10694#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap223.pdf",
abstract =     "In this paper we present a self-consistent simulation
model of colliding beams in high energy accelerators.
The model, which is based on a particle-in-cell method,
uses a new developed shifted-Green function algorithm
for the efficient calculation of the beam-beam
interaction. The model uses transfer maps to treat the
external focusing elements and a stochastic map to
treat radiation damping and quantum excitation of the
beams. In the parallel implementation we studied
various strategies to deal with the particular nature
of the colliding beam system --- a system in which
there can be significant particle movement between
beam-beam collisions. We chose a particle-field
decomposition approach instead of the conventional
domain decomposition or particle decomposition
approach. The particle-field approach leads to good
load balance, reduced communication cost, and shows the
best scalability on an IBM SP3 among the three parallel
implementations we studied. A performance test of the
beam-beam model on a Cray T3E, IBM SP3, and a PC
cluster is presented. As an application, we studied the
effect of long-range collisions on antiproton lifetime
in the Fermilab Tevatron.",
acknowledgement = ack-nhfb,
}

@InProceedings{Dinda:2003:NQR,
author =       "Peter Dinda and Dong Lu",
title =        "Nondeterministic Queries in a Relational Grid
Information Service",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10695#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap146.pdf",
abstract =     "A Grid Information Service (GIS) stores information
about the resources of a distributed computing
environment. We are developing RGIS, a GIS system based
on the relational
data model. RGIS users can write SQL queries that
search for complex compositions of resources that meet
collective requirements. Executing these queries can be
very expensive, however. In response, we introduce the
nondeterministic query, an extension to the SELECT
statement, which allows the user (and RGIS) to trade
off between the query's running time and the number of
results. The results are a random sample of the
deterministic results, which we argue is sufficient and
appropriate. Herein we describe RGIS, the
nondeterministic query extension, and its
implementation. Our evaluation shows that a meaningful
tradeoff between query time and results returned is
achievable, and that the tradeoff can be used to keep
query time largely independent of query complexity.",
acknowledgement = ack-nhfb,
}

@InProceedings{Kurc:2003:ORC,
author =       "Tahsin Kurc and Feng Lee and Gagan Agrawal and Umit
Catalyurek and Renato Ferreira and Joel Saltz",
title =        "Optimizing Reduction Computations In a Distributed
Environment",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10695#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap135.pdf",
abstract =     "We investigate runtime strategies for data-intensive
applications that involve generalized reductions on
large, distributed datasets. Our set of strategies
includes replicated filter state, partitioned filter
state, and hybrid options between these two extremes.
We evaluate these strategies using emulators of three
real applications, different query and output sizes,
and a number of configurations. We consider execution
in a homogeneous cluster and in a distributed
environment where only a subset of nodes host the data.
Our results show replicating the filter state scales
well and outperforms other schemes, if sufficient
memory is available and sufficient computation is
involved to offset the cost of global merge step. In
other cases, hybrid is usually the best. Moreover, in
almost all cases, the performance of the hybrid
strategy is quite close to the best strategy. Thus, we
believe that hybrid is an attractive approach when the
relative performance of different schemes cannot be
predicted.",
acknowledgement = ack-nhfb,
}

@InProceedings{Shan:2003:JSA,
author =       "Hongzhang Shan and Leonid Oliker and Rupak Biswas",
title =        "Job Superscheduler Architecture and Performance in
Computational Grid Environments",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10695#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap267.pdf",
abstract =     "Computational grids hold great promise in utilizing
geographically separated heterogeneous resources to
solve large-scale complex scientific problems. However,
a number of major technical hurdles, including
distributed resource management and effective job
scheduling, stand in the way of realizing these gains.
In this paper, we propose a novel grid superscheduler
architecture and three distributed job migration
algorithms. We also model the critical interaction
between the superscheduler and autonomous local
schedulers. Extensive performance comparisons with
ideal, central, and local schemes using real workloads
from leading computational centers are conducted in a
simulation environment. In addition, synthetic
workloads are used to perform a detailed sensitivity
analysis of our superscheduler. Several key metrics
demonstrate that substantial performance gains can be
achieved via smart superscheduling in distributed
computational grids.",
acknowledgement = ack-nhfb,
}

@InProceedings{Jaganathan:2003:CNP,
author =       "Ranjesh G. Jaganathan and Keith D. Underwood and Ron
R. Sass",
title =        "A Configurable Network Protocol for Cluster Based
Communications using Modular Hardware Primitives on an
Intelligent {NIC}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10702#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap199.pdf",
abstract =     "The high overhead of generic protocols like TCP/IP
provides strong motivation for the development of a
better protocol architecture for cluster-based parallel
computers. Reconfigurable computing has a unique
opportunity to contribute hardware level protocol
acceleration while retaining the flexibility to adapt
to changing needs. Specifically, applications on a
cluster have various quality of service needs. In
addition, these applications typically run for a long
time relative to the reconfiguration time of an FPGA.
Thus, it is possible to provide application-specific
protocol processing to improve performance and reduce
space utilization. Reducing space utilization permits
the use of a greater portion of the FPGA for other
application-specific processing. This paper focuses on
work to create a set of parameterizable components that
can be put together as needed to obtain a customized
protocol for each application. To study the feasibility
of such an architecture, hardware components were built
that can be stitched together as needed to provide the
required functionality. Feasibility is demonstrated
using four different protocol configurations, namely:
(1) unreliable packet transfer; (2) reliable, unordered
message transfer without duplicate elimination; (3)
reliable, unordered message transfer with duplicate
elimination; and (4) reliable, ordered message transfer
with duplicate elimination. The different
configurations illustrate trade-offs between chip space
and functionality.",
acknowledgement = ack-nhfb,
}

@InProceedings{Feng:2003:OGE,
author =       "Wu-chun Feng and Justin Hurwitz and Harvey B.
Newman and Sylvain Ravot and Roger Les Cottrell and
Olivier Martin and Fabrizio Coccetti and Cheng Jin and
David Wei and Steven Low",
title =        "Optimizing 10-Gigabit {Ethernet} in Networks of
Workstations, Clusters, and Grids: {A} Case Study",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10702#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap293.pdf",
abstract =     "This paper presents a case study of the 10-Gigabit
Ethernet (10GbE) adapter from Intel. Specifically,
with appropriate optimizations to the configurations of
the 10GbE adapter and TCP, we demonstrate that the
10GbE adapter can perform well in local-area,
storage-area, system-area, and wide-area networks. For
local-area, storage-area, and system-area networks in
support of networks of workstations, network-attached
storage, and clusters, respectively, we can achieve
over 7-Gb/s end-to-end throughput and 12$\mu$s
end-to-end latency between applications running on
Linux-based PCs. For the wide-area network in support
of grids, we broke the recently-set Internet2 Land
Speed Record by 2.5 times by sustaining an end-to-end
TCP/IP throughput of 2.38 Gb/s between Sunnyvale,
California and Geneva, Switzerland (i.e., 10,037
kilometers) to move over a terabyte of data in less
than an hour. Thus, the above results indicate that
10GbE may be a cost-effective solution across a
multitude of computing environments.",
acknowledgement = ack-nhfb,
}

@InProceedings{Coll:2003:SHB,
author =       "Salvador Coll and Jose Duato and Fabrizio Petrini and
Francisco J. Mora",
title =        "Scalable Hardware-Based Multicast Trees",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10702#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap300.pdf",
abstract =     "This paper presents an algorithm for implementing
optimal hardware-based multicast trees, on networks
that provide hardware support for collective
communication. Although the proposed methodology can be
generalized to a wide class of networks, we apply our
methodology to the Quadrics network, a state-of-the-art
network that provides hardware-based multicast
communication. The proposed mechanism is intended to
improve the performance of the collective communication
patterns on the network, in those cases where the
hardware support can not be directly used, for
instance, due to some faulty nodes. This scheme
provides significant reduction on multicast latencies
compared to the original system primitives, which use
multicast trees based on unicast communication. A
backtracking algorithm to find the optimal solution to
the problem is presented. In addition, a greedy
algorithm is presented and shown to provide near
optimal solutions. Finally, our experimental results
show the good performance and scalability of the
proposed multicast tree in comparison to the
unicast-based multicast trees. In particular, the new
multicast mechanism doubles barrier synchronization and
broadcast performance when compared to the
production-level MPI library.",
acknowledgement = ack-nhfb,
}

@InProceedings{Balls:2003:SHS,
author =       "Gregory T. Balls and Scott B. Baden and Phillip
Colella",
title =        "{SCALLOP}: {A} Highly Scalable Parallel {Poisson}
Solver in Three Dimensions",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap201.pdf",
abstract =     "SCALLOP is a highly scalable solver and library for
elliptic partial differential equations on regular
block-structured domains. SCALLOP avoids high
communication costs by taking
advantage of the locality properties inherent to
solutions to elliptic PDEs. Communication costs are
small, on the order of a few percent of the total
running time on up to 1024 processors of NPACI's and
NERSC's IBM Power-3 SP systems. SCALLOP trades off
additional computation for reduced communication; its
numerical overheads are independent of the number of
processors for a wide range of problem sizes. SCALLOP
is implicitly designed for infinite domain (free space)
boundary conditions, but the algorithm can be
reformulated to accommodate other boundary conditions.
The SCALLOP library is built on top of the KeLP
programming system and runs on a variety of
platforms.",
acknowledgement = ack-nhfb,
keywords =     "computation-intensive applications; parallel and
distributed algorithms; program optimization and
performance programming",
}

@InProceedings{Nakajima:2003:PIS,
author =       "Kengo Nakajima",
title =        "Parallel Iterative Solvers of {GeoFEM} with Selective
Blocking Preconditioning for Nonlinear Contact Problems
on the {Earth Simulator}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap155.pdf",
abstract =     "An efficient parallel iterative method with selective
blocking preconditioning has been developed for
symmetric multiprocessor (SMP) cluster architectures
with vector processors such as the Earth Simulator.
This method is based on a three-level hybrid parallel
programming model, which includes message passing for
inter-SMP node communication, loop directives by OpenMP
for intra-SMP node parallelization and vectorization
for each processing element (PE). This method provides
robust and smooth convergence and excellent vector and
parallel performance in 3D geophysical simulations with
contact conditions performed on the Earth Simulator.
The selective blocking preconditioning is much more
efficient than ILU(1) and ILU(2). Performance for the
complicated Southwest Japan model with more than 23 M
DOF on 10 SMP nodes (80 PEs) of the Earth Simulator was
161.7 GFLOPS, corresponding to 25.3\% of the peak
performance for hybrid programming model, and 190.4
GFLOPS (29.8\% of the peak performance) for flat MPI,
respectively.",
acknowledgement = ack-nhfb,
}

@InProceedings{Karypis:2003:MCM,
author =       "George Karypis",
title =        "Multi-Constraint Mesh Partitioning for Contact\slash
Impact Computations",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10703#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap302.pdf",
abstract =     "We present a novel approach for decomposing
contact/impact computations in which the mesh elements
come in contact with each other during the course of
the simulation. Effective decomposition of these
computations poses a number of challenges as it needs
to both balance the computations and minimize the
amount of communication that is performed during the
finite element and the contact search phase. Our
approach achieves the first goal by partitioning the
underlying mesh such that it simultaneously balances
both the work that is performed during the finite
element phase and that performed during contact search
phase, while producing subdomains whose boundaries
consist of piecewise axes-parallel lines or planes. The
second goal is achieved by using a decision tree to
decompose the space into rectangular or box-shaped
regions that contain contact points from a single
partition. Our experimental evaluation on a sequence of
100 meshes, shows that this new approach can reduce the
communication overhead compared to existing
algorithms.",
acknowledgement = ack-nhfb,
}

@InProceedings{Akcelik:2003:HRF,
author =       "Volkan Akcelik and Jacobo Bielak and George Biros and
Ioannis Epanomeritakis and Antonio Fernandez and Omar
Ghattas and Eui Joong Kim and Julio Lopez and David
O'Hallaron and Tiankai Tu and John Urbanic",
title =        "High Resolution Forward and Inverse Earthquake
Modeling on Terascale Computers",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10707#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap298.pdf",
abstract =     "For earthquake simulations to play an important role
in the reduction of seismic risk, they must be capable
of high resolution and high fidelity. We have developed
algorithms and tools for earthquake simulation based on
multiresolution hexahedral meshes. We have used this
capability to carry out 1 Hz simulations of the 1994
Northridge earthquake in the LA Basin using 100 million
grid points. Our wave propagation solver sustains 1.21
teraflop/s for 4 hours on 3000 AlphaServer processors
at 80\% parallel efficiency. Because of uncertainties
in characterizing earthquake source and basin material
properties, a critical remaining challenge is to invert
for source and material parameter fields for complex 3D
basins from records of past earthquakes. Towards this
end, we present results for material and source
inversion of high-resolution models of basins
undergoing antiplane motion using parallel scalable
inversion algorithms that overcome many of the
difficulties particular to inverse heterogeneous wave
propagation problems.",
acknowledgement = ack-nhfb,
}

@InProceedings{Kim:2003:IHP,
author =       "Seung Jo Kim and Chang Sung Lee and Jeong Ho Kim and
Minsu Joh and Sangsan Lee",
title =        "{IPSAP} : {A} High-performance Parallel Finite Element
Code for Large-scale Structural Analysis Based on
Domain-wise Multifrontal Technique",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10707#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap238.pdf",
abstract =     "Most of researches for large-scale parallel structural
analysis have focused on iterative solution methods
since direct solution methods generally have many
problems. However, due to the numerical robustness of
direct methods that guarantees the solution to be
obtained within estimated time, direct methods are much
more desirable for general application of large-scale
structural analysis, if the difficulties and
disadvantages can be overcome. In this research, we
propose the domain-wise multifrontal solver as an
efficient direct solver that can overcome most of these
difficulties and disadvantages. By using our own
structural analysis code IPSAP which uses the proposed
solver, we can solve the largest problem ever solved by
direct solvers and can sustain 191 Gflop/s with 256
CPUs on our self-made cluster system, Pegasus. By
implementing the block Lanczos algorithm using our
solver, IPSAP can solve eigenproblems with 7 millions
of DOFs within one hour.",
acknowledgement = ack-nhfb,
}

@InProceedings{Ying:2003:NPK,
author =       "Lexing Ying and George Biros and Denis Zorin and
Harper Langston",
title =        "A new parallel kernel-independent fast multipole
method",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10707#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap166.pdf",
abstract =     "We present a new adaptive fast multipole algorithm and
its parallel implementation. The algorithm is
kernel-independent in the sense that the evaluation of
pairwise interactions does not rely on any analytic
expansions, but only utilizes kernel evaluations. The
new method provides the enabling technology for many
important problems in computational science and
engineering. Examples include viscous flows, fracture
mechanics and screened Coulombic interactions. Our
MPI-based parallel implementation logically separates
the computation and communication phases to avoid
synchronization in the upward and downward computation
passes, and thus allows us to fully exploit computation
and communication overlapping. We measure isogranular
and fixed-size scalability for a variety of kernels on
the Pittsburgh Supercomputing Center's TCS-1
AlphaServer on up to 3000 processors. We have solved
viscous flow problems with up to 2.1 billion unknowns
and we have achieved 1.6 Tflops/s peak performance and
1.13 Tflops/s sustained performance.",
acknowledgement = ack-nhfb,
keywords =     "Fast multipole methods; adaptive algorithms; massively
parallel computing; boundary integral equations; N-body
problems; viscous flows",
}

@InProceedings{Petrini:2003:CMS,
author =       "Fabrizio Petrini and Darren J. Kerbyson and Scott
Pakin",
title =        "The Case of the Missing Supercomputer Performance:
Achieving Optimal Performance on the 8,192 Processors
of {ASCI Q}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10706#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap301.pdf",
abstract =     "In this paper we describe how we improved the
effective performance of ASCI Q, the world's
second-fastest supercomputer, to meet our expectations.
Using an arsenal of performance-analysis techniques
including analytical models, custom microbenchmarks,
full applications, and simulators, we succeeded in
observing a serious --- but previously undetected ---
performance problem. We identified the source of the
problem, eliminated the problem, and ``closed the
loop'' by demonstrating up to a factor of 2 improvement
in application performance. We present our methodology
and provide insight into performance analysis that is
immediately applicable to other large-scale
supercomputers.",
acknowledgement = ack-nhfb,
}

@InProceedings{Dunigan:2003:EEC,
author =       "Thomas H. {Dunigan, Jr.} and Mark R. Fahey and James
B. {White III} and Patrick H. Worley",
title =        "Early Evaluation of the {Cray X1}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10706#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap183.pdf",
abstract =     "Oak Ridge National Laboratory installed a 32 processor
Cray X1 in March, 2003, and will have a 256 processor
system installed by October, 2003. In this paper we
describe our initial evaluation of the X1 architecture,
focusing on microbenchmarks, kernels, and application
codes that highlight the performance characteristics of
the X1 architecture and indicate how to use the system
most efficiently.",
acknowledgement = ack-nhfb,
}

@InProceedings{Oliker:2003:ECB,
author =       "Leonid Oliker and Andrew Canning and Jonathan Carter
and John Shalf and David Skinner and Stephane Ethier
and Rupak Biswas and Jahed Djomehri and Rob {Van der
Wijngaart}",
title =        "Evaluation of Cache-based Superscalar and Cacheless
Vector Architectures for Scientific Computations",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10706#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap255.pdf",
abstract =     "The growing gap between sustained and peak performance
for scientific applications is a well-known problem in
high end computing. The recent development of parallel
vector systems offers the potential to bridge this gap
for many computational science codes and deliver a
substantial increase in computing capabilities. This
paper examines the intranode performance of the NEC
SX-6 vector processor and the cache-based IBM Power3/4
superscalar architectures across a number of scientific
computing areas. First, we present the performance of a
microbenchmark suite that examines low-level machine
characteristics. Next, we study the behavior of the NAS
Parallel Benchmarks. Finally, we evaluate the
performance of several scientific computing codes.
Results demonstrate that the SX-6 achieves high
performance on a large fraction of our applications and
often significantly outperforms the cache-based
architectures. However, certain applications are not
easily amenable to vectorization and would require
extensive algorithm and implementation reengineering to
utilize the SX-6 effectively.",
acknowledgement = ack-nhfb,
}

@InProceedings{Kee:2003:POP,
author =       "Yang-Suk Kee and Jin-Soo Kim and Soonhoi Ha",
title =        "{ParADE}: An {OpenMP} Programming Environment for
{SMP} Cluster Systems",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap130.pdf",
abstract =     "Demand for programming environments to exploit
clusters of symmetric multiprocessors (SMPs) is
increasing. In this paper, we present a new programming
environment, called ParADE, to enable easy, portable,
and high-performance programming on SMP clusters. It is
an OpenMP programming environment on top of a software
distributed shared memory (SDSM) system with a variant
of home-based lazy release
consistency protocol. To boost performance, the runtime
system provides explicit message-passing primitives to
make it a hybrid-programming environment. Collective
communication primitives are used for the
synchronization and work-sharing directives associated
with small data structures, lessening the
synchronization overhead and avoiding the implicit
barriers of work-sharing directives. The OpenMP
translator bridges the gap between the OpenMP
abstraction and the hybrid programming interfaces of
the runtime system. The experiments with several NAS
benchmarks and applications on a Linux-based cluster
show promising results that ParADE overcomes the
performance problem of the conventional SDSM-based
OpenMP environment.",
acknowledgement = ack-nhfb,
keywords =     "programming environment; SMP cluster; software
distributed shared memory; hybrid programming; OpenMP;
MPI",
}

@InProceedings{Weatherly:2003:DMS,
author =       "D. Brent Weatherly and David K. Lowenthal and Mario
Nakazawa and Franklin Lowenthal",
title =        "{Dyn-MPI}: Supporting {MPI} on Non Dedicated
Clusters",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap126.pdf",
abstract =     "Distributing data is a fundamental problem in
implementing efficient distributed-memory parallel
programs. The problem becomes more difficult in
environments where the participating nodes are not
dedicated to a parallel application. We are
investigating the data distribution problem in non
dedicated environments in the context of explicit
message-passing programs.\par

To address this problem, we have designed and
implemented an extension to MPI called Dynamic MPI
(Dyn-MPI). The key component of Dyn-MPI is its run-time
system, which efficiently and automatically
redistributes data on the fly when there are changes in
the application or the underlying environment. Dyn-MPI
supports efficient memory allocation, precise
measurement of system load and computation time, and
node removal. Performance results show that programs
that use Dyn-MPI execute efficiently in non dedicated
environments, including up to almost a three-fold
improvement compared to programs that do not
redistribute data and a 25\% improvement over standard
MPI.",
acknowledgement = ack-nhfb,
}

@InProceedings{Barker:2003:EFD,
author =       "Kevin J. Barker and Nikos P. Chrisochoides",
title =        "An Evaluation of a Framework for the Dynamic Load
Balancing of Highly Adaptive and Irregular Parallel
Applications",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10708#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap268.pdf",
abstract =     "We present an evaluation of a flexible framework and
runtime software system for the dynamic load balancing
of asynchronous and highly adaptive and irregular
applications. These applications, which include
parallel unstructured and adaptive mesh refinement,
serve as building blocks for a large class of
scientific applications. Extensive study has led to
the development of solutions to the dynamic load
balancing problem for loosely synchronous and
computation intensive programs; however, these methods
are not suitable for asynchronous and highly adaptive
applications. We evaluate a new software framework
which includes support for an Active Messages style
communication mechanism, global name space, transparent
object migration, and preemptive decision making. Our
results from both a 3-dimensional parallel advancing
front mesh generation program, as well as a synthetic
microbenchmark, indicate that this new framework
out-performs two existing general-purpose, well-known,
and widely used software systems for the dynamic load
balancing of adaptive and irregular parallel
applications.",
acknowledgement = ack-nhfb,
}

@InProceedings{Romein:2003:MFS,
author =       "John W. Romein and Jaap Heringa and Henri E. Bal",
title =        "A Million-Fold Speed Improvement in Genomic Repeats
Detection",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10710#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap189.pdf",
abstract =     "This paper presents a novel, parallel algorithm for
generating top alignments. Top alignments are used for
finding internal repeats in biological sequences like
proteins and genes. Our algorithm replaces an older,
sequential algorithm (Repro), which was prohibitively
slow for sequence lengths higher than 2000. The new
algorithm is an order of magnitude faster ($O(n^3)$
rather than $O(n^4)$). The paper presents a three-level
parallel implementation of the algorithm: using SIMD
multimedia extensions found on present-day processors
(a novel technique that can be used to parallelize any
application that performs many sequence alignments),
using shared-memory parallelism, and using
distributed-memory parallelism. It allows processing
the longest known proteins (nearly 35000 amino acids).
We show exceptionally high speed improvements: between
548 and 889 on a cluster of 64 dual-processor machines,
compared to the new sequential algorithm. Especially
for long sequences, extreme speed improvements over the
old algorithm are obtained.",
acknowledgement = ack-nhfb,
}

@InProceedings{Chrabakh:2003:GCB,
author =       "Wahid Chrabakh and Rich Wolski",
title =        "{GridSAT}: {A} Chaff-based Distributed {SAT} Solver
for the Grid",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10710#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap252.pdf",
abstract =     "We present GridSAT, a parallel and complete
satisfiability solver designed to solve non-trivial SAT
problem instances using a large number of widely
distributed and heterogeneous resources. The GridSAT
parallel algorithm uses intelligent backtracking,
distributed and carefully scheduled sharing of learned
clauses, and clause reduction. Our implementation
focuses on dynamic resource acquisition and release to
optimize application execution. We show how the large
number of computational resources that are available
from a Grid can be managed effectively for the
application by an automatic scheduler and effective
implementation. GridSAT execution speed is compared
against the best sequential solver as rated by the
SAT2002 competition using a wide variety of problem
instances. The results show that GridSAT delivers
speed-up for all but one of the test problem instances
that are of significant size. In addition, we describe
how GridSAT has solved previously unsolved
satisfiability problems and the domain science
contribution these results make.",
acknowledgement = ack-nhfb,
keywords =     "parallel; distributed; satisfiability; computational
grid",
}

@InProceedings{Vogels:2003:HNC,
author =       "Werner Vogels",
title =        "{HPC.NET} --- are {CLI}-based Virtual Machines
Suitable for High Performance Computing?",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10710#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap251.pdf",
abstract =     "The Common Language Infrastructure is a new,
standardized virtual machine that is likely to become
popular on several platforms. In this paper we review
whether this technology has any future in the
high-performance computing community, for example by
targeting the same application space as the Java-Grande
Forum. We review the technology by benchmarking three
implementations of the CLI and compare those with the
results on Java virtual machines.",
acknowledgement = ack-nhfb,
}

@InProceedings{Makino:2003:PET,
author =       "Junichiro Makino and Eiichiro Kokubo and Toshiyuki
Fukushige and Hiroshi Daisaka",
title =        "Performance evaluation and tuning of {GRAPE-6} ---
towards 40 `real' {Tflops}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10711#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap116.pdf",
abstract =     "In this paper, we describe the performance
characteristics of GRAPE-6, the sixth-generation
special-purpose computer for gravitational many-body
problems. GRAPE-6 consists of 2048 custom pipeline
chips, each of which integrates six pipeline processors
specialized for the calculation of gravitational
interaction between particles. The GRAPE hardware
performs the evaluation of the interaction. The
frontend processors perform all other operations, such
as the time integration of the orbits of particles,
I/O, on-the-fly analysis etc. The theoretical peak
speed of GRAPE-6 is 63.4 Tflops. We present the result
of benchmark runs, and discuss the performance
characteristics. We also present the measured
performance for a few real scientific applications. The
best performance so far achieved with real applications
is 35.3 Tflops.",
acknowledgement = ack-nhfb,
}

@InProceedings{Komatitsch:2003:BDF,
author =       "Dimitri Komatitsch and Seiji Tsuboi and Chen Ji and
Jeroen Tromp",
title =        "A 14.6 billion degrees of freedom, 5 teraflops, 2.5
terabyte earthquake simulation on the {Earth
Simulator}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10711#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap124.pdf",
abstract =     "We use 1944 processors of the Earth Simulator to model
seismic wave propagation resulting from large
earthquakes. Simulations are conducted based upon the
spectral-element method, a high-degree finite-element
technique with an exactly diagonal mass matrix. We use
a very large mesh with 5.5 billion grid points (14.6
billion degrees of freedom). We include the full
complexity of the Earth, i.e., a three-dimensional
wave-speed and density structure, a 3-D crustal model,
ellipticity as well as topography and bathymetry. A
total of 2.5 terabytes of memory is needed. Our
implementation is purely based upon MPI, with loop
vectorization on each processor. We obtain an excellent
vectorization ratio of 99.3\%, and we reach a
performance of 5 teraflops (30\% of the peak
performance) on 38\% of the machine. The very high
resolution of the mesh allows us to perform fully
three-dimensional calculations at seismic periods as
low as 5 seconds.",
acknowledgement = ack-nhfb,
}

@InProceedings{Warren:2003:SSM,
author =       "Michael S. Warren and Chris L. Fryer and M. Patrick
Goda",
title =        "The {Space Simulator}: Modeling the Universe from
Supernovae to Cosmology",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10711#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap226.pdf",
abstract =     "The Space Simulator is a 294-processor Beowulf cluster
with theoretical peak performance just below 1.5
Teraflop/s. It is based on the Shuttle XPC SS51G mini
chassis. Each node consists of a 2.53 GHz Pentium 4
processor, 1 Gb of 333 MHz DDR SDRAM, an 80 Gbyte
Maxtor hard drive, and a 3Com 3C996B-T Gigabit Ethernet
card. The network is made up of a Foundry FastIron 1500
and 800 Gigabit Ethernet switch. Each individual node
cost less than \$1000, and the entire system cost under
\$500,000. The cluster achieved Linpack performance of
665.1 Gflop/s on 288 processors in October 2002, making
it the 85th fastest computer in the world according to
the 20th TOP500 list. Performance has since improved to
757.1 Linpack Gflop/s, ranking at \#88 on the 21st
TOP500 list. This is the first machine in the TOP500 to
surpass Linpack price/performance of 1 dollar per
Mflop/s.",
acknowledgement = ack-nhfb,
}

@InProceedings{Dally:2003:MSS,
author =       "William J. Dally and Patrick Hanrahan and Mattan Erez
and Timothy J. Knight and Francois Labonte and Jung-Ho
Ahn and Nuwan Jayasena and Ujval J. Kapasi and Abhishek
Das and Jayanth Gummaraju and Ian Buck",
title =        "{Merrimac}: Supercomputing with Streams",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10717#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap246.pdf",
abstract =     "Merrimac uses stream architecture and advanced
interconnection networks to give an order of magnitude
more performance per unit cost than cluster-based
scientific computers built from the same technology.
Organizing the computation into streams and exploiting
the resulting locality using a register hierarchy
enables a stream architecture to reduce the memory
bandwidth required by representative applications by an
order of magnitude or more. Hence a processing node
with a fixed bandwidth (expensive) can support an order
of magnitude more arithmetic units (inexpensive). This
in turn allows a given level of performance to be
achieved with fewer nodes (a 1-PFLOPS machine, for
example, with just 8,192 nodes) resulting in greater
reliability, and simpler system management. We sketch
the design of Merrimac, a streaming scientific computer
that can be scaled from a \$20K 2 TFLOPS workstation to a \$20M 2 PFLOPS supercomputer and present the results
of some initial application experiments on this
architecture.",
acknowledgement = ack-nhfb,
}

@InProceedings{Taiji:2003:PEP,
author =       "Makoto Taiji and Tetsu Narumi and Yousuke Ohno and
Noriyuki Futatsugi and Atsushi Suenaga and Naoki Takada
and Akihiko Konagaya",
title =        "{Protein Explorer}: {A} Petaflops Special-Purpose
Computer System for Molecular Dynamics Simulations",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10717#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap168.pdf",
abstract =     "We are developing the `Protein Explorer' system, a
petaflops special-purpose computer system for molecular
dynamics simulations. The Protein Explorer is a PC
cluster equipped with special-purpose engines that
calculate nonbonded interactions between atoms, which
is the most time-consuming part of the simulations. A
dedicated LSI `MDGRAPE-3 chip' performs these force
calculations at a speed of 165 gigaflops or higher. The
system will have 6,144 MDGRAPE-3 chips to achieve a
nominal peak performance of one petaflop. The system
will be completed in 2006. In this paper, we describe
the project plans and the architecture of the Protein
Explorer.",
acknowledgement = ack-nhfb,
}

@InProceedings{Anderson:2003:EES,
author =       "Wendell Anderson and Preston Briggs and C. Stephen
Hellberg and Daryl W. Hess and Alexei Khokhlov and
Marco Lanzagorta and Robert Rosenberg",
title =        "Early Experience with Scientific Programs on the {Cray
MTA-2}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10717#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap271.pdf",
abstract =     "We describe our experiences porting and tuning three
scientific programs to the Cray MTA-2, paying
particular attention to the problems posed by I/O. We
have measured the performance of each of the programs
over many different machine configurations and we
report on the scalability of each program. In addition,
we compare the performance of the MTA with that of an
SGI Origin running all three programs.",
acknowledgement = ack-nhfb,
}

@InProceedings{Singh:2003:MCS,
author =       "Gurmeet Singh and Shishir Bharathi and Ann Chervenak
and Ewa Deelman and Carl Kesselman and Mary Manohar and
Sonal Patil and Laura Pearlman",
title =        "A Metadata Catalog Service for Data Intensive
Applications",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10718#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap242.pdf",
abstract =     "Advances in computational, storage and network
technologies as well as middleware such as the Globus
Toolkit allow scientists to expand the sophistication
and scope of data-intensive applications. These
applications produce and analyze terabytes and
petabytes of data that are distributed in millions of
files or objects. To manage these large data sets,
metadata describing the data needs to be managed.
There are various types of metadata, and it is likely
that a range of metadata services will exist in Grid
environments that are specialized for particular types
of metadata cataloguing and discovery.
the design of a Metadata Catalog Service (MCS) that
provides a mechanism for storing and accessing
descriptive metadata and allows users to query for data
items based on desired attributes. We describe our
experience in using the MCS with several applications
and present a scalability study of the service.",
acknowledgement = ack-nhfb,
}

@InProceedings{Deelman:2003:GBG,
author =       "Ewa Deelman and Raymond Plante and Carl Kesselman and
Gurmeet Singh and Mei Su and Gretchen Greene and Robert
Hanisch and Niall Gaffney and Antonio Volpicelli and
James Annis and Vijay Sekhri and Tamas Budavari and
Maria Nieto-Santisteban and William O'Mullane and David
Bohlender and Tom McGlynn and Arnold Rots and Olga
Pevunova",
title =        "Grid-Based Galaxy Morphology Analysis for the
{National Virtual Observatory}",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10718#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap282.pdf",
abstract =     "As part of the development of the National Virtual
Observatory (NVO), a Data Grid for astronomy, we have
developed a prototype science application to explore
the dynamical history of galaxy clusters by analyzing
the galaxies' morphologies. The purpose of the
prototype is to investigate how Grid-based technologies
can be used to provide specialized computational
services within the NVO environment. In this paper we
focus on the key enabling technology components,
particularly Chimera and Pegasus which are used to
create and manage the computational workflow that must
be present to deal with the challenging application
requirements. We illustrate how the components
interplay with each other and can be driven from a
special purpose application portal.",
acknowledgement = ack-nhfb,
}

@InProceedings{Allen:2003:LPB,
author =       "Matthew S. Allen and Rich Wolski",
title =        "The {Livny} and {Plank-Beck} Problems: Studies in Data
Movement on the Computational Grid",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10718#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap266.pdf",
abstract =     "Research on scheduling this data management has
focused on both the problem of distributing the storage
load among a set of servers and on replication as a way
of ensuring reliability and data proximity. In order to
store large data sets and keep their load balanced
across many hosts, many applications choose to divide
these sets into sections and distribute them. To access
these files reliably in spite of individual host
failures, these sections are frequently replicated
across many file servers. While the projects cited
above have each explored these problems in different
ways, commonalities among the various successful
solutions are beginning to emerge. In this paper, we
investigate two such commonalities, identified by noted
researchers in the field: Dr. Miron Livny [4] from the
University of Wisconsin, and Dr. James Plank [2] and
Dr. Micah Beck [3] from the University of Tennessee.",
acknowledgement = ack-nhfb,
}

@InProceedings{Jones:2003:ISP,
author =       "Terry Jones and William Tuel and Larry Brenner and
Jeff Fier and Patrick Caffrey and Shawn Dawson and Rob
Neely and Robert Blackmore and Brian Maskell and Paul
Tomlinson and Mark Roberts",
title =        "Improving the Scalability of Parallel Jobs by adding
Parallel Awareness to the Operating System",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10716#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap136.pdf",
abstract =     "A parallel application benefits from scheduling
policies that include a global perspective of the
application's process working set. As the interactions
among cooperating processes increase, mechanisms to
ameliorate waiting within one or more of the processes
become more important. In particular, collective
operations such as barriers and reductions are
extremely sensitive to even usually harmless events
such as context switches among members of the process
working set. For the last 18 months, we have been
researching the impact of random short-lived
interruptions such as timer-decrement processing and
periodic daemon activity, and developing strategies to
minimize their impact on large processor-count SPMD
bulk-synchronous programming styles. We present a novel
co-scheduling scheme for improving performance of
fine-grain collective activities such as barriers and
reductions, describe an implementation consisting of
operating system kernel modifications and run-time
system, and present a set of empirical results
comparing the technique with traditional operating
system scheduling. Our results indicate a speedup of
over 300\% on synchronizing collectives.",
acknowledgement = ack-nhfb,
}

@InProceedings{Fernandez:2003:BMN,
author =       "Juan Fernandez and Eitan Frachtenberg and Fabrizio
Petrini",
title =        "{BCS-MPI}: a New Approach in the System Software
Design for Large-Scale Parallel Computers",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10716#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap306.pdf",
abstract =     "Buffered CoScheduled MPI (BCS-MPI) introduces a new
approach to design the communication layer for
large-scale parallel machines. The emphasis of BCS-MPI
is on the global coordination of a large number of
communicating processes rather than on the traditional
optimization of the point-to-point performance. BCS-MPI
delays the interprocessor communication in order to
schedule globally the communication pattern and it is
designed on top of a minimal set of collective
communication primitives. In this paper we describe a
prototype implementation of BCS-MPI and its
communication protocols. Several experimental results,
executed on a set of scientific applications, show that
BCS-MPI can compete with a production-level MPI
implementation, but is much simpler to implement, debug
and model. Keywords: MPI, buffered coscheduling, STORM,
cluster computing, large-scale parallel computers.",
acknowledgement = ack-nhfb,
}

@InProceedings{Moody:2003:SNB,
author =       "Adam Moody and Juan Fernandez and Fabrizio Petrini and
Dhabaleswar K. Panda",
title =        "Scalable {NIC}-based Reduction on Large-Scale
Clusters",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10716#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap316.pdf",
abstract =     "Many parallel algorithms require efficient reduction
collectives. In response, researchers have designed
algorithms considering a range of parameters including
data size, system size, and communication
characteristics. Throughout this past work, however,
processing was limited to the host CPU. Today, modern
Network Interface Cards (NICs) sport programmable
processors with substantial memory, and thus introduce
a fresh variable into the equation. In this paper, we
investigate this new option in the context of
large-scale clusters. Through experiments on the
960-node, 1920-processor ASCI Linux Cluster (ALC) at
Lawrence Livermore National Laboratory, we show that
NIC-based reductions outperform host-based algorithms
in terms of reduced latency and increased consistency.
In particular, in the largest configuration tested ---
1812 processors --- our NIC-based algorithm summed
single-element vectors of 32-bit integers and 64-bit
floating-point numbers in 73 $\mu$s and 118 $\mu$s,
respectively. These results represent respective
improvements of 121\% and 39\% over the
production-level MPI library.",
acknowledgement = ack-nhfb,
}

@InProceedings{Worringen:2003:FPN,
author =       "Joachim Worringen and Jesper Larsson Tr{\"a}ff and Hubert
Ritzdorf",
title =        "Fast Parallel Non-Contiguous File Access",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap319.pdf",
abstract =     "Many applications of parallel I/O perform
non-contiguous file accesses: instead of accessing a
single (large) block of data in a file, a number of
(smaller) blocks of data scattered throughout the file
needs to be accessed in each logical I/O operation.
However, only few file system interfaces directly
support this kind of non-contiguous file access. In
contrast, the most commonly used parallel programming
interface, MPI, incorporates a flexible model of
parallel I/O through its MPI-IO interface. With MPI-IO,
arbitrary non-contiguous file accesses are supported in
a uniform fashion by the use of derived MPI datatypes
set up by the user to reflect the desired I/O
pattern.\par

Despite a considerable amount of recent work in this
area, current MPI-IO implementations suffer from low
performance of such non-contiguous accesses when
compared to the performance of the storage system for
contiguous accesses. In this paper we analyze an
important bottleneck in the efficient handling of
non-contiguous access patterns in current
implementations of MPI-IO. We present a new technique,
termed listless I/O, that can be incorporated into
MPI-IO implementations like the well-known ROMIO
implementation, and completely eliminates this
bottleneck. We have implemented the technique in
MPI/SX, the MPI implementation for the NEC SX-series of
parallel vector computers. Results with a synthetic
benchmark and an application kernel show that listless
I/O is able to increase the bandwidth for
non-contiguous file access by sometimes more than a
factor of 500 when compared to the traditional
approach.",
acknowledgement = ack-nhfb,
}

@InProceedings{Li:2003:PNH,
author =       "Jianwei Li and Wei-keng Liao and Alok Choudhary and
Robert Ross and Rajeev Thakur and William Gropp and Rob
Latham and Andrew Siegel and Brad Gallagher and Michael
Zingale",
title =        "{Parallel netCDF}: {A} High-Performance Scientific
{I/O} Interface",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap258.pdf",
abstract =     "Dataset storage, exchange, and access play a critical
role in scientific applications. For such purposes
netCDF serves as a portable, efficient file format and
programming interface, which is popular in numerous
scientific application domains. However, the original
interface does not provide an efficient mechanism for
parallel data storage and access. In this work, we
present a new parallel interface for writing and
reading netCDF datasets. This interface is derived with
minimal changes from the serial netCDF interface but
defines semantics for parallel access and is tailored
for high performance. The underlying parallel I/O is
achieved through MPI-IO, allowing for substantial
performance gains through the use of collective I/O
optimizations. We compare the implementation strategies
and performance with HDF5. Our tests indicate
programming convenience and significant I/O performance
improvement with this parallel netCDF (PnetCDF)
interface.",
acknowledgement = ack-nhfb,
}

@InProceedings{Klasky:2003:GPD,
author =       "Scott Alan Klasky and Stephane Ethier and Zhihong Lin
and Kevin Martins and Doug McCune and Ravi Samtaney",
title =        "Grid-Based Parallel Data Streaming implemented for the
Gyrokinetic Toroidal Code",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10722#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap207.pdf",
abstract =     "We have developed a threaded parallel data streaming
approach using Globus to transfer multi-terabyte
simulation data from a remote supercomputer to the
scientist's home analysis/visualization cluster, as the
simulation executes, with negligible overhead. Data
transfer experiments show that this concurrent data
transfer approach is more favorable compared with
writing to local disk and then transferring this data
to be post-processed. The present approach is conducive
to using the grid to pipeline the simulation with
post-processing and visualization. We have applied this
method to the Gyrokinetic Toroidal Code (GTC), a
3-dimensional particle-in-cell code used to study
micro-turbulence in magnetic confinement fusion from
first principles plasma theory.",
acknowledgement = ack-nhfb,
}

@InProceedings{Wisniewski:2003:EUS,
author =       "Robert W. Wisniewski and Bryan Rosenburg",
title =        "Efficient, Unified, and Scalable Performance
Monitoring for Multiprocessor Operating Systems",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10721#0;
http://www.sc-conference.org/sc2003/paperpdfs/pap121.pdf",
abstract =     "Programming, understanding, and tuning the performance
of large multiprocessor systems is challenging. Experts
have difficulty achieving good utilization for
applications on large machines. The task of
implementing a scalable system such as an operating
system or database on large machines is even more
challenging. And the importance of achieving good
performance on multiprocessor machines is increasing as
the number of cores per chip increases and as the size
of multiprocessors increases. Crucial to achieving good
performance is being able to understand the behavior of
the system.\par

We have developed an efficient, unified, and scalable
tracing infrastructure that allows for correctness
debugging, performance debugging, and performance
monitoring of an operating system. The infrastructure
allows variable-length events to be logged without
locking and provides random access to the event records.
The infrastructure allows cheap and parallel logging of
events by applications, libraries, servers, and the
kernel. The infrastructure was designed for K42, a new
open-source research kernel designed to scale near
perfectly on large cache-coherent 64-bit multiprocessor
systems. The techniques are generally applicable, and
many of them have been integrated into the Linux Trace
Toolkit. In this paper, we describe the implementation
of the infrastructure, how we used the facility, e.g.,
analyzing lock contention, to understand and achieve
K42's scalable performance, and the lessons we learned.
The infrastructure has been invaluable to achieving
great scalability.",
acknowledgement = ack-nhfb,
}

@InProceedings{Itzkowitz:2003:MPU,
author =       "Marty Itzkowitz and Brian J. N. Wylie and Christopher
Aoki and Nicolai Kosche",
title =        "Memory Profiling using Hardware Counters",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10721#1;
http://www.sc-conference.org/sc2003/paperpdfs/pap182.pdf",
abstract =     "Although memory performance is often a limiting factor
in application performance, most tools only show
performance data relating to the instructions in the
program, not to its data. In this paper, we describe a
technique for directly measuring the memory profile of
an application. We describe the tools and their user
model, and then discuss a particular code, the MCF
benchmark from SPEC CPU 2000. We show performance data
for the data structures and elements, and discuss the
use of the data to improve program performance.
Finally, we discuss extensions to the work to provide
feedback to the compiler for prefetching and to
generate additional reports from the data.",
acknowledgement = ack-nhfb,
}

%%% NOTE(review): pages = "??--??" is a placeholder; final pagination was
%%% not yet available at bibdate. Update from the ACM Digital Library
%%% record for SC '03 when known. The "de Supinski" surname relies on
%%% BibTeX's lowercase-von parsing of First-von-Last form, which yields
%%% the intended Last = "Supinski", von = "de".
@InProceedings{Mohan:2003:IES,
author =       "Tushar Mohan and Bronis R. de Supinski and Sally A.
McKee and Frank Mueller and Andy Yoo and Martin
Schulz",
title =        "Identifying and Exploiting Spatial Regularity in Data
Memory References",
crossref =     "ACM:2003:SII",
pages =        "??--??",
year =         "2003",
bibdate =      "Wed Nov 26 07:34:20 2003",
URL =          "http://www.sc-conference.org/sc2003/inter_cal/inter_cal_detail.php?eventid=10721#2;
http://www.sc-conference.org/sc2003/paperpdfs/pap290.pdf",
abstract =     "The growing processor/memory performance gap causes
the performance of many codes to be limited by memory
accesses. If known to exist in an application, strided
memory accesses forming streams can be targeted by
optimizations such as prefetching, relocation,
remapping, and vector loads. Undetected, they can be a
significant source of memory stalls in loops. Existing
stream-detection mechanisms either require special
hardware, which may not gather statistics for
subsequent analysis, or are limited to compile-time
detection of array accesses in loops. Formally, little
treatment has been accorded to the subject; the concept
of locality fails to capture the existence of streams
in a program's memory accesses. The contributions of
this paper are as follows. First, we define spatial
regularity as a means to discuss the presence and
effects of streams. Second, we develop measures to
quantify spatial regularity, and we design and
implement an on-line, parallel algorithm to detect
streams -- and hence regularity -- in running
applications. Third, we use examples from real codes
and common benchmarks to illustrate how derived stream
statistics can be used to guide the application of
profile-driven optimizations. Overall, we demonstrate
the benefits of our novel regularity metric as an
instrument to detect potential for code optimizations
affecting memory performance.",
acknowledgement = ack-nhfb,
}

%%% ====================================================================
%%% Cross-referenced entries must come last:

@Proceedings{ACM:2003:SII,
editor =       "{ACM}",
booktitle =    "SC2003: Igniting Innovation. {Phoenix, AZ, November
15--21, 2003}",
title =        "{SC2003}: Igniting Innovation. {Phoenix, AZ, November
15--21, 2003}",
publisher =    pub-ACM # " and " # pub-IEEE,
`