### -*-awk-*-
### ====================================================================
###  @Awk-file{
###     author          = "Nelson H. F. Beebe",
###     version         = "1.00",
###     date            = "11 November 1999",
###     time            = "11:00:00 CET",
###     filename        = "bibsplit.awk",
###     copyright       = "Copyright (C) 1999 Nelson H. F. Beebe",
###     address         = "Center for Scientific Computing
###                        University of Utah
###                        Department of Mathematics, 322 INSCC
###                        155 S 1400 E RM 233
###                        Salt Lake City, UT 84112-0090
###                        USA",
###     telephone       = "+1 801 581 5254",
###     FAX             = "+1 801 585 1640, +1 801 581 4148",
###     URL             = "http://www.math.utah.edu/~beebe",
###     checksum        = "44397 1815 7730 60823",
###     email           = "beebe@math.utah.edu, beebe@acm.org,
###                        beebe@ieee.org (Internet)",
###     codetable       = "ISO/ASCII",
###     keywords        = "bibliography, BibTeX, splitting",
###     supported       = "yes",
###     docstring       = "This program implements the main processing
###                        of the bibsplit utility.  It is invoked by
###                        a UNIX shell script which handles some of
###                        the command-line options, and system-, user-,
###                        and directory-specific initialization files.
###
###                        Full documentation can be found in the
###                        accompanying UNIX manual pages which are
###                        installed with the program.
###
###                        The first version of this program was
###                        written mostly on the last
###                        Veteran's/Remembrance Day of the Twentieth
###                        Century, and is dedicated to the memory of
###                        those we lost in wars.  The revision date
###                        and time intentionally reflect that.
###
###                        The checksum field above contains a CRC-16
###                        checksum as the first value, followed by the
###                        equivalent of the standard UNIX wc (word
###                        count) utility output of lines, words, and
###                        characters.  This is produced by Robert
###                        Solovay's checksum utility.",
###  }
### ====================================================================

########################################################################
########################################################################
########################################################################
###                                                                  ###
### bibsplit: split BibTeX bibliography files into independent parts ###
###                                                                  ###
###              Copyright (C) 1999 Nelson H. F. Beebe               ###
###                                                                  ###
### This program is covered by the GNU General Public License (GPL), ###
### version 2 or later, available as the file COPYING in the program ###
### source distribution, and on the Internet at                      ###
###                                                                  ###
###               ftp://ftp.gnu.org/gnu/GPL                          ###
###                                                                  ###
###               http://www.gnu.org/copyleft/gpl.html               ###
###                                                                  ###
### This program is free software; you can redistribute it and/or    ###
### modify it under the terms of the GNU General Public License as   ###
### published by the Free Software Foundation; either version 2 of   ###
### the License, or (at your option) any later version.              ###
###                                                                  ###
### This program is distributed in the hope that it will be useful,  ###
### but WITHOUT ANY WARRANTY; without even the implied warranty of   ###
### MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    ###
### GNU General Public License for more details.                     ###
###                                                                  ###
### You should have received a copy of the GNU General Public        ###
### License along with this program; if not, write to the Free       ###
### Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,   ###
### MA 02111-1307 USA                                                ###
########################################################################
########################################################################
########################################################################

########################################################################
########################################################################
########################################################################
### NB: The programming conventions for variables in this program    ###
###     are:                                                         ###
###                                                                  ###
###     UPPERCASE               global constants and user options    ###
###     Initialuppercase        global variables                     ###
###     lowercase               local variables                      ###
###                                                                  ###
### Outer comments begin with three sharps, block comments with two, ###
### and inline comments with one, by venerable tradition from Lisp.  ###
###                                                                  ###
### Functions are defined in strict lexicographic order.             ###
###                                                                  ###
### Local variables inside functions are declared in the function    ###
### argument list after a space, and appear in lexicographic order.  ###
###                                                                  ###
### Any deviation from these conventions is an error!                ###
########################################################################
########################################################################
########################################################################

########################################################################
### Index of global variables (there are 50, including 20 tables, which
### is far too many, but unavoidable in an implementation language that
### has no namespace control; the alternative is as many global access
### functions).  The list from which this table was derived can be
### reliably produced like this:
###
###	env AWKGLOBALS=/dev/stdout pawk -f bibsplit.awk /dev/null | \
###		awk '{print "###\t" $NF}' | \
###			sort -f -u | \
###				awk '{print "###"; print}'
###
###	BibTeX_Files_of_Referenced_Label[] table of lists of temporary
###					filenames, indexed by citation
###					labels which appear in a
###					\cite{} or crossref="label";
###					those files need the entry with
###					that label
###
###	BibTeX_to_Temp_Map[]		table of temporary filenames,
###					indexed by output BibTeX
###					filenames; this is the inverse
###					of Temp_to_BibTeX_Map[]
###
###	BYCENTURY			non-zero if ``-bycentury'' was
###					used
###
###	BYDECADE			non-zero if ``-bydecade'' was
###					used
###
###	BYHALFCENTURY			non-zero if ``-byhalfcentury''
###					was used
###
###	BYLABEL				non-zero if ``-bylabel'' was
###					used
###
###	BYNUMBER			nnnn if ``-bynumber nnnn'' was
###					used
###
###	BYPENTAD			non-zero if ``-bypentad'' was
###					used
###
###	BYRANGE				non-empty if ``-byrange
###					range-list'' was used
###
###	BYSCORE				non-zero if ``-byscore'' was
###					used
###
###	BYYEAR				non-zero if ``-byyear'' was used
###
###	Comment_Block			current block of comment lines
###
###	Destination[]			table of output BibTeX
###					filenames, indexed by citation
###					label
###
###	Entry_Comment_Header		current entry-block comments
###
###	Entry_Count			total non-@Preamble, non-@String
###					entries processed
###
###	File_Comment_Header[]		table of file comment headers,
###					indexed by input filename
###
###	File_Comment_Trailer		current file comment trailer
###
###	FILTER				command string from ``-filter
###					command'' option
###
###	Input_Filename[]		table of input file names,
###					indexed by temporary output
###					filenames
###
###	Input_Files[]			table of 1's, indexed by input
###					file name
###
###	Input_File_Count		count of input files seen so far
###
###	Input_File_List			space-separated list of input
###					filenames
###
###	Label_BibTeX_File[]		table of (label,BibTeX-file)
###					pairs that record which
###					entries have already been
###					output
###
###	MAXOPEN				value from ``-maxopen nnn''
###					option
###
###	MAXTEMPOPEN			maximum number of open temporary
###					files (reduced from MAXOPEN)
###
###	N_Open_Temp_Files		number of currently open
###					temporary files
###
###	Open_Tempfiles_by_Name[]	table of 1's, indexed by
###					currently open temporary
###					filenames
###
###	Open_Tempfiles_by_Number[]	table of open temporary files,
###					indexed by
###					1..N_Open_Temp_Files
###
###	Part_Entry_Count		total non-@Preamble, non-@String
###					entries processed from the
###					current output file if
###					``-bynumber nnnn'' was used
###
###	Preamble			block of all @Preamble{}
###					sections encountered
###
###	Preamble_Comment_Header[]	table of preamble comment
###					headers, indexed by input
###					filename
###
###	PREFIX				output filename prefix, from
###					``-prefix xxx''
###
###	Range_of_Letter[]		table of letters and/or letter
###					pairs, from ``-byrange
###					range-list'', indexed by
###					characters 0..255
###
###	References_of_Label[]		table of lists of citation
###					labels not yet output to
###					BibTeX files, indexed by
###					citation label, initially
###					directly referenced from the
###					entry with that label, and
###					later expanded to include all
###					those ultimately reachable
###					from the entry with that label
###
###	Start_Entry_Count		starting entry count in current
###					temporary output file, counting
###					monotonically from the first
###					input entry
###
###	Strings_of_BibTeX_File[]	table of 1's, indexed by
###					(bibtex-filename,string-name)
###
###	String_Comment_Header[]		table of comment headers,
###					indexed by @String{} name
###
###	String_Entry[]			table of @String{} groups,
###					indexed by @String{} name
###
###	String_First_Entry[]		table of @String{} groups,
###					indexed by @String{} name, but
###					with each run of horizontal
###					and vertical whitespace
###					collapsed to a single blank
###
###	String_Location[]		table of filename:record-number
###					pairs, indexed by @String{}
###					name
###
###	String_Reference_Count[]	table of string name reference
###					counts, indexed by @String{}
###					name
###
###	Temp_to_BibTeX_Map[]		table of output filenames,
###					indexed by temporary
###					filenames; this is the inverse
###					of BibTeX_to_Temp_Map[]
###
###	This_Filename			current input filename
###
###	This_Input_Entry_Count		total entries, including
###					@Preamble and @String, in the
###					current input file
###
###	This_Prefix			last temporary filename prefix
###					generated by prefixname()
###
###	This_Reference_Entry_Count	total non-@Preamble, non-@String
###					entries in the current input
###					file
###
###	TMP				file extension for temporary
###					files
###
###	TMPDIR				directory name for temporary
###					files
###
###	TMPEXT				random unique file extension for
###					temporary files
###
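###	YEAR_INTERVAL			number of publication years in a
###					year group in each output file;
###					set in initialize() from the
###					``-byxxx'' year option in effect
###					(100, 50, 20, 10, 5, or 1), or
###					to 1000000000 if none was given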
###
########################################################################

BEGIN					{ initialize() }

FILENAME != THIS_FILENAME 		{ new_file() }

/^ *%/					{ do_comment($0); next }

/^ *@[Ss][Tt][Rr][Ii][Nn][Gg] *{/	{ do_string($0); next }

/^ *@[Pp][Rr][Ee][Aa][Mm][Bb][Ll][Ee] *{/ { do_preamble($0); next }

/^ *@[A-Za-z]+{[^,]*,/			{ do_entry($0); next }

/^[ \b\f\r\t\v]*$/			{ next } # ignore blank and empty lines

					# Everything else is converted to a comment
					{ do_comment("%%% " $0); next }

END					{ terminate() }

### ====================================================================


function append_file(infile,outfile, old_RS)
{
    ## Append infile to outfile.

    ## During development, I wondered how the time would change if I
    ## switched to this code:
    ##
    ##     close(outfile)
    ##     system("cat " infile " >> " outfile)
    ##
    ## instead of the loop below.  A test run on a 15MB test file with
    ## 22,235 bibliographic entries showed that the cat-version ran
    ## slightly slower (about 2%).  For portability, it is better to
    ## use awk for the copy.

    old_RS = RS

    ## Optimization: copy records as paragraphs, instead of as lines.
    ## With mawk, this makes no noticeable difference, but with gawk,
    ## execution time is reduced by about 5%, and with nawk, about 23%.

    RS = ""			# records are now paragraphs
    while ((getline < infile) > 0)
    {
	print > outfile
	print "" > outfile	# don't forget the paragraph break!
    }

    RS = old_RS
}


function basename(filename)
{
    ## Remove any trailing file extension from filename, and
    ## return the result.  Any leading directory path is preserved.

    sub("[.][^.]*$","",filename)
    return (filename)
}


function brace_count(s, k,n,t)
{
    ## NB: This implementation of brace_count() is new with bibsort
    ## version 0.13 (and borrowed for bibsplit); see the README file in
    ## the bibsort distribution for a lengthy performance report.  The
    ## old algorithm is labeled bc-1 there, and the new one, bc-2.  On
    ## the tests there, the new one was up to 25.6 times faster.

    n = 0
    t = s
    while ((k = index(t,"{")) > 0)
    {
	n++
	t = substr(t,k+1)
    }
    t = s
    while ((k = index(t,"}")) > 0)
    {
	n--
	t = substr(t,k+1)
    }
    return (n)
}


function close_oldest_file( k)
{
    ## Close the file named by the first entry in the
    ## Open_Tempfiles_by_Name[] list, and shuffle that list down to
    ## make room for a new open file.

    if (N_Open_Temp_Files > 1)
    {
	close(Open_Tempfiles_by_Number[1])
	delete Open_Tempfiles_by_Name[Open_Tempfiles_by_Number[1]]
	for (k = 2; k <= N_Open_Temp_Files; ++k)
	    Open_Tempfiles_by_Number[k-1] = Open_Tempfiles_by_Number[k]
	delete Open_Tempfiles_by_Number[N_Open_Temp_Files]
	N_Open_Temp_Files--
    }
}


function collect_braced_item(infile,s, count,item,line)
{
    ## Starting with the current contents of s, collect lines from
    ## infile until we reach a zero brace count. To guard against
    ## infinite loops in the event of unbalanced braces, we abruptly
    ## terminate processing if an at-sign is detected in column 1.
    ## This function is adapted from a simpler (infile-less) version
    ## in bibsort.

    count = brace_count(trim_trailing(squeeze(s)))
    item = s "\n"
    while (count != 0)
    {
	if (infile == FILENAME)	# then in outer pattern/action loop
	{
	    if ((getline line) <= 0)
		    break
	}
	else			# reading a different input file
	{
	    if ((getline line < infile) <= 0)
		    break
	}
	if (substr(line,1,1) == "@") # should use match(line,/^[ \t]+@/),
				   # but this is faster, and usually correct
	    error("New entry encountered before balanced braces found")
	item = item trim_trailing(line) "\n"
	count += brace_count(line)
    }
    return item
}


function copy_entries(infile,label_list, bibtex_filenames,entry,k, \
		      label_table,n,nlabels,this_label)
{
    ## Read bibliographic entries from infile, and for each label that
    ## is found in the space-separated label_list, output it to all of
    ## the files in BibTeX_Files_of_Referenced_Label[label], in sorted
    ## order, so that any error messages that arise in the validation
    ## suite appear in the same order everywhere.

    if (label_list == "")	# this is commonly the case
	return

    progress_report("searching " infile " for missing referenced items " \
		    sample(label_list))

    nlabels = list_to_inverted_table(label_list,label_table)
    while ((entry = get_next_entry(infile)) != -1)
    {
	this_label = get_label(entry)

	## Because previous calls to this function may have augmented
	## the original contents of infile, we must be sure to only
	## consider labels that were part of the original infile, and
	## thus, we have the second test in this conditional. I
	## overlooked it in the first design, but dogged torture
	## testing fortunately exposed the error.

	if ((this_label in label_table) && (Destination[this_label] == infile))
	{
	    ## We have referenced this entry somewhere else and we need
	    ## to output it to all the referenced files.  To avoid too
	    ## many open output files, and avoid conflict with open
	    ## input files, we close each output file after appending a
	    ## single entry to it.

	    n = split(BibTeX_Files_of_Referenced_Label[this_label],bibtex_filenames," ")
	    quicksort(bibtex_filenames,1,n)

	    for (k = 1; k <= n; ++k)
	    {
		if (infile != bibtex_filenames[k]) # avoid writing input file,
		{		# although this condition should never arise
		    if (!((this_label,bibtex_filenames[k]) in Label_BibTeX_File))
		    {	# then this_label has not yet been written to bibtex_filenames[k]
			print entry >> bibtex_filenames[k]
			Label_BibTeX_File[this_label,bibtex_filenames[k]] = 1
			close(bibtex_filenames[k])
		    }
		}
	    }
	    if (--nlabels <= 0)		# label_list has been processed completely
		break
	}
    }
    close(infile)
}


function count(array, key,n)
{
    n = 0
    for (key in array)
        n++
    return (n)
}


function delete_files(filelist)
{
    ## Delete the files in the space-separated filelist.

    if (filelist != "")
	system("rm -f " filelist)
}


function do_comment(s)
{
    Comment_Block = Comment_Block trim_trailing(s) "\n"
}


function do_entry(s)
{
    output_entry_to_tempfile(collect_braced_item(FILENAME,s))
    Entry_Count++		# total non-@Preamble, non-@String entries
				# processed
    Part_Entry_Count++		# total non-@Preamble, non-@String
				# entries processed from the current output
				# file if ``-bynumber nnnn'' was specified
    This_Input_Entry_Count++	# total entries, including @Preamble and
				# @String, in the current input file
    This_Reference_Entry_Count++ # total non-@Preamble, non-@String
				# entries in the current input file
}


function do_preamble(s)
{
    if (Comment_Block != "")
    {
	if (This_Input_Entry_Count == 0)
	    File_Comment_Header[This_Filename] = \
		File_Comment_Header[This_Filename] Comment_Block
	else
	    Preamble_Comment_Header[This_Filename] = \
		Preamble_Comment_Header[This_Filename] Comment_Block
	Comment_Block = ""
    }
    Preamble = Preamble ((Preamble == "") ? "" : "\n\n") \
	collect_braced_item(FILENAME,s)
    This_Input_Entry_Count++
}


function do_string(s, item,name)
{
    name = s
    item = collect_braced_item(FILENAME,s)
    name = get_string_name(item)
    if (name in String_Entry)
    {
	if (squeeze_whitespace(item) != String_First_Entry[name])
	{
	    warning("duplicate definition of " name ": previous definition at " \
		    String_Location[name])
	    String_Entry[name] = String_Entry[name] "\n\n" item
	}
    }
    else
    {
	String_Entry[name] = item
	String_First_Entry[name] = squeeze_whitespace(item)
	String_Location[name] = this_filename() ":" FNR
	String_Reference_Count[name] = 0
    }
    if (Comment_Block != "")
    {
	if (This_Input_Entry_Count == 0)
	    File_Comment_Header[This_Filename] = \
		File_Comment_Header[This_Filename] Comment_Block
	else
	    String_Comment_Header[name] = String_Comment_Header[name] Comment_Block
	Comment_Block = ""
    }
    This_Input_Entry_Count++
}


function error(message)
{
    ## Print a message and terminate with a failing exit code.

    warning("FATAL ERROR: " message)
    exit(1)
}


function expand_file_references(label,bibtex_filename,expanded, k,list,n,parts)
{
    ## Return a space-separated list of the label, and all labels that
    ## its entry ultimately references, where each member of the
    ## returned list has not yet been output to bibtex_filename.
    ##
    ## The second argument, expanded[], is an array of 1's indexed by
    ## label, which is used to terminate recursion in the common case
    ## of circular dependencies.  On the first (outer) call, it MUST
    ## be empty.

    if (label == "") {print "%% DEBUG: Unexpected NULL label passed to expand_file_references()" ">/dev/stderr"; exit(255)}

    list = ""
    if (label in expanded) # then this label already recursively expanded
	list = ""
    else if (label in References_of_Label) # this label has dependents
    {
	expanded[label] = 1
	n = split(References_of_Label[label],parts," ")
	list =  ((label,bibtex_filename) in Label_BibTeX_File) ? "" : label
	for (k = 1; k <= n; ++k)
	    list = list " " expand_file_references(parts[k],bibtex_filename,expanded)
    }

    gsub(" +"," ",list)		# reduce duplicate separators
    return (list)
}


function expand_file_references_outer(label,bibtex_filename, expanded)
{
    ## This is a wrapper for expand_file_references() to ensure
    ## that the expanded[] array is empty on each outermost call.

    return (expand_file_references(label,bibtex_filename,expanded))
}


function filter_output( command,k,n,outfile)
{
    ## If FILTER is non-empty, for each output file in ascending
    ## lexicographic order, apply the user-specified filter to that
    ## file, producing a modified temporary output file, and if it
    ## succeeds, rename the temporary file to the original file name.
    ## The temporary file is named with suffix TMP instead of TMPEXT,
    ## so that the validation suite sees the same file name on all
    ## systems.

    if (FILTER != "")
    {
	n = table_to_sorted_table(Temp_to_BibTeX_Map,outfile)

	for (k = 1; k <= n; ++k)
	{
	    ## To reduce clutter, we don't show the final command that
	    ## removes any temporary file left from a failed filter
	    ## step.

	    command = ("< " outfile[k] " " FILTER " > " outfile[k] TMP \
		       " && mv " outfile[k] TMP " " outfile[k])
	    progress_report("filtering: " command)
	    system(command "; rm -f " outfile[k] TMP)
	}
    }
}


function get_label(entry, label)
{
    ## Extract the citation label from entry, and return a lowercase
    ## version of it, or "UNKNOWN" if no label can be found.

    if (match(entry,"^ *@[A-Za-z]+{[^,]*,"))
    {
	label = substr(entry,RSTART,RLENGTH)
	sub("^ *@[A-Za-z]+{[ \t]*","",label)
	sub("[ \t]*,","",label)
	label = tolower(label) # BibTeX ignores lettercase in labels
    }
    else
	label = "UNKNOWN"

    return (label)
}


function get_next_entry(infile, line)
{
    ## Get the next non-@String, non-@Preamble, BibTeX entry from
    ## infile, ignoring blank and empty lines, and comments, and
    ## return it.  If no entry can be found, return -1.

    while ((getline line < infile) > 0)
    {
	if (match(line,"^@[^{]*{[^,]*,"))
	    return (collect_braced_item(infile,line))
    }
    return (-1)
}


function get_string_name(s)
{
    ## Given @String{name = "value"} in s, extract and return name in
    ## lowercase (since BibTeX ignores lettercase in string names).

    ## This removal of the quoted string value should not be
    ## necessary, but this program uncovered a bug in mawk-1.3.3 (and
    ## probably earlier) where the next gsub() incorrectly reduced
    ##
    ## ``@String{j-COMPCON-SPRING89      = "Digest of Papers of {COMPCON} Spring '89"}''
    ##
    ## to
    ##
    ## ``COMPCON} Spring '89"}''
    ##
    ## instead of to
    ##
    ## ``j-COMPCON-SPRING89''
    ##
    ## The bug was reported on [12-Nov-1999] to the appropriate awk
    ## developers.

    gsub("\".*$","",s)

    gsub("^[^{]+[{][ \t]*","",s)
    gsub("[ \t]*=.*$","",s)
    return (tolower(s))
}


function initialize( k)
{
    ## Handle all startup initializations.  The invoking shell script
    ## has already handled several options (-?, -author, -copyright,
    ## -help, -logfile file, -outfile file, -quick, -silent, and
    ## -version) that we need not deal with here, and it saved only
    ## the most recent -byxxx option, since that is the one that
    ## is documented to apply.

    ## Coerce all numeric command-line options from strings to numbers
    BYCENTURY += 0
    BYHALFCENTURY += 0
    BYDECADE += 0
    if (BYNUMBER ~ "^0+$")	# special case: remap explicit 0 to infinity
	BYNUMBER = 2147483647	# 2^31 - 1
    BYNUMBER += 0
    BYPENTAD += 0
    BYSCORE += 0
    BYYEAR += 0

    if (BYCENTURY)
	YEAR_INTERVAL = 100
    else if (BYHALFCENTURY)
	YEAR_INTERVAL = 50
    else if (BYSCORE)
	YEAR_INTERVAL = 20
    else if (BYDECADE)
	YEAR_INTERVAL = 10
    else if (BYPENTAD)
	YEAR_INTERVAL = 5
    else if (BYYEAR)
	YEAR_INTERVAL = 1
    else	# splitting every 1,000,000,000 years will keep things together!
	YEAR_INTERVAL = 1000000000

    if (BYLABEL)	# convert to equivalent ``-byrange rangelist''
	BYRANGE = "a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w,x,y,z"

    initialize_range_table()

    ## The number of allowable open files varies radically between
    ## operating systems.  On some (e.g., Compaq/DEC OpenVMS and VAX
    ## VMS), it may also depend on user quotas and the number of open
    ## files in other jobs belonging to the current user.  On others
    ## (IBM PC DOS), it may depend on local configuration files
    ## (files=nnn in config.sys).
    ##
    ## We want to supply a default value for MAXOPEN that works almost
    ## everywhere, because the awk language provides no way of
    ## determining the limit. We cannot just try to open a series of
    ## scratch files, because an open failure raises an error, and
    ## cannot be caught.
    ##
    ## Experiments with gawk, mawk, and nawk showed that nawk was the
    ## most restrictive: its run.c file raises an error if more than
    ## FOPEN_MAX files are opened.
    ##
    ## I examined the C preprocessor output for run.c on several
    ## systems to produce the table below, and augmented it with
    ## measurements from a short C program, tryopen.c, that opened
    ## output files until failure occurred.
    ##
    ## Since stdin, stdout, and stderr are required to be open at the
    ## start of execution, the last column is expected to be 3 less
    ## than the preceding column, yet on some systems, the number that
    ## can really be opened is much larger than FOPEN_MAX.
    ##
    ## Indeed, the Sun Solaris 2.7 <stdio.h> file contains this
    ## comment: ``The value of _NFILE [== FOPEN_MAX] is defined in the
    ## Processor Specific ABI.  The value is chosen for historical
    ## reasons rather than for truly processor related attribute.
    ## Note that the SPARC Processor Specific ABI uses the common UNIX
    ## historical value of 20 so it is allowed to fall through.''
    ##
    ##  ========================================================================
    ##  Vendor and Model        O/S Version                    FOPEN_MAX tryopen
    ##  ========================================================================
    ##  Apple Macintosh PPC G3  Rhapsody 5.5                          20     253
    ##  DEC Alpha               OSF/1 4.0F                            64    4093
    ##  HP 9000/735             HP-UX 10.01                           60      57
    ##  IBM PowerPC             AIX 4.2                             2000    1997
    ##  Intel Pentium II MMX    GNU/Linux 2.2.5-22 (Redhat 6.0)      256    1021
    ##  NeXT Turbostation       Mach 3.3                             256     253
    ##  SGI Indigo/2            IRIX 5.3                             100     197
    ##  SGI Origin 200          IRIX 6.5                             100     197
    ##  Sun SPARC               GNU/Linux 2.2.5-22smp (Redhat 6.0)   256    1021
    ##  Sun SPARC               Solaris 2.6                           20      61
    ##  Sun SPARC               Solaris 2.7                           20      61
    ##  ========================================================================
    ##
    ## These variations suggest that awk implementations should not
    ## believe FOPEN_MAX, but should simply continue to open files
    ## until the open fails.

    MAXOPEN += 0
    if (MAXOPEN <= 0)
	MAXOPEN = 20		# minimum value from the table above
    MAXTEMPOPEN = MAXOPEN
    MAXTEMPOPEN -= 5		# stdin, stdout, and stderr are already
				# open, one command-line input file
				# will be soon, and one output .bib
				# file will be open later: 3 + 1 + 1 = 5
    if (MAXTEMPOPEN <= 0)	# replace nonsense values by a reasonable limit
	MAXTEMPOPEN = 20 - 5	# based on the table above

    Start_Entry_Count = 1

    if (TMPDIR == "")
	TMPDIR = ("TMPDIR" in ENVIRON) ? ENVIRON["TMPDIR"] : "/tmp"

    ## Because the temporary files are usually written into a public
    ## scratch directory, there is the possibility of name collisions.
    ## We handle this by including a large random number in the
    ## temporary file extension.
    ##
    ## On UNIX, this would normally lead to file names of the form
    ## foo.tmp.513870662, but some operating systems do not support
    ## multiple dots in filename, so we use underscores instead of
    ## dots, producing foo_tmp_513870662.
    ##
    ## TMP gets used later as a suffix on the output file names in
    ## filter_output() if a ``-filter command'' option was given.
    ##
    ## The temporary files should only rarely be seen anyway: just
    ## after a failure of this program, and then only in a temporary
    ## file directory.  Primitive file systems, like MS-DOS with its
    ## obnoxious 8+3 limits on file name lengths, will require more
    ## changes that I'm not going to support for now.
    srand()			# seed with time-of-day
    TMP = "_tmp"
    TMPEXT = TMP "_" int(rand()*1000000000)

    ## Enforce a portable set of characters in output filenames,
    ## including directory component separators: backslash (IBM PC DOS
    ## and Microsoft Windows), colon (Apple Macintosh MacOS), and
    ## slash (UNIX): the other characters come from the ISO 9660
    ## standard for filenames on CD-ROMs: Volume and File Structure of
    ## CD-ROM for Information Interchange, ISO 9660:1988(E), and we
    ## permit one additional character, hyphen, since it is available
    ## in almost all common file systems.
    gsub("[^-:\\\\/A-Za-z0-9._]","",PREFIX)
}


function initialize_range_table( k,letter,n,range)
{
    ## Expand the ``-byrange range-list'' to allow a fast constant-time
    ## lookup.

    k = (BYRANGE == "") ? 0 : split(tolower(BYRANGE),range,",")
    for ( ; k > 0; --k)		# do a sanity check on the specified ranges
    {
	if (!match(range[k],"^[a-z][-_][a-z]$") && !match(range[k],"^[a-z]$"))
	    error("invalid -byrange value [" range[k] "] in [" BYRANGE "]")
    }

    ## In the first draft of this program, range lookup was handled by a
    ## function find_range(letter) that scanned the range table
    ## sequentially.  Profiling with pawk on a 15MB test file with
    ## 22,235 bibliographic entries showed that about 15% of the run
    ## time was spent in that function, and it was the hottest spot in
    ## the whole program, so it has been optimized away by precomputing
    ## a table, Range_of_Letter[], indexed by letter.

    for (n = 0; n <= 255; ++n)
	Range_of_Letter[sprintf("%c",n)] = "UNKNOWN"

    for (n = 1; n <= 26; ++n)
    {
	letter = substr("abcdefghijklmnopqrstuvwxyz",n,1)
	for (k in range)	# the access order does not matter here
	{
	    if (substr(range[k],1,1) == letter)
	    {
		Range_of_Letter[letter] = range[k]
		break
	    }
	    else if ((substr(range[k],1,1) <= letter) && \
		    (substr(range[k],2,1) ~ "[-_]") && \
		    (letter <= substr(range[k],3,1)))
	    {
		Range_of_Letter[letter] = range[k]
		break
	    }
	}
    }
}


function less(a,b)
{
    ## Ordering predicate for the sort: return 1 when a sorts strictly
    ## before b, and 0 otherwise.  quicksort() relies on it indirectly,
    ## through partition(), so changing the sort order only requires
    ## changing this function.

    if (a < b)
	return (1)
    return (0)
}


function list_to_inverted_table(list,inverted_table, k,n,parts)
{
    ## Split the space-separated list into tokens and store, for each
    ## token, its 1-based position in the list in
    ## inverted_table[token].  When a token occurs more than once,
    ## the position of its last occurrence is the one that remains.
    ##
    ## The return value is whatever count(inverted_table) reports,
    ## which can be smaller than the number of list elements when the
    ## list holds duplicates.

    n = split(list,parts," ")
    k = 0
    while (++k <= n)
	inverted_table[parts[k]] = k

    return (count(inverted_table))
}


function make_tempfilename(s, temp_filename,the_basename)
{
    ## Work out which temporary file should receive the entry s, and
    ## return that file's name.  The first time a given temporary
    ## filename is produced, it is also registered in the global
    ## arrays Input_Filename[], Temp_to_BibTeX_Map[], and
    ## BibTeX_to_Temp_Map[], and any existing file of that name is
    ## removed.

    ## NB: basename() must NOT be applied here: it would damage a name
    ## built from a prefix that contains a dot.
    the_basename = prefixname() suffix_byxxx(s)

    temp_filename = TMPDIR "/" tailname(the_basename) TMPEXT

    if (temp_filename in Temp_to_BibTeX_Map)	# already registered
	return (temp_filename)

    Input_Filename[temp_filename] = This_Filename
    Temp_to_BibTeX_Map[temp_filename] = the_basename ".bib"
    BibTeX_to_Temp_Map[the_basename ".bib"] = temp_filename

    ## Every temporary file must start out empty.  awk cannot do that
    ## for us by opening the file in truncate mode, because the
    ## ``-maxopen nnn'' support may close a temporary file and reopen
    ## it later, which forces all writes to use append mode.
    delete_files(temp_filename)

    return (temp_filename)
}


function new_file()
{
    ## Reset the per-file state each time awk moves on to another
    ## input file.

    ## Comment text still pending from the previous file becomes that
    ## file's trailer.
    if ((This_Filename != "") && (Comment_Block != ""))
	File_Comment_Trailer[This_Filename] = Comment_Block
    Comment_Block = ""

    THIS_FILENAME = FILENAME	# used in the outer pattern/action block
    This_Filename = this_filename()

    ## Remember the file, both as a set member and in the ordered,
    ## space-separated list of all input files.
    Input_Files[This_Filename] = 1
    if (Input_File_List == "")
	Input_File_List = This_Filename
    else
	Input_File_List = Input_File_List " " This_Filename
    Input_File_Count++

    This_Input_Entry_Count = 0
    This_Reference_Entry_Count = 0

    progress_report("reading " This_Filename)
}


function open_tempfile(tempfilename)
{
    ## Note that tempfilename is about to be written, and return it
    ## unchanged.  No real open happens here: awk opens a file the
    ## first time an I/O statement names it.
    ##
    ## The bookkeeping exists for the ``-maxopen nnnn'' option (or a
    ## default MAXTEMPOPEN limit): once the number of files recorded
    ## as open exceeds MAXTEMPOPEN, close_oldest_file() is called.

    if (tempfilename in Open_Tempfiles_by_Name)
	return (tempfilename)	# already recorded as open

    Open_Tempfiles_by_Name[tempfilename] = 1
    N_Open_Temp_Files++
    Open_Tempfiles_by_Number[N_Open_Temp_Files] = tempfilename

    if (N_Open_Temp_Files > MAXTEMPOPEN)
	close_oldest_file()

    return (tempfilename)
}


function output_all_bibfiles( filenames,k,n,name)
{
    ## Second pass: turn every temporary file into its final output
    ## BibTeX file by calling output_bibfile() on it, and afterwards
    ## delete all of the temporary files.

    ## The temporary filenames are sorted first, so that the files
    ## are handled in a predictable, lexicographic order.
    n = table_to_sorted_inverted_table(Temp_to_BibTeX_Map,filenames)

    k = 0
    while (++k <= n)
    {
	close(filenames[k])	# finish first-pass output before reading it back
	output_bibfile(filenames[k])
    }

    delete_files(table_to_list(filenames,n))
}


function output_bibfile(temp_filename, n,outfile)
{
    ## Write the final BibTeX file that corresponds to temp_filename.
    ## In order, it receives: the file comment header, the preamble,
    ## the @String{...} definitions recorded for the file, the entry
    ## comment header, the contents of temp_filename, and the file
    ## comment trailer.

    ## Safety feature: never write over an input file.  While the
    ## chosen output name is also the name of an input file, append
    ## "_1", "_2", ... to it (the suffixes accumulate), and store the
    ## final name back in Temp_to_BibTeX_Map[].  Underscore is used
    ## rather than dot for the portability reasons given in
    ## initialize() for TMPEXT.
    ##
    ## NOTE(review): record_string_use() keys Strings_of_BibTeX_File[]
    ## by the name in effect during the first pass, so after a rename
    ## here output_referenced_strings(outfile) finds no strings for
    ## the new name -- confirm whether that case needs handling.
    n = 0
    outfile = Temp_to_BibTeX_Map[temp_filename]
    while (outfile in Input_Files)
	outfile = sprintf("%s_%d",outfile,++n)
    Temp_to_BibTeX_Map[temp_filename] = outfile

    progress_report("writing " outfile)

    output_file_comment_header(temp_filename,outfile)
    output_preamble(temp_filename,outfile)
    output_referenced_strings(outfile)
    output_entry_comment_header(temp_filename,outfile)
    append_file(temp_filename,outfile)
    output_file_comment_trailer(temp_filename,outfile)

    close(outfile)
    close(temp_filename)
}


function output_comment_block(tempfilename)
{
    ## Dispose of the pending comment text in Comment_Block, if any,
    ## and then clear it.  Where the text goes depends on how far the
    ## current input file has been read:
    ##
    ##   * no entry seen yet: it is appended to the file's
    ##     File_Comment_Header[] text;
    ##   * no reference entry seen yet: it is appended to the file's
    ##     Entry_Comment_Header[] text;
    ##   * otherwise: it is written to tempfilename.

    if (Comment_Block == "")
	return

    if (This_Input_Entry_Count == 0)
    {
	File_Comment_Header[This_Filename] = \
	    File_Comment_Header[This_Filename] Comment_Block
    }
    else if (This_Reference_Entry_Count == 0)
    {
	Entry_Comment_Header[This_Filename] = \
	    Entry_Comment_Header[This_Filename] Comment_Block
    }
    else
    {
	## NB: append mode (>>) is mandatory, because the temporary
	## file may have been closed and reopened in the meantime.
	print Comment_Block >> tempfilename
    }

    Comment_Block = ""
}


function output_cross_references(label,entry,this_bibtex_filename, bibtex_filenames,k,n,t)
{
    ## An entry that has been cited or cross-referenced must also be
    ## present in every other output file that refers to it.  Append
    ## the entry to the temporary file of each such output file,
    ## skipping this_bibtex_filename itself and any file that has
    ## already received the entry, and then record the entry's own
    ## dependents.
    ##
    ## This relies on BibTeX's required ordering, in which an entry
    ## follows all entries that cross-reference it; when that holds,
    ## references are resolved right here in the first pass.
    ## Whatever is missed is handled later by
    ## output_unsatisfied_references().

    if (label in BibTeX_Files_of_Referenced_Label)
    {
	n = split(BibTeX_Files_of_Referenced_Label[label],bibtex_filenames," ")
	for (k = 1; k <= n; ++k)
	{
	    if (bibtex_filenames[k] == this_bibtex_filename)
		continue	# the entry's own file: avoid duplicate output

	    if ((label,bibtex_filenames[k]) in Label_BibTeX_File)
		continue	# that file already has this entry

	    t = BibTeX_to_Temp_Map[bibtex_filenames[k]]
	    open_tempfile(t)	# to handle the `-maxopen nnn' support
	    print entry >> t	# entry ends with newline, so normal paragraph break
	    Label_BibTeX_File[label,bibtex_filenames[k]] = 1 # to remember this output
	}

	## BibTeX_Files_of_Referenced_Label[label] is deliberately left
	## untouched, even when every listed file has now been served:
	## output_unsatisfied_references() needs the element to exist
	## in order to construct dependency lists correctly.  An
	## earlier version shrank or deleted the element here, and
	## that made some cross-references go unresolved.
    }

    record_dependents(label,entry,this_bibtex_filename)
}


function output_entry_comment_header(temp_filename,outfile)
{
    ## If comment text was saved in Entry_Comment_Header[] for the
    ## input file associated with temp_filename, write it to outfile.

    if (Input_Filename[temp_filename] in Entry_Comment_Header)
    {
	print Entry_Comment_Header[Input_Filename[temp_filename]] >outfile
    }
}


function output_entry_to_tempfile(entry, label,tempfilename)
{
    ## Append the entry to the temporary file selected for it by the
    ## ``-byxxx'' command-line option, remember where the entry is
    ## headed, and process its cross-references and dependents.

    tempfilename = make_tempfilename(entry)
    open_tempfile(tempfilename)	# bookkeeping for ``-maxopen nnn''

    ## Pending comment text precedes the entry.
    output_comment_block(tempfilename)

    ## NB: append mode (>>) is mandatory, because the temporary file
    ## may have been closed and reopened since it was last written.
    ## The entry already ends with a newline, so print supplies the
    ## blank line that separates paragraphs.
    print entry >> tempfilename

    ## Record the final destination BibTeX file of this label, so
    ## that unresolved cross-references can be patched up after the
    ## second pass.
    label = get_label(entry)
    Destination[label] = Temp_to_BibTeX_Map[tempfilename]
    Label_BibTeX_File[label,Temp_to_BibTeX_Map[tempfilename]] = 1

    output_cross_references(label,entry,Temp_to_BibTeX_Map[tempfilename])
}


function output_file_comment_header(temp_filename,outfile)
{
    ## Start outfile with a file comment header.  The header saved for
    ## the input file associated with temp_filename is used when there
    ## is one; a "%% -*-BibTeX-*-" mode line is written ahead of it
    ## unless the saved header already begins with such a line.  With
    ## no saved header, only the mode line is written.

    if (!(Input_Filename[temp_filename] in File_Comment_Header))
    {
	print "%% -*-BibTeX-*-\n" >outfile
	return
    }

    if (File_Comment_Header[Input_Filename[temp_filename]] \
	!~ "^%+ *-[*]-BibTeX-[*]-")
	print "%% -*-BibTeX-*-\n" >outfile

    print File_Comment_Header[Input_Filename[temp_filename]] >outfile
}


function output_file_comment_trailer(temp_filename,outfile)
{
    ## If comment text was saved in File_Comment_Trailer[] for the
    ## input file associated with temp_filename, write it to outfile.

    if (Input_Filename[temp_filename] in File_Comment_Trailer)
    {
	print File_Comment_Trailer[Input_Filename[temp_filename]] > outfile
    }
}


function output_preamble(temp_filename,outfile)
{
    ## Write the global Preamble text to outfile, preceded by the
    ## Preamble_Comment_Header[] text of the input file associated
    ## with temp_filename, when such text exists.

    if (Input_Filename[temp_filename] in Preamble_Comment_Header)
    {
	print Preamble_Comment_Header[Input_Filename[temp_filename]] > outfile
    }

    print Preamble > outfile
}


function output_referenced_strings(outfile, k,n,name,pair,parts,string_names)
{
    ## Write to outfile, in sorted order of name, every @String{...}
    ## definition recorded for outfile in Strings_of_BibTeX_File[],
    ## each one preceded by its String_Comment_Header[] text when it
    ## has any.  The String_Reference_Count[] of every string written
    ## is incremented.

    ## Collect the names: the table's indices have the form
    ## (bibtex_filename SUBSEP string_name).
    n = 0
    for (pair in Strings_of_BibTeX_File)
    {
	split(pair,parts,SUBSEP)
	if (parts[1] != outfile)
	    continue		# belongs to some other output file
	string_names[++n] = parts[2]
    }

    quicksort(string_names,1,n)

    k = 0
    while (++k <= n)
    {
	name = string_names[k]
	if (name in String_Comment_Header)
	    print String_Comment_Header[name] > outfile
	print String_Entry[name] > outfile
	String_Reference_Count[name]++
    }
}


function output_unreferenced_strings( k,n,name,outfile,string_names)
{
    ## Save every @String{...} definition whose
    ## String_Reference_Count[] is still zero in a separate file named
    ## This_Prefix "UNUSED.bib", sorted by string name.  No file is
    ## created when there are no such definitions.

    n = 0
    for (name in String_Reference_Count)
    {
	if (String_Reference_Count[name] == 0)
	    string_names[++n] = name
    }

    if (n == 0)
	return			# every string was used somewhere

    quicksort(string_names,1,n)

    outfile = This_Prefix "UNUSED.bib"
    progress_report("writing " outfile)

    ## Explanatory file header.
    print "%% -*-BibTeX-*-" >outfile
    print "%% This file contains all @String{...} definitions that appear to be unused" >outfile
    print "%% in the input BibTeX file" ((Input_File_Count > 1) ? "s" : "") \
	    ": " Input_File_List >outfile
    print "" >outfile

    ## The definitions, each after its own comment header, if any.
    k = 0
    while (++k <= n)
    {
	name = string_names[k]
	if (name in String_Comment_Header)
	    print String_Comment_Header[name] > outfile
	print String_Entry[name] > outfile
    }

    close(outfile)
}


function output_unsatisfied_references( bibtex_filenames,k,label,n,\
					needed_labels_from_bibtex_file,parts)
{
    ## Third pass: copy entries that are cited or cross-referenced
    ## from an output BibTeX file, but are not yet present in it, into
    ## that file.  Takes no arguments; everything after the extra
    ## space in the parameter list is a local variable.
    ##
    ## Examine the tables of labels that have been cited, or cross
    ## referenced, and find those which have not yet been output to the
    ## BibTeX file in which they are referenced.  This can happen if an
    ## earlier entry was cited or cross-referenced by a later one.  Such
    ## instances are expected to be relatively uncommon, so the overhead
    ## of this function should normally not be large.
    ##
    ## In the worst case, each output BibTeX file will be read
    ## completely, effectively making a third pass over the input
    ## stream.  However, if we don't do it here, then the poor human
    ## user will have to do the same thing manually, and that is error
    ## prone, slow, and tedious.

    ## First prepare lists of space-separated labels indexed by output
    ## BibTeX file name: needed_labels_from_bibtex_file[F] accumulates
    ## the labels returned by expand_file_references_outer() whose
    ## Destination[] is F.
    ##
    ## NOTE(review): a returned label with no Destination[] element
    ## would be filed under the empty file name "" -- confirm that
    ## expand_file_references_outer() never returns such labels.

    for (label in References_of_Label)
    {				# loop over all labels that have dependents
	if (label in BibTeX_Files_of_Referenced_Label)
	{
	    n = split(expand_file_references_outer(label,Destination[label]),parts," ")
	    for (k = 1; k <= n; ++k)
		needed_labels_from_bibtex_file[Destination[parts[k]]] = \
		    needed_labels_from_bibtex_file[Destination[parts[k]]] " " parts[k]
	}
	else if (!(label in Destination)) # we have an inconsistent bibliography
	    warning("unsatisfied \\cite{" label "} or crossref=\"" label "\"")
    }

    ## Because we need the bibsplit validation suite to behave
    ## reproducibly everywhere, we must sort items in the
    ## needed_labels_from_bibtex_file[] lists, and its indexing list of
    ## bibtex_filenames as well. That way, any
    ##
    ##     searching FILENAME for missing referenced items LABEL LABEL ...
    ##
    ## or
    ##
    ##     unsatisfied \cite{LABEL} or crossref="LABEL"
    ##
    ## messages will be the same order in all implementations.
    ##
    ## NOTE(review): the "unsatisfied" warning above is issued from
    ## inside the unsorted ``for (label in ...)'' loop, so its order
    ## appears to depend on the awk implementation's traversal order
    ## -- confirm whether the sorting below is meant to cover it.

    n = table_to_sorted_inverted_table(needed_labels_from_bibtex_file,bibtex_filenames)

    ## Now copy each missing entry from its output BibTeX file to other
    ## BibTeX files that reference it: copy_entries() is given one
    ## output file name and the sorted list of labels collected for it.

    for (k = 1; k <= n; ++k)
	copy_entries(bibtex_filenames[k],
		     sort_list(needed_labels_from_bibtex_file[bibtex_filenames[k]]))
}


function partition(array,left,right, i,j,swap,v)
{
    ## Helper for quicksort(): rearrange array[left..right] around the
    ## pivot value v = array[right], and return the pivot's final
    ## index.  On return, no element left of that index sorts after
    ## v, and no element right of it sorts before v, according to
    ## less().

    i = left - 1
    j = right
    v = array[right]

    while (1)
    {
	## Advance i to the next element that does not sort before the
	## pivot; the pivot itself, at array[right], stops this scan.
	do
	{
	    ++i
	} while (less(array[i],v))

	## Retreat j to the next element that does not sort after the
	## pivot, but never below left.
	for (;;)
	{
	    --j
	    if (!less(v,array[j]))
		break
	    if (j == left)
		break
	}

	if (i >= j)		# the two scans have met
	    break

	swap = array[i]		# exchange the out-of-place pair
	array[i] = array[j]
	array[j] = swap
    }

    swap = array[i]		# move the pivot into its final position
    array[i] = array[right]
    array[right] = swap

    return (i)
}


function prefixname( prefix)
{
    ## Return the prefix for output filenames: PREFIX (from
    ## ``-prefix xxx'') when it is nonempty, and otherwise the
    ## basename() of the current input filename.  The result is also
    ## saved in the global This_Prefix, which is used when the unused
    ## strings file is named.

    if (PREFIX == "")
	prefix = basename(this_filename())
    else
	prefix = PREFIX

    This_Prefix = prefix
    return (prefix)
}


function progress_report(s)
{
    ## Print the progress message s on stdout, and flush it at once so
    ## that the user sees it without delay.  Nothing else in the
    ## program writes to stdout; the invoking shell script may have
    ## redirected it for the ``-outfile filename'' or ``-silent''
    ## command-line options.

    print s
    fflush("/dev/stdout")
}


function quicksort(array,left,right, i)
{
    ## Sort array[left..right] in place into the ascending order
    ## defined by less(a,b), which conceals how items are compared.
    ##
    ## partition() and quicksort() are a direct translation of the
    ## simple quicksort given in Robert Sedgewick's ``Algorithms in
    ## C'', 3rd edition, Addison-Wesley, 1998, pp. 305--307.  An
    ## O(N lg N) method is used in preference to a simpler O(N^2) one
    ## because the lists to be sorted can have thousands of elements.
    ## None of the well-known refinements (guarding against the
    ## O(N^2) worst case, or switching to another algorithm for short
    ## sequences) has been attempted.

    if (left < right)		# ranges of fewer than two elements are sorted
    {
	i = partition(array,left,right)
	quicksort(array, left, i - 1)
	quicksort(array, i + 1, right)
    }
}


function record_cite_use(current_label,s,bibtex_filename, k,labels,n,parts)
{
    ## Scan the BibTeX entry text s for every \cite{label,label,...}
    ## instance.  Each cited label is recorded as a reference made by
    ## current_label, and as an entry that is needed in
    ## bibtex_filename (see BibTeX_Files_of_Referenced_Label[]), for
    ## use in later passes.

    while (match(s,"\\\\cite{[^{}]+}"))
    {
	## The match is \cite{...}: 6 characters precede the label
	## list and 1 follows it.
	labels = substr(s,RSTART+6,RLENGTH-7)
	s = substr(s,RSTART+RLENGTH)	# the rest is scanned on the next iteration

	labels = trim(tolower(labels))	# BibTeX ignores lettercase in labels

	## Spaces around the commas are unusual, but are allowed.
	n = split(labels,parts," *, *")
	for (k = 1; k <= n; ++k)
	{
	    record_label_reference(current_label,parts[k])
	    record_label_bibtex_filename(parts[k],bibtex_filename)
	}
    }
}


function record_crossref_use(current_label,s,bibtex_filename, k,label,n,parts)
{
    ## Scan the BibTeX entry text s, in which newlines have already
    ## been replaced by Ctl-A (\001), for every crossref = "label" or
    ## crossref = {label} field that occupies a line of its own and
    ## ends with a comma.  Each such label is recorded as a reference
    ## made by current_label, and as an entry that is needed in
    ## bibtex_filename (see BibTeX_Files_of_Referenced_Label[]), for
    ## use in later passes.

    while (match(s,"\001[ \t]*[Cc][Rr][Oo][Ss][Ss][Rr][Ee][Ff][ \t]*=[ \t]*[\"{][^\001\"}]*[\"}][ \t]*,[ \t]*\001"))
    {
	label = substr(s,RSTART,RLENGTH)

	## Keep the final Ctl-A of the match in the remainder: it is
	## also the start of the next line.
	s = substr(s,RSTART+RLENGTH-1)

	## Strip everything around the label itself.
	sub("\001[ \t]*[Cc][Rr][Oo][Ss][Ss][Rr][Ee][Ff][ \t]*=[ \t]*[\"{]","",label)
	sub("[\"}][ \t]*,[ \t]*\001","",label)
	label = trim(tolower(label)) # BibTeX ignores lettercase in labels

	record_label_reference(current_label,label)
	record_label_bibtex_filename(label,bibtex_filename)
    }
}


function record_dependents(label,entry,bibtex_filename, temp_entry)
{
    ## Record everything that this entry depends upon:
    ##	(1) labels cited with \cite{label1,label2,...},
    ##	(2) labels named in crossref="label" fields, and
    ##	(3) @String{} abbreviations.

    ## The scanning functions need patterns that span line
    ## boundaries, but awk and nawk cannot handle newlines in regular
    ## expressions.  The workaround is to scan a copy of the entry in
    ## which each newline has been replaced by Ctl-A (\001), a
    ## control character that bibsort already uses for the same
    ## purpose, and that is unlikely to occur in real BibTeX data.
    ## Profiling with gawk showed that this extra substitution costs
    ## only about 4% in run time, a price worth paying for
    ## portability across awk implementations.

    temp_entry = entry
    gsub("\n","\001",temp_entry)

    record_cite_use(label,temp_entry,bibtex_filename)
    record_crossref_use(label,temp_entry,bibtex_filename)
    record_string_use(temp_entry,bibtex_filename)
}


function record_label_bibtex_filename(label,bibtex_filename)
{
    ## Record the fact that the entry with this label must also be
    ## written to bibtex_filename, by appending the filename to the
    ## space-separated list in BibTeX_Files_of_Referenced_Label[label].
    ## Nothing is done when the entry has already been output to that
    ## file, or when the filename is already in the list.
    ##
    ## The list membership test compares whole, space-delimited
    ## filenames, just as record_label_reference() does for labels.
    ## A plain substring search is not sufficient: with input files
    ## ab.bib and b.bib, for example, the name "b1990.bib" is
    ## contained in "ab1990.bib", and the dependency on the former
    ## would be silently lost if the latter were already recorded.

    if ((label,bibtex_filename) in Label_BibTeX_File)
	return			# because already output
    else if (index((" " BibTeX_Files_of_Referenced_Label[label] " "), \
		   (" " bibtex_filename " ")) > 0)
	return			# because already recorded
    else			# record the dependency
	BibTeX_Files_of_Referenced_Label[label] = \
	  BibTeX_Files_of_Referenced_Label[label] " " bibtex_filename
}


function record_label_reference(label,referenced_label, t)
{
    ## Add referenced_label to the list of labels that the entry
    ## named label refers to, References_of_Label[label], unless it
    ## is already there.  Nothing is recorded for an empty label.
    ## Both labels are assumed, WITHOUT any check, to be lowercase
    ## already.
    ##
    ## Every label in the list has a space on EACH SIDE, so that a
    ## label can never be mistaken for part of a longer one.  The
    ## doubled spaces inside the list, and the outer ones, are
    ## harmless in later calls to split().

    t = (" " referenced_label " ")

    if (!(label in References_of_Label))
    {
	if (label != "")
	    References_of_Label[label] = t
    }
    else if (index(References_of_Label[label],t) == 0)
	References_of_Label[label] = References_of_Label[label] t
}


function record_string_use(s,bibtex_filename, k,n,parts,t)
{
    ## Scan the BibTeX entry text s, in which newlines have already
    ## been replaced by Ctl-A (\001), for key = value lines whose
    ## value starts with a letter, and so may use @String{}
    ## abbreviations.  Each abbreviation that has a definition in
    ## String_Entry[] is recorded in Strings_of_BibTeX_File[] for
    ## output_referenced_strings() to use in the second pass.  Any
    ## other name, apart from the standard month abbreviations jan
    ## ... dec, produces a warning.
    ##
    ## WARNING: This code is not foolproof: it will correctly handle
    ##		month = nov # " 11",
    ##		year = "1918",
    ##		year = {1918},
    ## but not
    ##		month = "11 " # nov,
    ##
    ## That last form is an obsolete aberration, caused by older
    ## BibTeX styles that lacked a day keyword; the modern form is
    ##
    ## 		day = "11",
    ## 		month = nov,
    ##
    ## Most BibTeX uses of strings are simple ones, like
    ##
    ## 		journal = j-CACM,
    ##
    ## so no greater effort is made.  A failed recognition may leave a
    ## referenced string out of a split file, but its definition is
    ## still saved somewhere, so it is never lost.

    while (match(s,"\001[ \t]*[A-Za-z][-_A-Za-z0-9]*[ \t]*=[ \t]*[A-Za-z][-:_A-Za-z0-9]*[^\001]*\001"))
    {
	t = substr(s,RSTART,RLENGTH-1)	# one line, without the final Ctl-A
	s = substr(s,RSTART+RLENGTH-1)	# remainder, starting at that Ctl-A

	## Reduce the line to a list of the bare names in its value.
	gsub("[{]\"[}]","",t)		# remove braced quotes
	gsub("\\\\\"","",t)		# remove umlaut accents
	gsub("\"[^\"]*\"","",t)		# remove quoted strings
	sub("^[^=]*=[ \t]*","",t)	# reduce "publisher = pub-ABC # pub-XYZ"
	gsub("[^-:_A-Za-z0-9]+"," ",t)	# to "pub-ABC pub-XYZ"
	t = trim(tolower(t))		# BibTeX ignores lettercase in @String abbreviations

	n = split(t,parts," ")
	for (k = 1; k <= n; ++k)
	{
	    if (parts[k] in String_Entry)
		Strings_of_BibTeX_File[bibtex_filename,parts[k]] = 1
	    else if (index(" jan feb mar apr may jun jul aug sep oct nov dec ", \
			   (" " parts[k] " ")) == 0)
		warning("undefined string abbreviation [" parts[k] "]")
	}
    }
}


function sample(s)
{
    ## Return a short, one-line excerpt of s for use in messages:
    ## newlines become spaces, and anything longer than 50 characters
    ## is cut down to its first 50, followed by an ellipsis.

    gsub("\n"," ",s)

    if (length(s) <= 50)
	return (s)

    return (substr(s,1,50) "...")
}


function sort_list(list, n,parts)
{
    ## Return the space-separated elements of list, rearranged into
    ## ascending order, as a new list with a single space between
    ## elements.

    n = split(list,parts," ")	# break the list into parts[1..n]
    quicksort(parts,1,n)	# order them in place
    return (table_to_list(parts,n))
}


function sort_table_lists(table, key)
{
    ## Every element of table holds a list of space-separated items:
    ## replace each of those lists by its sorted equivalent, as
    ## produced by sort_list().

    for (key in table)
    {
	table[key] = sort_list(table[key])
    }
}


function sparse_table_to_list(table,n, k,list)
{
    ## Join the elements of table that are present among the indices
    ## 1..n, in index order, into one space-separated list, and
    ## return it.  Missing indices are skipped, and are not created.

    list = ""
    k = 0
    while (++k <= n)
    {
	if (!(k in table))
	    continue		# a hole in the table
	list = list " " table[k]
    }

    ## Each element was added with a leading separator, so drop the
    ## first character.
    return (substr(list,2))
}


function squeeze(s, kbrace,kspace)
{
    ## Tidy the start of a BibTeX entry line, and return the result:
    ## horizontal space before and after the initial @ is removed,
    ## and so is the space between the entry name and the brace that
    ## follows it, so that " @ Book {" becomes "@Book{".  The rest of
    ## s is left as it is.

    sub(/^[ \t]*@[ \t]*/,"@",s)

    kbrace = index(s,"{")	# position of the first brace, or 0
    kspace = match(s,"[ \t]")	# position of the first blank or tab, or 0

    ## Only when space occurs ahead of the first brace is there
    ## something to remove.  NB: this must be sub(), NOT gsub().
    if (kspace < kbrace)
	sub(/[ \t]+{/,"{",s)

    return (s)
}


function squeeze_whitespace(s)
{
    ## Reduce each run of whitespace (blank, tab, newline, carriage
    ## return, formfeed) in s to a single blank, and return the
    ## result.
    ##
    ## A run may consist of just one character: a lone tab or newline
    ## is replaced by a blank too.  The pattern formerly required at
    ## least two whitespace characters, so that "a\tb" was returned
    ## unchanged, even though "a \tb" was reduced to "a b".

    gsub(/[ \t\n\r\f]+/," ",s)
    return (s)
}


function suffix_bynumber(s)
{
    ## Return the filename suffix for the ``-bynumber'' option: the
    ## value of Start_Entry_Count as a six-digit, zero-padded number.
    ## The argument s is not used.
    ##
    ## Once Part_Entry_Count has gone past BYNUMBER, a new part is
    ## begun: Part_Entry_Count restarts at 1, and Start_Entry_Count
    ## becomes the current Entry_Count.

    if (Part_Entry_Count > BYNUMBER)
    {
	Start_Entry_Count = Entry_Count
	Part_Entry_Count = 1
    }

    return (sprintf("%06d", Start_Entry_Count))
}


function suffix_byrange(s, letter)
{
    ## Return the filename suffix for the ``-byrange rangelist''
    ## option: the range recorded in Range_of_Letter[] for the first
    ## character of the citation label of the entry s, with lettercase
    ## ignored.
    ##
    ## "UNKNOWN" is returned, after a warning, when no citation label
    ## can be found at the start of s.  "UNKNOWN" is also returned,
    ## silently, when the label is empty, as in "@Book{,", or starts
    ## with a character that has no element in Range_of_Letter[];
    ## without that check, such entries would get an empty suffix,
    ## and a new, empty table element would be created as a side
    ## effect of the lookup.

    if (match(s,"^@[^{]*{[^,]*,"))
    {
	s = substr(s,RSTART,RLENGTH-1)	# "@Type{label", without the comma
	sub("^@[^{]*{","",s)		# now just the label
	letter = tolower(substr(s,1,1))
	return ((letter in Range_of_Letter) ? Range_of_Letter[letter] : "UNKNOWN")
    }
    else
    {
	warning("cannot parse citation label in entry [" sample(s) "]")
	return ("UNKNOWN")
    }
}


function suffix_byxxx(s)
{
    ## Decide which splitting method applies to the BibTeX entry s,
    ## and return the filename suffix computed by that method.  The
    ## year-based options are tested first, then ``-byrange'', and
    ## then ``-bynumber''.

    if (BYCENTURY || BYHALFCENTURY || BYSCORE || BYDECADE || BYPENTAD || BYYEAR)
	return (suffix_byyears(s))

    if (BYRANGE)
	return (suffix_byrange(s))

    ## With no selection at all, behave like "-bynumber 2000", which
    ## is about half the limit for TeX and BibTeX.
    if (!BYNUMBER)
	BYNUMBER = 2000

    return (suffix_bynumber(s))
}


function suffix_byyears(s)
{
    ## Return the filename suffix for the ``-bycentury''-like options:
    ## the four-digit year of the entry s, rounded down to a multiple
    ## of YEAR_INTERVAL, and zero-padded to four digits.  When no year
    ## can be found, a warning is issued and "UNKNOWN" is returned.
    ##
    ## The year is taken from a field that starts a line of the
    ## entry, and whose value begins with a four-digit number from
    ## 1000 to 2999.  BibTeX ignores lettercase in field names, and
    ## accepts a number without delimiters as a value, so all of
    ##
    ##		year = "1999",
    ##		Year = {1999},
    ##		YEAR = 1999,
    ##
    ## are recognized.
    ##
    ## The match always ends with the four digits and one more
    ## character, which is why the year starts 5 characters before
    ## the end of the match.

    if (match(s,"\n[ \t]*[Yy][Ee][Aa][Rr][ \t]*=[ \t]*[\"{]?[ \t]*[12][0-9][0-9][0-9][^0-9]"))
	return (sprintf("%04d", int((0 + substr(s,RSTART+RLENGTH-5,4)) / \
				    YEAR_INTERVAL) * YEAR_INTERVAL))
    else
    {
	warning("cannot parse year in entry [" sample(s) "]")
	return ("UNKNOWN")
    }
}


function table_to_list(table,n, k,list)
{
    ## Join table[1], table[2], ..., table[n] into one list, with a
    ## single space between neighboring elements, and return it.  The
    ## result is empty when n is less than 1.

    list = ""
    k = 0
    while (++k <= n)
	list = list " " table[k]

    ## Each element was added with a leading separator, so drop the
    ## first character.
    return (substr(list,2))
}


function table_to_sorted_inverted_table(table,inverted_table, n,name,parts)
{
    ## Copy the indices (NOT the values) of table into
    ## inverted_table[1..n], sort them into ascending order, and
    ## return their number, n.

    n = 0
    for (name in table)
    {
	n++
	inverted_table[n] = name
    }

    quicksort(inverted_table,1,n)

    return (n)
}


function table_to_sorted_table(table,sorted_table, n,name,parts)
{
    ## Copy the values (NOT the indices) of table into
    ## sorted_table[1..n], sort them into ascending order, and return
    ## their number, n.

    n = 0
    for (name in table)
    {
	n++
	sorted_table[n] = table[name]
    }

    quicksort(sorted_table,1,n)

    return (n)
}


function tailname(s)
{
    ## Return s without its directory path, that is, the part that
    ## follows the last directory separator.  Colon (MacOS), backslash
    ## (DOS/Windows), and slash (UNIX) all count as separators.  The
    ## result is empty when s ends with a separator.

    if (!match(s,"[^:\\\\/]*$"))
	return (s)

    return (substr(s,RSTART,RLENGTH))
}


function terminate()
{
    ## Finish the job once all input has been read.  The steps are:
    ##
    ##   (1) the second pass, which creates the output BibTeX files;
    ##   (2) the creation of the unused strings file;
    ##   (3) the repair of references that are still unsatisfied
    ##       because a later entry refers to an earlier one that had
    ##       already been output; and
    ##   (4) filter_output().
    ##
    ## Step (3) amounts, in the worst case, to a complete third pass
    ## through the output, but in practice only a small portion of the
    ## output files is reread.  Holding all entries in memory would
    ## be faster, but is often impossible on small systems, and even
    ## in ordinary cases would take much more memory than another
    ## pass through some of the output files does.

    output_all_bibfiles()		# step (1)
    output_unreferenced_strings()	# step (2)
    output_unsatisfied_references()	# step (3)
    filter_output()			# step (4)
}


function this_filename()
{
    ## Return the name of the current input file.  Output filenames
    ## may be derived from it, so the two special values that stand
    ## for standard input, the empty string and a hyphen, are both
    ## reported as "stdin".

    if (FILENAME == "")
	return ("stdin")

    if (FILENAME == "-")
	return ("stdin")

    return (FILENAME)
}


function trim(s)
{
    ## Return s with the blanks and tabs at both of its ends removed.
    ## Other whitespace characters, such as newlines, are kept.

    sub(/^[ \t]+/,"",s)		# leading horizontal space
    sub(/[ \t]+$/,"",s)		# trailing horizontal space

    return (s)
}


function trim_trailing(s)
{
    ## Return s with the blanks and tabs at its end removed.  Space
    ## at the start of s, and whitespace characters other than blank
    ## and tab, are kept.

    sub(/[ \t]+$/,"",s)

    return (s)
}


function warning(message)
{
    ## Write a warning on stderr in the form
    ##
    ##     filename:linenumber:%%message
    ##
    ## where the line number is that of the current input file (FNR).
    ## Messages of this shape can be parsed by emacs' compile-class
    ## commands, which makes it easy to jump to the offending line.

    print this_filename() ":" FNR ":%%" message >"/dev/stderr"
}
