### -*-awk-*-
### ====================================================================
###  @Awk-file{
###     author          = "Nelson H. F. Beebe",
###     version         = "0.04",
###     date            = "03 March 2001",
###     time            = "19:33:11 MST",
###     filename        = "biblabel.awk",
###     address         = "Center for Scientific Computing
###                        University of Utah
###                        Department of Mathematics, 322 INSCC
###                        155 S 1400 E RM 233
###                        Salt Lake City, UT 84112-0090
###                        USA",
###     telephone       = "+1 801 581 5254",
###     FAX             = "+1 801 585 1640, +1 801 581 4148",
###     URL             = "http://www.math.utah.edu/~beebe",
###     checksum        = "34415 1065 4180 33749",
###     email           = "beebe@math.utah.edu, beebe@acm.org,
###                        beebe@computer.org, beebe@ieee.org
###                        (Internet)",
###     codetable       = "ISO/ASCII",
###     keywords        = "bibliography, BibTeX, citation label, LaTeX,
###                        TeX",
###     supported       = "yes",
###     docstring       = "This program filters a BibTeX bibliography,
###                        or bibliography fragment, on its standard
###                        input, or one or more bibliographies named on
###                        the command line, and prints on standard
###                        output lines containing pairs of old and new
###                        citation labels, suitable for input to the
###                        companion program, citesub(1).
###
###                        The citation label is formed by these rules,
###                        easily applicable by a human, or by a
###                        computer program like this one:
###
###                        (1) Take the first author's last name,
###                            dropping apostrophes, Jr/Sr/generation
###                            numbers, and eliminating accents
###                            (e.g. J{\"a}nsch -> Jansch, and
###                            Jind\v{r}ich -> Jindrich), using
###                            multi-letter transliterations if that is
###                            conventional.  Preserve hyphenated names,
###                            like Baeza-Yates, in full.
###
###                        (2) Append a colon.
###
###                        (3) Append the four-digit year of publication.
###
###                        (4) Append another colon.
###
###                        (5) Pick the initial letters of at most three
###                            of the leading important words in the
###                            title that begin with a letter, excluding
###                            articles, prepositions, and TeX math
###                            mode, and append those letters.
###
###                            For example, given the title ``On ${C}^1$
###                            interpolating hierarchical spline
###                            bases'', this recipe produces IHS.
###
###                        (6) If the resulting label is already in use,
###                            add a letter a, b, c, ... to make it
###                            unique.  In those rare cases when there
###                            are more than 26 such collisions, add
###                            additional letters, producing suffixes
###                            written in a base-26 number system in
###                            ascending order: a .. z, aa .. az , ba
###                            .. bz , ..., za .. zz , aaa .. aaz , ...,
###                            zza .. zzz , ..., aaaa .. aaaz , ...,
###                            zzza .. zzzz , ....
###
###                        This will produce a label like
###                        Smith:1994:ABC.
###
###                        The reason for including a four-digit year is
###                        that the worldwide Y2K problem at the
###                        millennium change amply demonstrated the
###                        foolishness of two-digit year abbreviations.
###                        Also, some bibliographies may be historical,
###                        with entries dating back hundreds of years.
###                        Using a four-digit year will keep sorts of
###                        otherwise identical keys in chronological
###                        order, and putting the year before the key
###                        derived from the title will facilitate
###                        sorting by year by, e.g., bibsort (1).
###
###                        Because any change in citation labels must be
###                        accompanied by a change in citations in all
###                        documents that use the bibliography, it is
###                        not sufficient to just produce a new
###                        bibliography file with changed labels.
###
###                        Instead, the output is a file containing old
###                        and new labels, one per line, suitable for
###                        input to the companion citesub(1) program
###                        (either awk or C versions).
###
###                        To avoid confusion between labels with common
###                        prefixes, such as Smith80 and Smith80a,
###                        citesub(1) will check for leading context of a
###                        left brace, quote, comma, whitespace, or
###                        beginning of line and trailing context of a
###                        right brace, comma, quote, percent,
###                        whitespace, or end of line so as to match
###                        these styles:
###
###                             @Book{Smith:1980:ABC,
###                             crossref = <quote>Smith:1980:ABC<quote>,
###                             crossref = {Smith:1980:ABC},
###                             \cite{Smith:1980:ABC}
###                             \cite{Smith:1980:ABC,Jones:1994:DEF}
###                             \cite{%
###                                     Smith:1980:ABC,%
###                                     Jones:1994:DEF%
###                             }
###
###                        The Ignore[] list initialized in
###                        make_ignore_list() contains words which are
###                        to be ignored in preparation of the
###                        alphabetic string from the title words.  It
###                        will be augmented at run-time from two files:
###                        biblabel.ign (intended to be a generic
###                        enhancement), and a file assigned to IGNOREFILES
###                        on the command line.
###
###                        Created labels are guaranteed to be unique
###                        within the input files provided on the
###                        command line.  However, in a larger project,
###                        one may wish to exclude labels that are
###                        already in use in other bibliographies.  To
###                        provide for this, the USEDFILES variable can be
###                        set on the command line to define the name of
###                        a file of labels that are already in use.  It
###                        will be augmented by labels from a generic
###                        file biblabel.use.  Lines in this file
###                        consist of whitespace-separated pairs of
###                        filenames and citation labels.
###
###                        When a label is found in use, and the current
###                        file matches the in-use label filename, the
###                        label is considered to be unused; otherwise,
###                        repeated runs through this program would keep
###                        changing already-assigned labels.
###
###                        Usage:
###                             awk -f biblabel.awk \
###                                    [ -v CORPORATEFILES="$CORPORATEFILES" ] \
###                                    [ -v DUMPCORPORATEFILES="output-file(s)" ] \
###                                    [ -v DUMPIGNOREFILES="output-file(s)" ] \
###                                    [ -v DUMPLABELFILES="output-file(s)" ] \
###                                    [ -v INPUTFILES="input-file(s)" ] \
###                                    [ -v IGNOREFILES="input-ignore-file(s)" ] \
###                                    [ -v LONGCORPORATENAMES=nnn ] \
###                                    [ -v USEDFILES="input-used-labels-file(s)" ] \
###                                    [ -v VERSIONNUMBER="xxx" ] \
###                                    [ -v VERSIONDATE="xxx" ] \
###                                    mybib.bib >mybib.sub
###
###                        Examine the output substitutions to decide
###                        whether the generated labels are acceptable,
###                        and make any changes by hand.
###
###                        Tentatively apply the label substitutions and
###                        compare the input and output files:
###
###                            citesub -f mybib.sub mybib.bib >mybib.bib.tmp
###                            diff mybib.bib mybib.bib.tmp
###
###                            citesub -f mybib.sub myfile.ltx >myfile.ltx.tmp
###                            diff myfile.ltx myfile.ltx.tmp
###
###                        If the citesub(1) substitutions are acceptable,
###                        then:
###
###                            mv mybib.bib.tmp mybib.bib
###                            mv myfile.ltx.tmp myfile.ltx
###
###                        The checksum field above contains a CRC-16
###                        checksum as the first value, followed by the
###                        equivalent of the standard UNIX wc (word
###                        count) utility output of lines, words, and
###                        characters.  This is produced by Robert
###                        Solovay's checksum utility.",
###  }
### ====================================================================

## Main program: pattern/action rules applied to every input line.
## Each rule hands the current line to one handler function and then
## uses "next" so that no later rule sees the same line.
##
## NOTE(review): the field patterns only match a field name that starts
## its line (optionally after blanks, not tabs) and whose value opens
## with a double quote -- presumably the layout produced by bibclean;
## confirm against biblabel.sh.

					# One-time setup before any input is read.
BEGIN 					{ begin_bibliography() }

					# `@EntryType{label,' opens a new entry.
/^@[A-Za-z][A-Za-z0-9_-]*/		{ begin_entry(); next }

					# The author always supplies the name part.
/^ *author *= *"/			{ author_string(); next }

					# editor_string() leaves an author-derived
					# name untouched.
/^ *editor *= *"/			{ editor_string(); next }

/^ *title *= *"/			{ title_string(); next }

					# Use the booktitle only when the title
					# has not been seen yet, and if a title
					# is subsequently seen, we will override
					# Title_Abbrev from booktitle by a new
					# Title_Abbrev from title.
/^ *booktitle *= *"/			{ if (Title_Abbrev == "") title_string(); next }

					# A crossref in standard label form can
					# supply the year (see crossref_string()).
/^ *crossref *= *"/			{ crossref_string(); next }

/^ *year *= *"/				{ year_string(); next }

					# A closing brace in column one ends the
					# entry; its new label is generated here.
/^}/ 					{ end_entry(); next }

					# Every other line is ignored.
					{ next }

					# Print the old/new label pairs and any
					# requested label dump.
END					{ end_bibliography() }

#=======================================================================

function author_string( names)
{
    ## Set Lastname from the author field on the current input line.
    ## The assignment is unconditional, so an author always replaces a
    ## name obtained earlier from an editor field.
    names = value($0)
    Lastname = do_names(names)
}


function begin_bibliography( n,parts)
{
    ## One-time initialization, run from the BEGIN rule: set up the
    ## global constants and load the ignore, corporate-ignore, and
    ## used-label tables.  n and parts are local scratch variables.

    ## Filename is the name reported for standard input by
    ## get_filename(): the first word of INPUTFILES, or "-" when
    ## INPUTFILES is empty.
    n = split(INPUTFILES,parts," ")
    Filename = (n == 0) ? "-" : parts[1]

    ## Capture the output of date(1) once, for use in dump file headers.
    "date" | getline Current_Date_and_Time
    close("date")

    ## A word or line consisting of RESET in an ignore or used-labels
    ## file empties the table collected so far.
    RESET = "@RESET@"
    VALID_YEAR_PATTERN = "(1[0-9][0-9Xx][0-9Xx]|20[0-9Xx][0-9Xx])" # allow only 1000..2099 and 10xx..20xx

    ## gawk and mawk support 8-bit regexp patterns, but nawk does not,
    ## sigh...  Instead, nawk strips the high-order bit, making all
    ## characters match this regexp.  We use this run-time behavior
    ## difference to control diagnostic output flagging unexpected
    ## characters in 8#200..8#377.

    Eight_Bit_Regexps = ("A" !~ "[\200-\377]")
    initialize_maps()

    ## Build the built-in word lists first, then extend them from the
    ## generic biblabel.* files followed by any files named on the
    ## command line.
    make_corporate_ignore_list( )
    make_ignore_list()
    do_extend_corporate_list(("biblabel.cig " CORPORATEFILES))
    do_extend_ignore_list(("biblabel.ign " IGNOREFILES))
    do_extend_used_list(("biblabel.use " USEDFILES))

    ## Each of these calls exit(0) after writing at least one dump file.
    ## NOTE(review): because the ignore-list dump exits first, a run
    ## that sets both DUMPIGNOREFILES and DUMPCORPORATEFILES never
    ## reaches the corporate dump -- confirm whether that is intended.
    do_dump_ignore_list(DUMPIGNOREFILES)
    do_dump_corporate_list(DUMPCORPORATEFILES)
}


function begin_entry()
{
    ## Start a new entry from the `@EntryType{label,' line in $0:
    ## remember its type and its existing citation label ...
    Type = entry_type($0)
    Old_Label = entry_label($0)

    ## ... and discard everything collected for the previous entry.
    New_Label = ""
    Lastname = ""
    Title_Abbrev = ""
    Pages[1] = ""

    ## Default year, so that a citation label can always be made.
    Year = "20xx"
}


function crossref_string( crossref)
{
    ## Take the year from a cross-reference value, but only when that
    ## value has the standard label form `name:year:abbrev' with a year
    ## accepted by VALID_YEAR_PATTERN.  Any other value is ignored.
    crossref = value($0)

    if (crossref !~ ("^[^:]*:" VALID_YEAR_PATTERN ":[^:]*$"))
	return

    ## The match spans `:year:', so drop one colon from each end.
    if (match(crossref,(":" VALID_YEAR_PATTERN ":")))
	set_year(substr(crossref,RSTART+1,RLENGTH-2))
}


function delete_substring(s,rstart,rlength, head,tail)
{
    ## Return s with the rlength characters starting at (1-based)
    ## position rstart removed.  The arguments are typically the RSTART
    ## and RLENGTH values left behind by match().
    head = substr(s,1,rstart-1)
    tail = substr(s,rstart+rlength)
    return (head tail)
}


function do_braced_personal_name(fullname,shortname, k,n,parts,last,junior,roman)
{
    ## Reduce a brace-protected personal name to its label form.  The
    ## outer braces are removed, a trailing Jr/Sr or Roman-numeral
    ## generation suffix is dropped, and the remaining words are joined
    ## without separators and handed to reduce_personal_name().
    ## fullname is only passed on to do_show_name().
    ##
    ## Typical values of shortname (however, the parenthesized strings
    ## will already have been removed):
    ##
    ##    "Chr. Br{\\\"a}uchle"
    ##    "Cleve B. {Moler (consulting editor)}"
    ##    "D. C. {Van Hart}"
    ##    "E. Fran{\c{c}}ois"
    ##    "Giuseppe {Radicati di Brozolo}"
    ##    "Hal K. {St. Clair}"
    ##    "Jose E. {Amaral de Sa}"
    ##    "Karel P\'{\i}{\v{s}}ka"
    ##    "L. B{\\\"o}r{\\\"o}czky"
    ##    "Larry L. Schumaker (Eds.)"
    ##    "Lech Papie{\.z}"
    ##    "M. N. {el Agizy}"
    ##    "P. Jan M. {van Bentum}"
    ##    "Patrick {Wood, editor}"
    ##    "Peter {van Emde Boas}"
    ##    "R. A. M. {Hartmann (or Hartman?)}"
    ##    "R. J. {von Gutfeld}"
    ##    "Ronald M. {Smith, Sr.}"
    ##    "S. (Salvatore) R. {La Paglia}"
    ##    "{Jones IV}"
    ##    "{Major Henry} M. Robert"
    ##    "{Smith, Jr.}"
    ##    "{\AA}ke Bj{\"o}rck"
    ##    "{de la Vall{\\'e}e-Poussin}"

    ## print "DEBUG: do_braced_personal_name(" shortname ")"
    shortname = unbrace_outer(shortname)
    n = split(shortname,parts," ")

    ## Classify the final word once; the two suffix kinds are exclusive.
    last = parts[n]
    junior = (last ~ "^[JS][r][.]?$")
    roman = ((!junior) && (last ~ "^[IVXLCDM]+$"))

    if (junior || roman)
    {
	if (n > 1)
	{
	    ## The comma conventions are opposite: "Smith, Jr." but "Jones IV".
	    if (junior && (parts[n-1] !~ ",$"))
		warning("NOTE: Junior-like suffixes are normally preceded by a comma: [" shortname "]")
	    else if (roman && (parts[n-1] ~ ",$"))
		warning("NOTE: generational suffixes are normally not preceded by a comma: [" shortname "]")
	}
	n--				# drop the suffix word
    }

    return (do_show_name(fullname,reduce_personal_name(unsplit(parts,1,n,""))))
}


function do_corporate_name(fullname,shortname, k,n,parts,t)
{
    ## Reduce a brace-protected corporate name to its label form.
    ## Words found in Ignore[] or Corporate_Ignore[] are dropped.  The
    ## surviving words are then either reduced to their initial letters
    ## (the default, when more than one word survives) or kept whole and
    ## joined with hyphens (when LONGCORPORATENAMES is set, or when at
    ## most one word survives).  fullname is only passed on to
    ## do_show_name().
    ##
    ## Typical values of shortname (however, the parenthesized strings
    ## will already have been removed):
    ##
    ##    "AT{\&T}"
    ##    "{ACM\slash SIGGRAPH}"
    ##    "{ANSI Subcommittee X3J3}"
    ##    "{Abraxas Software, Inc.}"
    ##    "{Adobe Systems Incorporated}"
    ##    "{American Psychological Association}"
    ##    "{Apple Computer, Inc.}"
    ##    "{EARN Staff}"
    ##    "{Electronic Frontier Foundation}"
    ##    "{IBM Corporation}"
    ##    "{IEEE Task P754}"
    ##    "{IEEE}"
    ##    "{Mortice Kern Systems, Inc.}"
    ##    "{PostScript Developer Support Group}"
    ##    "{Seventy Six Assistant Editors}"
    ##    "{The Be Development Team}"
    ##    "{The German Section of the Oxford University Press Dictionary Department}"
    ##    "{The MathWorks, Inc.}"
    ##    "{The Samba Team}"
    ##    "{The staff of O'Reilly \& Associates, Inc.}"
    ##    "{X/Open Company, Ltd.}"

    ## print "DEBUG: do_corporate_name([" fullname "],[" shortname "])"
    shortname = unbrace_outer(shortname)
    gsub("/"," ",shortname)		# reduce "X/Open" to "X Open"
    parts[1] = ""			# create array to keep "gawk --lint" happy
    n = split_at_brace_level_zero(shortname,parts," +")

    ## Discard noise words, comparing in lower case and without one
    ## trailing punctuation character.
    for (k = 1; k <= n; ++k)
    {
	t = tolower(parts[k])
	gsub(" *[,.:;]$","",t)
	if ((t in Ignore) || (t in Corporate_Ignore))
	    delete parts[k]
    }

    ## Rejoin and resplit on SUBSEP to renumber the surviving words.
    shortname = unsplit(parts,1,n,SUBSEP)
    n = split(shortname,parts,SUBSEP)

    ## Long form; also used for a lone word, so that "IBM" is not
    ## reduced to "I".
    if (LONGCORPORATENAMES || (n <= 1))
    {
	gsub(SUBSEP,"-",shortname)	# convert separators to hyphens
	return (do_show_name(fullname,reduce_corporate_name(shortname)))
    }

    ## Short form: reduce words to their (accent-free) initial letters.
    for (k = 1; k <= n; ++k)
	parts[k] = substr(fix_accents(parts[k]),1,1)
    return (do_show_name(fullname,reduce_corporate_name(unsplit(parts,1,n,""))))
}


function do_dump_corporate_list(filelist, filenames,k,n,dumped)
{
    ## Dump the corporate ignore list to every file named in the
    ## blank-separated filelist, and then terminate the program if at
    ## least one dump file was written.
    ##
    ## A file name that is exactly "/dev/null" is skipped.  The name is
    ## compared word by word after splitting: deleting it from the
    ## unsplit string with gsub() also removed the surrounding blanks
    ## (fusing "a /dev/null b" into "ab") and mangled longer paths that
    ## merely contain "/dev/null".
    n = split(filelist, filenames, " ")
    dumped = 0
    for (k = 1; k <= n; ++k)
    {
	if (filenames[k] == "/dev/null")
	    continue
	dump_corporate_list(filenames[k])
	dumped++
    }
    if (dumped > 0)			# terminate if we did anything
	exit(0)
}


function do_dump_ignore_list(filelist, filenames,k,n,dumped)
{
    ## Dump the title-word ignore list to every file named in the
    ## blank-separated filelist, and then terminate the program if at
    ## least one dump file was written.
    ##
    ## A file name that is exactly "/dev/null" is skipped.  The name is
    ## compared word by word after splitting: deleting it from the
    ## unsplit string with gsub() also removed the surrounding blanks
    ## (fusing "a /dev/null b" into "ab") and mangled longer paths that
    ## merely contain "/dev/null".
    n = split(filelist, filenames, " ")
    dumped = 0
    for (k = 1; k <= n; ++k)
    {
	if (filenames[k] == "/dev/null")
	    continue
	dump_ignore_list(filenames[k])
	dumped++
    }
    if (dumped > 0)			# terminate if we did anything
	exit(0)
}


function do_dump_labels(filelist, filenames,k,n)
{
    ## Dump the table of labels in use to every file named in the
    ## blank-separated filelist.
    ##
    ## A file name that is exactly "/dev/null" is skipped.  The name is
    ## compared word by word after splitting: deleting it from the
    ## unsplit string with gsub() also removed the surrounding blanks
    ## (fusing "a /dev/null b" into "ab") and mangled longer paths that
    ## merely contain "/dev/null".
    n = split(filelist, filenames, " ")
    for (k = 1; k <= n; ++k)
    {
	if (filenames[k] != "/dev/null")
	    dump_labels(filenames[k])
    }
}


function do_extend_corporate_list(filelist, filenames,k,n)
{
    ## Extend the corporate ignore list from every file named in the
    ## blank-separated filelist.
    ##
    ## A file name that is exactly "/dev/null" (an empty input file) is
    ## skipped.  The name is compared word by word after splitting:
    ## deleting it from the unsplit string with gsub() also removed the
    ## surrounding blanks (fusing "a /dev/null b" into "ab") and mangled
    ## longer paths that merely contain "/dev/null".
    n = split(filelist, filenames, " ")
    for (k = 1; k <= n; ++k)
    {
	if (filenames[k] != "/dev/null")
	    extend_corporate_list(filenames[k])
    }
}


function do_extend_ignore_list(filelist, filenames,k,n)
{
    ## Extend the title-word ignore list from every file named in the
    ## blank-separated filelist.
    ##
    ## A file name that is exactly "/dev/null" (an empty input file) is
    ## skipped.  The name is compared word by word after splitting:
    ## deleting it from the unsplit string with gsub() also removed the
    ## surrounding blanks (fusing "a /dev/null b" into "ab") and mangled
    ## longer paths that merely contain "/dev/null".
    n = split(filelist, filenames, " ")
    for (k = 1; k <= n; ++k)
    {
	if (filenames[k] != "/dev/null")
	    extend_ignore_list(filenames[k])
    }
}


function do_extend_used_list(filelist, filenames,k,n)
{
    ## Extend the table of labels already in use from every file named
    ## in the blank-separated filelist.
    ##
    ## A file name that is exactly "/dev/null" (an empty input file) is
    ## skipped.  The name is compared word by word after splitting:
    ## deleting it from the unsplit string with gsub() also removed the
    ## surrounding blanks (fusing "a /dev/null b" into "ab") and mangled
    ## longer paths that merely contain "/dev/null".
    n = split(filelist, filenames, " ")
    for (k = 1; k <= n; ++k)
    {
	if (filenames[k] != "/dev/null")
	    extend_used_list(filenames[k])
    }
}


function do_names(s, k,n,parts)
{
    ## Return the label form of the FIRST name in the author/editor
    ## value s.  The value is split into individual names at " and "
    ## separators that lie outside braces; the names after the first are
    ## split out but play no part in the label.

    ## print "DEBUG: do_names(" s ")"
    parts[1] = ""			# create array to keep "gawk --lint" happy
    split_at_brace_level_zero(s,parts," +and +")
    return (do_one_name(parts[1]))
}


function do_one_name(fullname, k,n,parts,s,t)
{
    ## Return the label form of a single author/editor name.
    ## Parenthesized remarks and TeX macros are reduced first, then the
    ## name is split into words at blanks outside braces and classified
    ## by its last word: wholly braced single word (corporate name),
    ## braced last word (braced personal name), or plain surname.

    ## print "DEBUG: do_one_name(" fullname ")"
    parts[1] = ""			# create array to keep "gawk --lint" happy
    s = reduce_TeX_macros(strip_parenthesized_strings(fullname))
    n = split_at_brace_level_zero(s,parts," +")

    ## These cases should not happen, but often do: suffixes outside braces
    t = parts[n]
    if (t ~ "^[JS][r][.]?$")		# then Jr-like
    {
	if ((n > 1) && (parts[n-1] !~ ",$"))
	    warning("NOTE: Junior-like suffixes are normally preceded by a comma: [" s "]")
	n--
    }
    else if (t ~ "^[IVXLCDM]+$")	# then roman-numeral-like
    {
	if ((n > 1) && (parts[n-1] ~ ",$"))
	    warning("NOTE: generational suffixes are normally not preceded by a comma: [" s "]")
	n--
    }

    if (n < 1)
    {
	warning("WARNING: unexpected standalone Junior-like name suffix [" s "]")
	n = 1
    }

    ## TO DO: normalize name from "Last, First" to "First Last" here.
    ## Since biblabel.sh always invokes bibclean first, this
    ## normalization has already been done.  This program is not
    ## expected to be invoked standalone, except for testing purposes.

    ## print "DEBUG: do_one_name(" fullname "): parts[" n "] = [" parts[n] "]"

    t = parts[n]
    if (t !~ "^{.*}$")			# "Smith" or "de la Vall{\'e}e-Poussin"
    {
	## Prepend preceding lower-case particles ("de", "la", "van", ...)
	## to the surname, nearest first, stopping at the first word
	## that does not begin with a lower-case letter.
	for (k = n - 1; (k > 0) && (parts[k] ~ "^[a-z]"); --k)
	    t = (parts[k] t)
	return (do_show_name(fullname,reduce_personal_name(t)))
    }

    if (n == 1)				# "{IBM Corporation}"
	return (do_corporate_name(fullname,t))

    return (do_braced_personal_name(fullname,t)) # "{Jones IV}" or "{Smith, Jr.}"
}


function do_show_name(fullname,abbrevname)
{
    ## Debugging hook through which every reduced name passes: it
    ## returns abbrevname unchanged.  Enable the printf below to list
    ## each full name beside its reduction.

    ## printf("%-44s-->\t%s\n", fullname, abbrevname)
    return abbrevname
}


function dump_corporate_list(filename)
{
    ## Write the words in Corporate_Ignore[] to filename, below a dump
    ## header titled "corporate ignore list".
    dump_list(filename, Corporate_Ignore, "corporate ignore list")
}


function dump_header(filename,title, header)
{
    ## Print a dump file header which, for want of anything better,
    ## follows the syntax of Adobe Document Structuring Conventions in
    ## PostScript files, defined in Appendix G of the PostScript
    ## Language Reference Manual, Second edition, Addison-Wesley, 1990,
    ## ISBN 0-201-18127-4 (and sadly, omitted from the 1999 third
    ## edition!).
    ##
    ## The header lines are collected into one string, separated by the
    ## output record separator, and written with a single print; the
    ## final empty piece produces the blank line that ends the header.

    header = ("%% Title: Dump of " title)
    header = (header ORS "%% CreationDate: " Current_Date_and_Time)
    header = (header ORS "%% Creator: biblabel version " VERSIONNUMBER " [" VERSIONDATE "]")
    header = (header ORS "%% For: " get_personal_name() " <" get_email_address() ">")
    header = (header ORS "%% Directory: " ENVIRON["PWD"])
    header = (header ORS "")

    print header > filename
}


function dump_ignore_list(filename)
{
    ## Write the words in Ignore[] to filename, below a dump header
    ## titled "ignore list".
    dump_list(filename, Ignore, "ignore list")
}


function dump_labels(filename, sort_pipe,word)
{
    ## Write "file <TAB> label" lines for the labels in use to filename.
    ## The header goes into the file first and the file is closed; the
    ## data lines are then appended by a sort(1) pipeline, ordered by
    ## label and then by file, so that the header stays on top.
    dump_header(filename,"labels in use (sorted by label)")
    close (filename)

    sort_pipe = ("sort -f -k 2 -k 1 >> " filename)
    for (word in In_Use_Label)
    {
	if (word == "")			# skip the empty key
	    continue
	printf("%-23s\t%s\n", In_Use_File[word], In_Use_Label_Name[word]) | sort_pipe
    }
    close (sort_pipe)
}


function dump_list(filename,table,title, sort_pipe,word)
{
    ## Write the keys of table[] to filename, one per line.  The header
    ## (titled with title) goes into the file first and the file is
    ## closed; the keys are then appended by a case-folding sort(1)
    ## pipeline, so that the header stays on top.
    dump_header(filename,title)
    close (filename)

    sort_pipe = ("sort -f >> " filename)
    for (word in table)
    {
	print word | sort_pipe
    }
    close (sort_pipe)
}


function editor_string()
{
    ## Take the name part of the label from the editor field on the
    ## current line, but only while no name has been set: a name already
    ## obtained from the authors is left alone.
    if (!Lastname)
	Lastname = do_names(value($0))
}


function end_bibliography()
{
    ## Print one substitution line for every old label that received a
    ## new one, then write any requested dump of the labels in use.
    ##
    ## NB: ALL pairs are printed, even when the old and new labels are
    ## identical, because the new labels are later extracted from the
    ## *.sub files to prepare the biblabel.use file.
    for (Old_Label in Old_to_New)
    {
	if (!Old_Label)
	    continue
	if (!Old_to_New[Old_Label])
	    continue
	print_substitution_line(Old_Label,Old_to_New[Old_Label])
    }
    do_dump_labels(DUMPLABELFILES)
}


function end_entry()
{
    ## Finish the current entry: generate its new label with the
    ## label_*() function for its entry type, make that label unique,
    ## and record the old and new labels.  An entry type without a
    ## label function gets no new label.

    if (Type == "Article")
	New_Label = label_Article()
    else if (Type == "Book")
	New_Label = label_Book()
    else if (Type == "Booklet")
	New_Label = label_Booklet()
    else if (Type == "DEAthesis")
	New_Label = label_DEAthesis()
    else if (Type == "InBook")
	New_Label = label_InBook()
    else if (Type == "InCollection")
	New_Label = label_InCollection()
    else if (Type == "InProceedings")
	New_Label = label_InProceedings()
    else if (Type == "Manual")
	New_Label = label_Manual()
    else if (Type == "MastersThesis")
	New_Label = label_MastersThesis()
    else if (Type == "Misc")
	New_Label = label_Misc()
    else if (Type == "Periodical")
	New_Label = label_Periodical()
    else if (Type == "PhdThesis")
	New_Label = label_PhdThesis()
    else if (Type == "Proceedings")
	New_Label = label_Proceedings()
    else if (Type == "TechReport")
	New_Label = label_TechReport()
    else if (Type == "Unpublished")
	New_Label = label_Unpublished()
    else
	New_Label = ""

    ## print_debug("end_entry(): Old_Label = <" Old_Label "> Type = <" Type "> New_Label = <" New_Label ">")
    if (is_label_in_use(Old_Label))	# duplicate label in input bibliography
	warning("ERROR: duplicate citation label [" Old_Label "]")
    else if (!New_Label)		# remember old labels to check for dups
	record_label_use(get_filename(),Old_Label,"")
    else				# then generated a valid new label
    {
	if ((New_Label in New_to_Old) || is_label_in_use(New_Label))
	    New_Label = make_unique_new_label(New_Label) # duplicate label
	New_to_Old[New_Label] = Old_Label
	record_label_use(get_filename(),Old_Label,New_Label)
    }

    ## Forget the current type and label, so as to catch unbalanced braces.
    Type = ""
    Old_Label = ""
}


function entry_label(s, k1,k2)
{
    ## Return the citation label from a `@EntryType{label,' line: the
    ## text strictly between the first open brace and the first comma.
    ## The result is empty when the line has no comma.
    k1 = index(s,"{")
    k2 = index(s,",")
    return (substr(s,k1 + 1,k2 - k1 - 1))
}


function entry_type(s, brace)
{
    ## Return the entry type from a `@EntryType{label,' line: the text
    ## between the leading `@' and the first open brace.
    brace = index(s,"{")
    return (substr(s,2,brace - 2))
}


function extend_corporate_list(filename)
{
    ## Add the words found in filename to Corporate_Ignore[].
    extend_word_list(filename, Corporate_Ignore)
}


function extend_ignore_list(filename)
{
    ## Add the words found in filename to Ignore[].  All of the work is
    ## done by extend_word_list(), so this wrapper needs no local
    ## variables of its own (the unused ones have been removed, matching
    ## extend_corporate_list()).
    extend_word_list(filename,Ignore)
}


function extend_used_list(filename, fnr,line,n,old_file,old_label,parts)
{
    ## Read filename, a list of labels already in use, and record each
    ## one with record_label_use().  Nothing happens if the file does
    ## not exist.
    ##
    ## After comment stripping and trimming, each line is one of:
    ##   - empty, or starting with `%':  ignored
    ##   - the RESET marker:             empties In_Use_Label[] and
    ##                                   In_Use_File[]
    ##   - "filename citation-label":    recorded as in use
    ## Anything else is reported with its file name and line number.

    if (!file_exists(filename))
	return

    fnr = 0

    ## getline returns -1 on a read error, and -1 counts as true, so the
    ## result must be compared with 0 explicitly: a bare
    ## "while (getline ...)" would loop forever on an unreadable file.
    while ((getline line < filename) > 0)
    {
	fnr++
	line = trim(strip_comments(line))

	if ((line == "") || (line ~ "^%"))
	    continue			# ignore empty and comment lines

	if (line == RESET)
	{
	    split("",In_Use_Label)	# this empties In_Use_Label
	    split("",In_Use_File)	# this empties In_Use_File
	    continue
	}

	n = split(line,parts," ")	# expect "filename citation-label" pairs
	if (n == 2)
	{
	    old_file = parts[1]
	    old_label = parts[2]
	    record_label_use(old_file,old_label,"")
	    ## print_debug("extend_used_list(): excluding old_label = <" old_label "> from file = [" old_file "]")
	}
	else
	    warning(("WARNING: unrecognized text [" line "]"),filename,fnr)
    }
    close(filename)
}


function extend_word_list(filename,wordlist, fnr,k,line,n,words)
{
    ## Add the words found in filename to the table wordlist[].
    ## Nothing happens if the file does not exist.
    ##
    ## Each line is stripped of comments, trimmed, and split at
    ## whitespace.  A word equal to the RESET marker empties the table
    ## collected so far; every other word is reduced to its letters,
    ## converted to lower case, and stored as a key of wordlist[].
    ## fnr is kept in the parameter list only for compatibility.

    if (!file_exists(filename))
	return

    ## getline returns -1 on a read error, and -1 counts as true, so the
    ## result must be compared with 0 explicitly: a bare
    ## "while (getline ...)" would loop forever on an unreadable file.
    while ((getline line < filename) > 0)
    {
	line = trim(strip_comments(line))
	n = split(line,words," ")
	for (k = 1; k <= n; ++k)
	{
	    if (words[k] == RESET)
		split("",wordlist)	# this empties wordlist[]
	    else
	    {
		gsub(/[^a-zA-Z]/,"",words[k]) # discard all but letters
		words[k] = tolower(words[k])	# and convert to lower case
		wordlist[words[k]] = 1
		## print "DEBUG: ignoring word <" words[k] ">"
	    }
	}
    }
    close(filename)
}


function file_exists(filename, quoted)
{
    ## Return 1 if filename names something readable that is not a
    ## directory, and 0 otherwise.
    ##
    ## getline returns -1 (which counts as true in a loop test) when a
    ## file cannot be opened, so callers check here before reading.  awk
    ## has no built-in file test, so a shell command is needed, sigh...
    ##
    ## The name is passed to the shell inside single quotes, with any
    ## embedded single quote written as '\'', so that blanks and shell
    ## metacharacters in it are never interpreted: without the quoting,
    ## a name such as "x; true" was reported as an existing file.
    ## test(1) is used in place of cat(1) so that the file is not read
    ## from end to end merely to find out whether it is there.

    quoted = filename
    gsub(/'/,"'\\''",quoted)
    quoted = ("'" quoted "'")

    return (system("test -r " quoted " && test ! -d " quoted) == 0)
}


function fix_accents(lastname, s)
{
    ## Return lastname reduced to letters, digits, and hyphens only:
    ## TeX accent control sequences are removed, special letters are
    ## transliterated (\ss -> ss, \AA -> Aa, ...), and braces,
    ## apostrophes, and grave quotes are dropped.  A warning is issued
    ## when unexpected characters remain before the final cleanup;
    ## such messages often indicate errors in the input bibliography.

    s = lastname

    ## Remove accents from last name, because we need a control-sequence
    ## free label.  The order of some of these is important, so do NOT
    ## rearrange them!
    ## Eventually, an algorithmic reduction should be applied here instead
    ## of all of these substitutions.

    ## Eliminate acute, grave, circumflex, macron, tilde accents.
    gsub(/\\[.'`^=~]/,	"",	s)

    gsub(/\\[uvHtcdbk]{/,"",	s) # eliminate most other accents,
				   # including newly-added Polish ogonek

    gsub(/~/,		" ",	s) # De~Raedt -> De Raedt

    gsub(/{\\i}/,	"i",	s) # dotless i -> i
    gsub(/{\\j}/,	"j",	s) # dotless j -> j
    gsub(/{\\l}/,	"l",	s) # Ko{\l}os -> Kolos

    gsub(/\\i{}/,	"i",	s) # dotless i -> i
    gsub(/\\j{}/,	"j",	s) # dotless j -> j
    gsub(/\\l{}/,	"l",	s) # Ko\l{}os -> Kolos

    ## Prior to version 0.03 of biblabel, umlauted a, o, and u were
    ## converted to ae, oe, and ue, following the German practice of
    ## those transliterations when font support is lacking. From version
    ## 0.03, we follow instead the recommendations of the MLA Handbook
    ## (4th ed., p. 85, section 2.8.2) and the Chicago Manual of Style
    ## (14th ed., p. 741, section 17.103) that the umlaut be dropped
    ## without adding an e after the letter.  This practice is then
    ## consistent with the dropping of all other accents on all other
    ## letters.  The German scharfe-ess (\ss, ess-tzet) is converted to
    ## ss.

    gsub(/\\"A/,	"A",	s) #
    gsub(/\\"O/,	"O",	s) #
    gsub(/\\"U/,	"U",	s) #
    gsub(/\\"\\i/,	"i",	s) # Vorono{\"\i} -> Voronoi
    gsub(/\\"a/,	"a",	s) # J{\"a}nsch -> Jansch
    gsub(/\\"e/,	"e",	s) # Eli{\"e}ns -> Eliens
    gsub(/\\"i/,	"i",	s) # Bsa{\"i}es -> Bsaies
    gsub(/\\"o/,	"o",	s) # B{\"o}rgers -> Borgers
    gsub(/\\"u/,	"u",	s) # R{\"u}de -> Rude
    gsub(/\\"y/,	"y",	s) # Delabbe{\"y} ->Delabbey

    ## The braced forms must give the same letters as the unbraced forms
    ## above, so the upper-case letters keep their case here, too.
    gsub(/\\"{A}/,	"A",	s) #
    gsub(/\\"{O}/,	"O",	s) # {\"{O}}zg{\"u}r -> Ozgur
    gsub(/\\"{U}/,	"U",	s) #
    gsub(/\\"{a}/,	"a",	s) # J{\"{a}}nsch -> Jansch
    gsub(/\\"{e}/,	"e",	s) # Eli{\"{e}}ns -> Eliens
    gsub(/\\"{i}/,	"i",	s) # Bsa{\"{i}}es -> Bsaies
    gsub(/\\"{o}/,	"o",	s) # B{\"{o}}rgers -> Borgers
    gsub(/\\"{u}/,	"u",	s) # H{\"{u}}bner -> Hubner
    gsub(/\\"{y}/,	"y",	s) # Delabbe{\"{y}} -> Delabbey

    gsub(/\\AA{}/,	"Aa",	s) # {\AA}rhus -> Aarhus
    gsub(/\\AE{}/,	"Ae",	s) # {\AE}ro -> Aero
    gsub(/\\OE{}/,	"Oe",	s) # {\OE}rsted -> Oersted
    gsub(/\\aa{}/,	"aa",	s) # Ring\aa -> Ringaa
    gsub(/\\ae{}/,	"ae",	s) # D{\ae}hlen -> Daehlen
    gsub(/\\oe{}/,	"oe",	s) # St{\oe}ren -> Stoeren
    gsub(/\\o{}/,	"o",	s) # Bj\o{}rstad -> Bjorstad
    gsub(/\\ss{}/,	"ss",	s) # F{\"o}{\ss}meier ->Fossmeier)

    gsub(/{\\AA}/,	"Aa",	s) # {\AA}rhus -> Aarhus
    gsub(/{\\AE}/,	"Ae",	s) # {\AE}ro -> Aero
    gsub(/{\\OE}/,	"Oe",	s) # {\OE}rsted -> Oersted
    gsub(/{\\aa}/,	"aa",	s) # Ring\aa -> Ringaa
    gsub(/{\\ae}/,	"ae",	s) # D{\ae}hlen -> Daehlen
    gsub(/{\\oe}/,	"oe",	s) # St{\oe}ren -> Stoeren
    gsub(/{\\o}/,	"o",	s) # Bj\o{}rstad -> Bjorstad
    gsub(/{\\ss}/,	"ss",	s) # F{\"o}{\ss}meier ->Fossmeier)

    gsub(/\\u\\i[{}]/,  "i",	s) # Kosovski{\u\i} -> Kosovskii

    gsub(/\\&/,		"",	s) # AT{\&T} -> ATT

    gsub(/'/,		"",	s) # Il'in -> Ilin
    gsub(/`/,		"",	s) # O`Hearn -> OHearn

    gsub(/[{}]/,	"",	s) # {R}oeck -> Roeck

    ## Warn if we missed some.  These messages often indicate errors in the
    ## input bibliography.
    if (s ~ /[^ \/&{}.,a-zA-Z'0-9-]/)
	warning("WARNING: incomplete accent removal [" lastname "] -> [" s "]")

    ## And then remove everything but letters, digits, and hyphen
    gsub(/[^a-zA-Z0-9-]/,	"",	s)

    return (s)
}


function fix_author_editor(s, k,n,name,t)
{
    ## Return the author/editor value s with Jr/Sr and Roman-numeral
    ## generation suffixes removed from every name, parentheses and
    ## periods turned into blanks, and runs of blanks collapsed.

    t = s

    ## Change parentheses and periods to spaces
    gsub(/[().]/,		" ",	t)

    ## Change explicit space to ordinary space
    gsub(/\\ /,			" ",	t)

    ## Remove Jr-like modifiers.
    ## Problem cases:
    ##    "John Smith Sr and A. Srinivasan"
    ##    "A. Srinivasan and John Smith Sr"
    ##    "A. Srinivasan Sr and A. Srinivasan Jr"
    ##    "Shu-T'ien Xi"
    ## These cannot be handled reliably by sub() and gsub(), because
    ## awk's regular expression matching has no provision for
    ## specification of trailing context.  They can be dealt with if we
    ## first split the field into names of individuals, check for
    ## trailing context by a separate match() call, and then delete
    ## the matched string, minus the trailing context.
    n = split(t, name, " and ")
    for (k = 1; k <= n; ++k)
    {
	## handle upper-case Roman numerals
	name[k] = fix_author_editor_strip_suffix(name[k], ",?[ ~]+[IVXLCDM]+[. }]*$")

	## handle Jr and Sr
	name[k] = fix_author_editor_strip_suffix(name[k], ",?[ ~]+[JjSs]r[. }]*$")
    }
    t = name[1]
    for (k = 2; k <= n; ++k)
	t = t " and " name[k]

    gsub(/ +/," ",t)
    gsub(/ +}/,"}",t)	# NB: reduction of "{Jones, Jr.}" produced "{Jones }"

    return (t)
}


function fix_author_editor_strip_suffix(name,regexp, len)
{
    ## Private helper of fix_author_editor(): delete the text in name
    ## matched by regexp, except for a final trailing-context character
    ## (period, blank, or close brace).  When the match ends with the
    ## suffix itself, as in "John Smith Sr", there is no trailing
    ## context and the whole match is deleted; unconditionally keeping
    ## the last character turned that name into "John Smithr".
    if (match(name,regexp))
    {
	len = RLENGTH
	if (substr(name,RSTART + RLENGTH - 1,1) ~ /[. }]/)
	    len--
	name = (substr(name,1,RSTART - 1) substr(name,RSTART + len))
    }
    return (name)
}


function get_email_address()
{
    ## Build a "user@host" address from the USER and HOST environment
    ## variables; a variable that is not set contributes an empty string.
    return (sprintf("%s@%s", ENVIRON["USER"], ENVIRON["HOST"]))
}


function get_filename( s)
{
    ## Return the current input filename.  This is either FILENAME, if
    ## it is not stdin, or else Filename, which was set in
    ## begin_bibliography() from the global INPUTFILES list.  The reason
    ## for this aliasing is that this program is normally invoked from
    ## the biblabel.sh script, with input piped from bibclean.
    ##
    ## nawk represents stdin by the empty string, "", while gawk and
    ## mawk use "-": both are treated alike here, to avoid bogus
    ## validation suite warnings.

    if ((FILENAME != "") && (FILENAME != "-"))
	s = FILENAME			# a real input file
    else if (Filename != "")
	s = Filename			# stdin, under its INPUTFILES alias
    else
	s = "-"				# stdin, with no alias available

    return (s)
}


function get_personal_name( cmd,s, user,line,field)
{
    ## Return the personal name of the current user, taken from the
    ## fifth (GECOS) field of that user's passwd entry, or "" if no
    ## entry can be found.  The user is identified by the USER
    ## environment variable.
    ##
    ## The NIS passwd map is consulted first (via ypmatch); if that
    ## produces nothing, /etc/passwd is scanned directly.
    ##
    ## The fallback formerly ran "grep :USER: /etc/passwd", which can
    ## never match the login name, because that is the FIRST field of
    ## a passwd line and so has no leading colon.  The file is now read
    ## here and the first field compared exactly, which also avoids
    ## treating the user name as a regular expression.
    user = ENVIRON["USER"]
    if (user == "")			# nothing to look up
	return ("")

    cmd = ("ypmatch " user " passwd | awk -F: '{print $5}'")
    cmd | getline s			# s stays empty if ypmatch fails
    close(cmd)
    if (s == "")
    {
	while ((getline line < "/etc/passwd") > 0)
	{
	    split(line,field,":")
	    if (field[1] == user)
	    {
		s = field[5]
		break
	    }
	}
	close("/etc/passwd")
    }
    return (s)
}


function initialize_maps()
{
    ## Fill the global Map_ASCII[] table, which maps each of the 256
    ## ISO8859-1 characters to a zero- or more-character replacement
    ## string that attempts to maintain mnemonic value.  We reduce
    ## umlaut-letter to letter, rather than letter+e, since that is
    ## the transliteration practice in both English and the
    ## Scandinavian languages.  German would make the latter
    ## reduction instead.
    ##
    ## Letter case is preserved by the mapping: upper-case accented
    ## letters map to upper-case ASCII letters, and lower-case ones to
    ## lower-case letters.  (The upper-case O-grave ... O-diaeresis
    ## entries, octal 322..326, were formerly mapped to lower-case
    ## "o" by mistake.)

    ## Low 128 characters: verbatim mappings:

    Map_ASCII["\000"] = "\000"		# "Ctl-@"
    Map_ASCII["\001"] = "\001"		# "Ctl-A"
    Map_ASCII["\002"] = "\002"		# "Ctl-B"
    Map_ASCII["\003"] = "\003"		# "Ctl-C"
    Map_ASCII["\004"] = "\004"		# "Ctl-D"
    Map_ASCII["\005"] = "\005"		# "Ctl-E"
    Map_ASCII["\006"] = "\006"		# "Ctl-F"
    Map_ASCII["\007"] = "\007"		# "Ctl-G"
    Map_ASCII["\010"] = "\010"		# "Ctl-H"
    Map_ASCII["\011"] = "\011"		# "Ctl-I"
    Map_ASCII["\012"] = "\012"		# "Ctl-J"
    Map_ASCII["\013"] = "\013"		# "Ctl-K"
    Map_ASCII["\014"] = "\014"		# "Ctl-L"
    Map_ASCII["\015"] = "\015"		# "Ctl-M"
    Map_ASCII["\016"] = "\016"		# "Ctl-N"
    Map_ASCII["\017"] = "\017"		# "Ctl-O"
    Map_ASCII["\020"] = "\020"		# "Ctl-P"
    Map_ASCII["\021"] = "\021"		# "Ctl-Q"
    Map_ASCII["\022"] = "\022"		# "Ctl-R"
    Map_ASCII["\023"] = "\023"		# "Ctl-S"
    Map_ASCII["\024"] = "\024"		# "Ctl-T"
    Map_ASCII["\025"] = "\025"		# "Ctl-U"
    Map_ASCII["\026"] = "\026"		# "Ctl-V"
    Map_ASCII["\027"] = "\027"		# "Ctl-W"
    Map_ASCII["\030"] = "\030"		# "Ctl-X"
    Map_ASCII["\031"] = "\031"		# "Ctl-Y"
    Map_ASCII["\032"] = "\032"		# "Ctl-Z"
    Map_ASCII["\033"] = "\033"		# "Ctl-["
    Map_ASCII["\034"] = "\034"		# "Ctl-\"
    Map_ASCII["\035"] = "\035"		# "Ctl-]"
    Map_ASCII["\036"] = "\036"		# "Ctl-^"
    Map_ASCII["\037"] = "\037"		# "Ctl-_"
    Map_ASCII["\040"] = "\040"		# " "
    Map_ASCII["\041"] = "\041"		# "!"
    Map_ASCII["\042"] = "\042"		# """
    Map_ASCII["\043"] = "\043"		# "#"
    Map_ASCII["\044"] = "\044"		# "$"
    Map_ASCII["\045"] = "\045"		# "%"
    Map_ASCII["\046"] = "\046"		# "&"
    Map_ASCII["\047"] = "\047"		# "'"
    Map_ASCII["\050"] = "\050"		# "("
    Map_ASCII["\051"] = "\051"		# ")"
    Map_ASCII["\052"] = "\052"		# "*"
    Map_ASCII["\053"] = "\053"		# "+"
    Map_ASCII["\054"] = "\054"		# ","
    Map_ASCII["\055"] = "\055"		# "-"
    Map_ASCII["\056"] = "\056"		# "."
    Map_ASCII["\057"] = "\057"		# "/"
    Map_ASCII["\060"] = "\060"		# "0"
    Map_ASCII["\061"] = "\061"		# "1"
    Map_ASCII["\062"] = "\062"		# "2"
    Map_ASCII["\063"] = "\063"		# "3"
    Map_ASCII["\064"] = "\064"		# "4"
    Map_ASCII["\065"] = "\065"		# "5"
    Map_ASCII["\066"] = "\066"		# "6"
    Map_ASCII["\067"] = "\067"		# "7"
    Map_ASCII["\070"] = "\070"		# "8"
    Map_ASCII["\071"] = "\071"		# "9"
    Map_ASCII["\072"] = "\072"		# ":"
    Map_ASCII["\073"] = "\073"		# ";"
    Map_ASCII["\074"] = "\074"		# "<"
    Map_ASCII["\075"] = "\075"		# "="
    Map_ASCII["\076"] = "\076"		# ">"
    Map_ASCII["\077"] = "\077"		# "?"
    Map_ASCII["\100"] = "\100"		# "@"
    Map_ASCII["\101"] = "\101"		# "A"
    Map_ASCII["\102"] = "\102"		# "B"
    Map_ASCII["\103"] = "\103"		# "C"
    Map_ASCII["\104"] = "\104"		# "D"
    Map_ASCII["\105"] = "\105"		# "E"
    Map_ASCII["\106"] = "\106"		# "F"
    Map_ASCII["\107"] = "\107"		# "G"
    Map_ASCII["\110"] = "\110"		# "H"
    Map_ASCII["\111"] = "\111"		# "I"
    Map_ASCII["\112"] = "\112"		# "J"
    Map_ASCII["\113"] = "\113"		# "K"
    Map_ASCII["\114"] = "\114"		# "L"
    Map_ASCII["\115"] = "\115"		# "M"
    Map_ASCII["\116"] = "\116"		# "N"
    Map_ASCII["\117"] = "\117"		# "O"
    Map_ASCII["\120"] = "\120"		# "P"
    Map_ASCII["\121"] = "\121"		# "Q"
    Map_ASCII["\122"] = "\122"		# "R"
    Map_ASCII["\123"] = "\123"		# "S"
    Map_ASCII["\124"] = "\124"		# "T"
    Map_ASCII["\125"] = "\125"		# "U"
    Map_ASCII["\126"] = "\126"		# "V"
    Map_ASCII["\127"] = "\127"		# "W"
    Map_ASCII["\130"] = "\130"		# "X"
    Map_ASCII["\131"] = "\131"		# "Y"
    Map_ASCII["\132"] = "\132"		# "Z"
    Map_ASCII["\133"] = "\133"		# "["
    Map_ASCII["\134"] = "\134"		# "\"
    Map_ASCII["\135"] = "\135"		# "]"
    Map_ASCII["\136"] = "\136"		# "^"
    Map_ASCII["\137"] = "\137"		# "_"
    Map_ASCII["\140"] = "\140"		# "`"
    Map_ASCII["\141"] = "\141"		# "a"
    Map_ASCII["\142"] = "\142"		# "b"
    Map_ASCII["\143"] = "\143"		# "c"
    Map_ASCII["\144"] = "\144"		# "d"
    Map_ASCII["\145"] = "\145"		# "e"
    Map_ASCII["\146"] = "\146"		# "f"
    Map_ASCII["\147"] = "\147"		# "g"
    Map_ASCII["\150"] = "\150"		# "h"
    Map_ASCII["\151"] = "\151"		# "i"
    Map_ASCII["\152"] = "\152"		# "j"
    Map_ASCII["\153"] = "\153"		# "k"
    Map_ASCII["\154"] = "\154"		# "l"
    Map_ASCII["\155"] = "\155"		# "m"
    Map_ASCII["\156"] = "\156"		# "n"
    Map_ASCII["\157"] = "\157"		# "o"
    Map_ASCII["\160"] = "\160"		# "p"
    Map_ASCII["\161"] = "\161"		# "q"
    Map_ASCII["\162"] = "\162"		# "r"
    Map_ASCII["\163"] = "\163"		# "s"
    Map_ASCII["\164"] = "\164"		# "t"
    Map_ASCII["\165"] = "\165"		# "u"
    Map_ASCII["\166"] = "\166"		# "v"
    Map_ASCII["\167"] = "\167"		# "w"
    Map_ASCII["\170"] = "\170"		# "x"
    Map_ASCII["\171"] = "\171"		# "y"
    Map_ASCII["\172"] = "\172"		# "z"
    Map_ASCII["\173"] = "\173"		# "{"
    Map_ASCII["\174"] = "\174"		# "|"
    Map_ASCII["\175"] = "\175"		# "}"
    Map_ASCII["\176"] = "\176"		# "~"
    Map_ASCII["\177"] = "\177"		# "DEL"

    ## High 128 characters: non-verbatim mappings.
    ##
    ## In principle, none of these characters should ever occur in
    ## BibTeX files, which are supposed to be restricted to 7-bit
    ## ASCII with TeX control sequences used to generate accented
    ## letters.  However, we try to handle them gracefully, and warn
    ## about their presence.
    ##
    ## The trailing comments name the ISO8859-1 character in plain
    ## ASCII, so that this file itself remains 7-bit clean.

    Map_ASCII["\200"] = ""		# C1 control
    Map_ASCII["\201"] = ""		# C1 control
    Map_ASCII["\202"] = ""		# C1 control
    Map_ASCII["\203"] = ""		# C1 control
    Map_ASCII["\204"] = ""		# C1 control
    Map_ASCII["\205"] = ""		# C1 control
    Map_ASCII["\206"] = ""		# C1 control
    Map_ASCII["\207"] = ""		# C1 control
    Map_ASCII["\210"] = ""		# C1 control
    Map_ASCII["\211"] = ""		# C1 control
    Map_ASCII["\212"] = ""		# C1 control
    Map_ASCII["\213"] = ""		# C1 control
    Map_ASCII["\214"] = ""		# C1 control
    Map_ASCII["\215"] = ""		# C1 control
    Map_ASCII["\216"] = ""		# C1 control
    Map_ASCII["\217"] = ""		# C1 control
    Map_ASCII["\220"] = ""		# C1 control
    Map_ASCII["\221"] = ""		# C1 control
    Map_ASCII["\222"] = ""		# C1 control
    Map_ASCII["\223"] = ""		# C1 control
    Map_ASCII["\224"] = ""		# C1 control
    Map_ASCII["\225"] = ""		# C1 control
    Map_ASCII["\226"] = ""		# C1 control
    Map_ASCII["\227"] = ""		# C1 control
    Map_ASCII["\230"] = ""		# C1 control
    Map_ASCII["\231"] = ""		# C1 control
    Map_ASCII["\232"] = ""		# C1 control
    Map_ASCII["\233"] = ""		# C1 control
    Map_ASCII["\234"] = ""		# C1 control
    Map_ASCII["\235"] = ""		# C1 control
    Map_ASCII["\236"] = ""		# C1 control
    Map_ASCII["\237"] = ""		# C1 control
    Map_ASCII["\240"] = ""		# no-break space
    Map_ASCII["\241"] = ""		# inverted exclamation mark
    Map_ASCII["\242"] = ""		# cent sign
    Map_ASCII["\243"] = ""		# pound sign
    Map_ASCII["\244"] = ""		# currency sign
    Map_ASCII["\245"] = ""		# yen sign
    Map_ASCII["\246"] = ""		# broken bar
    Map_ASCII["\247"] = ""		# section sign
    Map_ASCII["\250"] = ""		# diaeresis
    Map_ASCII["\251"] = ""		# copyright sign
    Map_ASCII["\252"] = ""		# feminine ordinal indicator
    Map_ASCII["\253"] = ""		# left guillemet
    Map_ASCII["\254"] = ""		# not sign
    Map_ASCII["\255"] = ""		# soft hyphen
    Map_ASCII["\256"] = ""		# registered sign
    Map_ASCII["\257"] = ""		# macron
    Map_ASCII["\260"] = ""		# degree sign
    Map_ASCII["\261"] = ""		# plus-minus sign
    Map_ASCII["\262"] = ""		# superscript two
    Map_ASCII["\263"] = ""		# superscript three
    Map_ASCII["\264"] = ""		# acute accent
    Map_ASCII["\265"] = ""		# micro sign
    Map_ASCII["\266"] = ""		# pilcrow (paragraph) sign
    Map_ASCII["\267"] = ""		# middle dot
    Map_ASCII["\270"] = ""		# cedilla
    Map_ASCII["\271"] = ""		# superscript one
    Map_ASCII["\272"] = ""		# masculine ordinal indicator
    Map_ASCII["\273"] = ""		# right guillemet
    Map_ASCII["\274"] = ""		# one quarter
    Map_ASCII["\275"] = ""		# one half
    Map_ASCII["\276"] = ""		# three quarters
    Map_ASCII["\277"] = ""		# inverted question mark
    Map_ASCII["\300"] = "A"		# A grave
    Map_ASCII["\301"] = "A"		# A acute
    Map_ASCII["\302"] = "A"		# A circumflex
    Map_ASCII["\303"] = "A"		# A tilde
    Map_ASCII["\304"] = "A"		# A diaeresis
    Map_ASCII["\305"] = "Aa"		# A ring
    Map_ASCII["\306"] = "Ae"		# AE ligature
    Map_ASCII["\307"] = "C"		# C cedilla
    Map_ASCII["\310"] = "E"		# E grave
    Map_ASCII["\311"] = "E"		# E acute
    Map_ASCII["\312"] = "E"		# E circumflex
    Map_ASCII["\313"] = "E"		# E diaeresis
    Map_ASCII["\314"] = "I"		# I grave
    Map_ASCII["\315"] = "I"		# I acute
    Map_ASCII["\316"] = "I"		# I circumflex
    Map_ASCII["\317"] = "I"		# I diaeresis
    Map_ASCII["\320"] = "Eth"		# Eth
    Map_ASCII["\321"] = "N"		# N tilde
    Map_ASCII["\322"] = "O"		# O grave
    Map_ASCII["\323"] = "O"		# O acute
    Map_ASCII["\324"] = "O"		# O circumflex
    Map_ASCII["\325"] = "O"		# O tilde
    Map_ASCII["\326"] = "O"		# O diaeresis
    Map_ASCII["\327"] = ""		# multiplication sign
    Map_ASCII["\330"] = "O"		# O slash
    Map_ASCII["\331"] = "U"		# U grave
    Map_ASCII["\332"] = "U"		# U acute
    Map_ASCII["\333"] = "U"		# U circumflex
    Map_ASCII["\334"] = "U"		# U diaeresis
    Map_ASCII["\335"] = "Y"		# Y acute
    Map_ASCII["\336"] = "Thorn"		# Thorn
    Map_ASCII["\337"] = "ss"		# sharp s
    Map_ASCII["\340"] = "a"		# a grave
    Map_ASCII["\341"] = "a"		# a acute
    Map_ASCII["\342"] = "a"		# a circumflex
    Map_ASCII["\343"] = "a"		# a tilde
    Map_ASCII["\344"] = "a"		# a diaeresis
    Map_ASCII["\345"] = "aa"		# a ring
    Map_ASCII["\346"] = "ae"		# ae ligature
    Map_ASCII["\347"] = "c"		# c cedilla
    Map_ASCII["\350"] = "e"		# e grave
    Map_ASCII["\351"] = "e"		# e acute
    Map_ASCII["\352"] = "e"		# e circumflex
    Map_ASCII["\353"] = "e"		# e diaeresis
    Map_ASCII["\354"] = "i"		# i grave
    Map_ASCII["\355"] = "i"		# i acute
    Map_ASCII["\356"] = "i"		# i circumflex
    Map_ASCII["\357"] = "i"		# i diaeresis
    Map_ASCII["\360"] = "eth"		# eth
    Map_ASCII["\361"] = "n"		# n tilde
    Map_ASCII["\362"] = "o"		# o grave
    Map_ASCII["\363"] = "o"		# o acute
    Map_ASCII["\364"] = "o"		# o circumflex
    Map_ASCII["\365"] = "o"		# o tilde
    Map_ASCII["\366"] = "o"		# o diaeresis
    Map_ASCII["\367"] = ""		# division sign
    Map_ASCII["\370"] = "o"		# o slash
    Map_ASCII["\371"] = "u"		# u grave
    Map_ASCII["\372"] = "u"		# u acute
    Map_ASCII["\373"] = "u"		# u circumflex
    Map_ASCII["\374"] = "u"		# u diaeresis
    Map_ASCII["\375"] = "y"		# y acute
    Map_ASCII["\376"] = "thorn"		# thorn
    Map_ASCII["\377"] = "y"		# y diaeresis
}


function is_label_in_use(label, in_use)
{
    ## Return 1 if label, compared without regard to letter case, has
    ## already been recorded in In_Use_Label[], and 0 otherwise.
    ##
    ## History: Version 0.01 also required that the recording file
    ## (In_Use_File[]) differ from the current one, on the assumption
    ## that a single input file never contains duplicate labels.  That
    ## suppressed every duplicate-label warning and led to long
    ## generated suffixes (e.g. aaaag), so since Version 0.02 only
    ## membership in In_Use_Label[] is tested.
    in_use = 0
    if (tolower(label) in In_Use_Label)
	in_use = 1
    return (in_use)
}


### Eventually, we may want to prepare more customized labels, but for
### now, all entry types use the book label, Author:yyyy:ABC, based on
### the first author's last name, four digits of the year, and a title
### abbreviation.  In most cases, this already generates a unique label,
### and anyway, in end_bibliography(), we ensure that unique labels are
### generated by adding letter suffixes where needed.

function label_Article()
{
    ## Article labels are formed exactly like Book labels.
    return label_Book()
}


function label_Book()
{
    ## Form a BibNet-style label, e.g. "Smith:1996:AMS", from the
    ## globals Lastname, Year, and Title_Abbrev.  If any of the three
    ## is empty (or zero), there is not enough information to build a
    ## label, and "" is returned instead.

    ## print_debug("label_Book(): Old_Label = <" Old_Label "> Last name = <" Lastname "> ttlabb = <" Title_Abbrev "> Year = <" Year ">")

    if (!Lastname || !Title_Abbrev || !Year)
	return ""

    return (Lastname ":" Year ":" Title_Abbrev)
}


function label_Booklet()
{
    ## Booklet labels are formed exactly like Book labels.
    return label_Book()
}


function label_DEAthesis()
{
    ## DEAthesis labels are formed exactly like Book labels.
    return label_Book()
}


function label_InBook()
{
    ## InBook labels are formed exactly like Book labels.
    return label_Book()
}


function label_InCollection()
{
    ## InCollection labels are formed exactly like Book labels.
    return label_Book()
}


function label_InProceedings()
{
    ## InProceedings labels are formed exactly like Book labels.
    return label_Book()
}


function label_Manual()
{
    ## Manual labels are formed exactly like Book labels.
    return label_Book()
}


function label_MastersThesis()
{
    ## MastersThesis labels are formed exactly like Book labels.
    return label_Book()
}


function label_Misc()
{
    ## Misc labels are formed exactly like Book labels.
    return label_Book()
}


function label_Periodical()
{
    ## Periodical labels are formed exactly like Book labels.
    return label_Book()
}


function label_PhdThesis()
{
    ## PhdThesis labels are formed exactly like Book labels.
    return label_Book()
}


function label_Proceedings()
{
    ## Proceedings labels are formed exactly like Book labels.
    return label_Book()
}


function label_TechReport()
{
    ## TechReport labels are formed exactly like Book labels.
    return label_Book()
}


function label_Unpublished()
{
    ## Unpublished labels are formed exactly like Book labels.
    return label_Book()
}


function make_ascii(s, k,n,parts)
{
    ## Return a copy of s in which every character has been replaced
    ## by its Map_ASCII[] translation.

    ## Step 1: explode s into one-character pieces, parts[1..n].
    if (!Eight_Bit_Regexps)		# must be nawk: peel characters off by hand
    {
	n = length(s)
	k = 0
	while (k < n)
	{
	    k++
	    parts[k] = substr(s,k,1)
	}
    }
    else				# gawk/mawk: split() on "" does it in one call
	n = split(s,parts,"")

    ## Step 2: translate each piece in place.
    k = 1
    while (k <= n)
    {
	parts[k] = Map_ASCII[parts[k]]
	k++
    }

    ## Step 3: glue the translated pieces back together.
    return (unsplit(parts,1,n,""))
}


function make_corporate_ignore_list( k,n,words)
{
    ## Populate Corporate_Ignore[] with the (lowercased,
    ## punctuation-free) words that are to be ignored in corporate
    ## names.  Each listed word becomes an index with value 1.
    n = split("co company corp corporation gmbh group " \
	      "inc incorporated limited ltd staff team", words, " ")
    for (k = 1; k <= n; ++k)
	Corporate_Ignore[words[k]] = 1
}


function make_ignore_list( k,n,words)
{
    ## Populate Ignore[] with the words that are to be ignored in
    ## forming citation labels.  Each listed word becomes an index
    ## with value 1.  The initial list was extracted from the bibindex
    ## badwords list, and covers a few European languages as well as
    ## English.
    ##
    ## The three sections below are kept in the order in which they
    ## were added.  A few words ("aus", "is", "uber") occur in more
    ## than one section; the repetition is harmless, because
    ## assigning the same array element twice changes nothing.
    n = split( \
	"a ab aber als an and are as auf aus az bei bir but " \
	"da das dat de dei dem den der des det di die dos " \
	"e een eene egy ei ein eine einen einer eines eit el en er " \
	"es et ett eyn eyne " \
	"for from fuer fur " \
	"gl gli " \
	"ha haben had hai has hat have he heis hen hena henas het " \
	"hin hinar hinir hinn hith ho hoi " \
	"i il in ist " \
	"ka ke " \
	"l la las le les lo los " \
	"mia mit " \
	"n na nji not " \
	"o oder of on or os others " \
	"s sie sind so " \
	"t ta the to " \
	"um uma un una und une uno unter " \
	"von with y yr " \
	"also any away by cum dans down into is its off onto out " \
	"over sur that these this those uber unto up via without " \
	"zu zum zur " \
	"am at aus aux be bin bist gehabt hab habe habt haette hast " \
	"hatte is ne nicht oben ohne pas seid uber vom", words, " ")

    ## Sections, in order: the original list ("a" ... "yr"); words
    ## added later ("also" ... "zur"); and more words added
    ## [17-Feb-1997] ("am" ... "vom").
    for (k = 1; k <= n; ++k)
	Ignore[words[k]] = 1
}


function make_suffixed_label(label, k,suffixed_new_label)
{
    ## Return label with a letter suffix appended that makes it
    ## unique: the first of label"a", label"b", ..., label"z" that is
    ## neither an index of New_to_Old[] nor reported as in use by
    ## is_label_in_use().
    k = 0
    while (++k <= 26)
    {
	suffixed_new_label = label substr("abcdefghijklmnopqrstuvwxyz",k,1)

	## print_debug("make_suffixed_label(" label "): trying [" suffixed_new_label "]")
	if (!(suffixed_new_label in New_to_Old) && \
	    !is_label_in_use(suffixed_new_label))
	    return (suffixed_new_label)
    }

    ## All 26 single-letter suffixes are taken, so extend the label by
    ## "a" and (tail) recursively try again: labelaa, labelab, ...
    return (make_suffixed_label(label "a"))
}


function make_unique_new_label(new_label, label_1,label_2)
{
    ## Resolve a clash on new_label, and return a fresh suffixed label
    ## for the entry that caused the clash.
    ##
    ## Arguments:
    ##   new_label	the label that two entries both want
    ## Locals:
    ##   label_1	suffixed replacement for the earlier holder
    ##   label_2	suffixed label that is returned
    ##
    ## Globals modified: New_to_Old[] here, and further tables via
    ## replace_label() and record_label_use().

    ## print_debug("make_unique_new_label: new_label <" new_label ">")

    ## we have two duplicate labels, so find replacements for both, so e.g.
    ## Smith:ABC1980 becomes Smith:ABC1980a and Smith:ABC1980b
    if (New_to_Old[new_label])	# then new_label still in use
    {
	## Move the earlier holder to a suffixed label; replace_label()
	## also resets New_to_Old[new_label] to "".
	label_1 = make_suffixed_label(new_label)
	replace_label(new_label,label_1)
    }

    ## At this point New_to_Old[new_label] is empty (or zero): either
    ## it already was, or replace_label() has just cleared it.  The
    ## value copied into New_to_Old[label_2] is therefore empty too.
    label_2 = make_suffixed_label(new_label)
    New_to_Old[label_2] = New_to_Old[new_label]
    ## BUG: New_to_Old[label_2] might not exist yet:
    ## (so record_label_use() is called with an empty old label, and
    ## files its bookkeeping under the empty index).
    ## NOTE(review): presumably the caller stores the real old label
    ## in New_to_Old[] for the returned label afterwards -- confirm.
    record_label_use(get_filename(),New_to_Old[label_2],label_2)

    ## print_debug("make_unique_new_label: new_label <" new_label "> label_1 <" label_1 "> label_2 <" label_2 "> New_to_Old[" label_2 "] = <" New_to_Old[label_2] ">" )

    new_label = label_2
    return (new_label)
}


function print_debug(s)
{
    ## Debugging hook, normally a no-op that returns 0.  To trace
    ## execution, uncomment the print statement below, which writes s
    ## to stderr, tagged with the input location and current label.

    ## print "DEBUG: " get_filename() ":" FNR "\t[" Old_Label "] " s >"/dev/stderr"

    return (0)			# placeholder: gawk rejects an empty function body
}


function print_substitution_line(old,new, k,delimiters,d,sortpipe)
{
    ## Write one "old-label new-label" substitution line to a sort
    ## pipeline, whose (sorted) output appears on stdout when the pipe
    ## is closed or the program exits.
    ##
    ## Arguments:
    ##   old	old citation label; defaults to the global Old_Label
    ##   new	new citation label; defaults to Old_to_New[old]
    ##
    ## Formerly the two arguments were ignored, and the globals were
    ## always printed.  The arguments are now honoured, with the
    ## globals used only when an argument is omitted or empty, so
    ## existing callers of either kind get the same output as before.
    ##
    ## The old label is padded with blanks so that the new label
    ## starts in column 33 whenever the old label is shorter than 32
    ## characters.  The locals k, delimiters, and d are unused, and
    ## are retained only to keep the function header unchanged.
    if (old == "")
	old = Old_Label
    if (new == "")
	new = Old_to_New[old]

    ## sortpipe = "sort -b -f"		# to sort by old labels
    sortpipe = "sort -b -f +1 -2" 	# to sort by new labels

    printf("%s%*s%s\n", old, 32 - length(old), " ", new) | sortpipe
}


function record_label_use(file,old,new, t)
{
    ## Record that label old has been seen in file, and, when new is
    ## nonempty, that old is to be replaced by new.
    ##
    ## BibTeX ignores letter case in labels and entry names, so the
    ## usage tables (In_Use_Label[], In_Use_Label_Name[], and
    ## In_Use_File[]) are indexed by the lowercased label.
    ## Old_to_New[], in contrast, is indexed by the label exactly as
    ## written, so that more readable mixed-case labels (e.g.
    ## Smith:ABC1980) can be supported.
    t = tolower(old)
    In_Use_File[t] = file
    In_Use_Label_Name[t] = old		# remember the original spelling
    In_Use_Label[t] = 1

    if (new)
	Old_to_New[old] = new
    ## print_debug("record_label_use(" file "," old "," new ")")
}


function reduce_corporate_name(s)
{
    ## Reduce a corporate author/editor name to the characters that
    ## are allowed in a citation label: letters, digits, and hyphens.
    ## Digits are kept so that names like 3M or X3H3 survive.
    ##
    ## A portability warning is issued first if s contains 8-bit
    ## characters (a test that is made only when Eight_Bit_Regexps is
    ## set).

    ## print "DEBUG: reduce_corporate_name(" s ")"
    if (Eight_Bit_Regexps)
    {
	if (s ~ "[\200-\377]")
	    warning("PORTABILITY: unexpected 8-bit character(s) found in author/editor name [" s "]")
    }

    s = fix_accents(make_ascii(s))	# transliterate, then reduce accents
    gsub("[^-A-Za-z0-9]","",s)		# drop everything else
    return (s)
}


function reduce_personal_name(s)
{
    ## Reduce a personal author/editor name to the characters that are
    ## allowed in a citation label: letters and hyphens only.
    ##
    ## A portability warning is issued first if s contains 8-bit
    ## characters (a test that is made only when Eight_Bit_Regexps is
    ## set).

    ## print "DEBUG: reduce_personal_name(" s ")"
    if (Eight_Bit_Regexps)
    {
	if (s ~ "[\200-\377]")
	    warning("PORTABILITY: unexpected 8-bit character(s) found in author/editor name [" s "]")
    }

    s = fix_accents(make_ascii(s))	# transliterate, then reduce accents
    gsub("[^-A-Za-z]","",s)		# drop everything else
    return (s)
}


function reduce_TeX_macros(s)
{
    ## Return s with punctuation-like TeX control sequences reduced to
    ## plain text, and runs of blanks squeezed to single blanks.

    gsub("\\\\-","",s)			# discretionary hyphen: delete
    gsub("\\\\&","\\&",s)		# backslash-ampersand: bare ampersand
    gsub("\\\\slash","/",s)		# \slash: slash
    gsub("\\\\endash"," -- ",s)		# \endash: spaced double hyphen
    gsub("\\\\emdash"," --- ",s)	# \emdash: spaced triple hyphen

    s = squeeze_blanks(s)
    return (s)
}


function replace_label(new,newer, file,old)
{
    ## Transfer the New_to_Old[] mapping from label new to label
    ## newer, blank out the entry for new, and record the use of
    ## newer.
    ##
    ## Only the New_to_Old[] entry for new is blanked.  Its
    ## Old_to_New[] and In_Use_Label[] status is deliberately
    ## retained, because those may refer to labels defined in other
    ## bibliography files that we are not processing on this run, and
    ## anyway, once Smith:1980:ABCa, Smith:1980:ABCb, and
    ## Smith:1980:ABCc have been produced, Smith:1980:ABC must not be
    ## freed for later use.
    file = get_filename()
    old = New_to_Old[new]

    New_to_Old[newer] = old
    New_to_Old[new] = ""
    record_label_use(file,New_to_Old[newer],newer)
}


function set_year(year)
{
    ## Store year in the global Year, provided that all of it matches
    ## VALID_YEAR_PATTERN.  Otherwise, issue a warning and store the
    ## placeholder "20xx" instead.
    if (year !~ ("^" VALID_YEAR_PATTERN "$"))
    {
	warning("WARNING: year [" year "] out of acceptable range " \
	    "[1000..2099] for citation label: using 20xx instead")
	Year = "20xx"	# xx, not ??, to avoid interfering with awk patterns
	return
    }

    Year = year
}


function split_at_brace_level_zero(s,array,regexp, k,level,n)
{
    ## Split string s into array[1..n] at each match of regexp that
    ## begins at unbackslashed-brace-level 0, and return n.  Matches
    ## of regexp inside braces do not split the string.  The final
    ## chunk (which is the whole string if no separator was found) is
    ## always stored, so n is at least 1.
    ##
    ## Side effects: RSTART and RLENGTH are changed by match().

    level = 0
    n = 0
    k = 1
    while (k <= length(s))
    {
	## Track the brace level; a brace preceded by a backslash does
	## not count.
	if (substr(s,k-1,1) != "\\")
	{
	    if (substr(s,k,1) == "{")
		level++
	    else if (substr(s,k,1) == "}")
		level--
	}

	## At level 0, accept only a nonempty separator match that
	## starts exactly at position k.
	if ((level == 0) && match(substr(s,k),regexp) && \
	    (RSTART == 1) && (RLENGTH > 0))
	{
	    array[++n] = substr(s,1,k-1)	# chunk before the separator
	    s = substr(s,k - 1 + RSTART + RLENGTH) # text after the separator
	    k = 1				# rescan the shortened string
	}
	else
	    k++
    }
    array[++n] = s			# last chunk (which might be whole string)

    return (n)
}


function squeeze_blanks(s)
{
    ## Return s with each run of two or more blanks reduced to a
    ## single blank.  Only blanks are affected, not tabs or newlines.
    if (index(s,"  ") > 0)	# nothing to do unless there is a double blank
	gsub(/  +/," ",s)
    return (s)
}


function strip_comments(s)
{
    ## Return s with any comment removed: everything from the first
    ## "%" or "#" to the end of the string, together with the blanks
    ## and tabs immediately before it.
    ##
    ## The pattern runs to the end of the string, so it can match at
    ## most once, and sub() suffices.
    sub("[ \t]*[%#].*$","",s)
    return (s)
}


function strip_math(s)
{
    ## Return s with TeX math mode removed.
    ##
    ## A backslashed dollar sign is first changed to a space, so that
    ## it cannot be mistaken for a math-mode delimiter.  The dollar
    ## sign is written here as a bracket expression, because in a
    ## regular expression a bare $ after the backslash pair is the
    ## end-of-string anchor: the former pattern therefore matched only
    ## a backslash at the very end of s, and never a backslashed
    ## dollar sign.
    gsub(/\\[$]/," ",s)		# change \$ to space

    ## delete all math modes, including attached following
    ## letters (e.g. $k$th and $k$-th, but not $B$-spline)
    gsub(/[$][^$]*[$]-th/, 	" ", s)
    gsub(/[$][^$]*[$][A-Za-z]*/," ", s)

    return (s)
}


function strip_parenthesized_strings(s)
{
    ## Return s with every parenthesized substring replaced by a
    ## space, and with blanks then squeezed and trimmed.
    ##
    ## The pattern matches only innermost groups (ones that contain no
    ## further parentheses), so a single pass would leave the outer
    ## pair of a nested group such as "(a (b) c)" behind.  The
    ## substitution is therefore repeated until nothing more matches.
    ## Each substitution removes two parentheses, so the loop always
    ## terminates.  Unbalanced parentheses are left in place.
    while (gsub("[(][^()]*[)]"," ",s) > 0)
	continue
    return (trim(squeeze_blanks(s)))
}


function title_string( k,n,s,titlewords)
{
    ## Save a 1- to 3-letter Title_Abbrev for use in making a citation
    ## label.  Braces are discarded, and only words beginning with a
    ## letter are candidates, so that a title like
    ## "${L}^{2}$ error bounds for the {R}ayleigh-{R}itz-{G}alerkin method"
    ## will reduce to EBR.
    ##
    ## Input:  the current record, $0, whose value string is taken
    ##         with value().
    ## Output: the global Title_Abbrev, set to the upper-cased initial
    ##         letters of the first (at most three) title words that
    ##         are not in Ignore[].  It is empty if there are no such
    ##         words.  Nothing is returned.
    ##
    ## The reductions below are order-dependent: each one assumes the
    ## cleanup done by the steps before it.
    s = value($0)

    gsub(/\\[$ ]/," ",s)	# convert \$ and \<space> to space

    s = strip_math(s)		# discard math mode

    ## collapse runs of hyphen, slash, and whitespace to single spaces
    gsub(/[---\/ \t\n]+/, " ",s)

    ## convert TeX control sequences for dashes and slashes to single space
    gsub(/\\emdash/," ",s)
    gsub(/\\endash/," ",s)
    gsub(/\\slash/, " ",s)

    ## convert dotless-i to i, dotless-j to j, Polish slash-l to l,
    ## and slash-L to L.
    ## (Each pass deletes just the backslash of the leftmost match, so
    ## the letter itself remains in the string.)
    while (match(s,/[\\][ijlL]( *[{}]| +)/) > 0)
	s = substr(s,1,RSTART-1) substr(s,RSTART+1)

    ## delete font changes
    gsub(/[\\](bf|em|it|rm|sc|sf|sl|tt) +/,"",s)
    gsub(/[\\]text(bf|it|md|rm|sc|sf|sl|tt|up) */,"",s)
    gsub(/[\\](it|sc|sl|up)shape */,"",s)
    gsub(/[\\](bf|md)series */,"",s)
    gsub(/[\\](rm|sf|tt)family */,"",s)

    ## delete accents, including newly-added Polish ogonek
    gsub(/[\\][uvHtcbdk][ {]/, "",s)

    ## collapse other control sequences to unbackslashed names [awk gsub()
    ## does not support numbered subexpressions, so we need a loop here]
    while (match(s,/[\\][a-zA-Z]+/) > 0)
	s = substr(s,1,RSTART-1) substr(s,RSTART+1)

    ## remove Dutch, French, Italian, ... contractions
    gsub(/[ ][DdLlNnTt]'/, " ", s)

    ## change quotation marks to space
    gsub(/``|''/," ",s)

    ## change selected punctuation to space
    gsub(/[().,;:!?~]/," ",s)

    ## squeeze out remaining non-alphanumerics
    ## (this is where braces, and any backslashes that are left, go)
    gsub(/[^A-Za-z0-9 ]/, "", s)

    s = tolower(s)		# need uniform case for Ignore[] lookup

    ## Take the initial letter of each acceptable word, stopping as
    ## soon as three letters have been collected.
    n = split(s,titlewords)
    Title_Abbrev = ""
    for (k = 1; (k <= n) && (length(Title_Abbrev) < 3); ++k)
    {
	## print_debug("title_string(): considering <" titlewords[k] ">")
	## Skip ignorable words, and words that start with a digit.
	if ((!(titlewords[k] in Ignore)) && (titlewords[k] ~ /^[A-Za-z]/))
	{
	    ## print_debug("title_string(): using <" titlewords[k] ">")
	    Title_Abbrev = Title_Abbrev substr(titlewords[k],1,1)
	}
    }
    Title_Abbrev = toupper(Title_Abbrev)
}


function trim(s)
{
    ## Return s without its leading and trailing blanks and tabs.
    ## Each pattern is anchored to one end of the string, so it can
    ## match at most once, and sub() suffices.
    sub(/[ \t]+$/,"",s)		# trailing whitespace
    sub(/^[ \t]+/,"",s)		# leading whitespace
    return (s)
}


function unbrace_outer(s)
{
    ## Return s without its outermost brace pair: a leading "{"
    ## (together with any blanks before it) and a trailing "}"
    ## (together with any blanks after it) are removed.  The two
    ## deletions are independent, so a string with only one of the
    ## braces loses just that one.
    ##
    ## The braces are written as bracket expressions, because the
    ## meaning of a bare "{" directly after "*" is undefined in POSIX
    ## extended regular expressions, and awk implementations that
    ## support interval expressions may not take it literally.
    sub("^ *[{]","",s)
    sub("[}] *$","",s)
    return (s)
}


function unsplit(array,start,end,separator, k,s,count)
{
    ## Join elements start..end of a possibly sparse array into a
    ## string with nondeleted elements separated by separator, and
    ## return it.  This is the opposite of split().
    ##
    ## Deleted (nonexistent) elements are skipped entirely.  Existing
    ## elements that hold an empty string still count as elements, so
    ## joining ("", "b") with "," gives ",b".  Formerly, the separator
    ## was omitted for as long as the result was still empty, which
    ## silently dropped leading empty elements.  The elements joined
    ## so far are therefore counted instead.
    s = ""
    count = 0
    for (k = start; k <= end; ++k)
    {
	if (k in array)
	{
	    if (count++ > 0)		# not the first element joined
		s = (s separator)
	    s = (s array[k])
	}
    }
    return (s)
}


function value(s, k1,k2)
{
    ## Return the value string, EXCLUDING surrounding quotes, of a
    ## `key = "value",' or `key = abbrev' pair in s.
    ##
    ## Quoted form: the text between the first quotation mark and the
    ## first quote-comma pair.  If there is no closing quote-comma
    ## pair, everything after the opening quotation mark is returned.
    ##
    ## Unquoted form: the text between "= " and the first comma, or
    ## the rest of s if there is no comma.
    ##
    ## All lengths are taken from the argument s.  The unclosed-string
    ## case formerly used length($0), which is wrong whenever s is not
    ## the current record.
    k1 = index(s,"\"")
    k2 = index(s,"\",")
    if ((k2 == 0) && (k1 > 0))	# recognized unclosed strings
	k2 = length(s) + 1
    if (k2 > k1)
	return (substr(s,k1+1,k2 - (k1 + 1)))

    k1 = index(s,"= ")
    k2 = index(s,",")
    if ((k2 == 0) && (k1 > 0))	# no trailing comma: take the rest of s
	k2 = length(s) + 1
    return (substr(s,k1+2,k2 - (k1 + 2)))
}


function warning(msg,altfilename,altfnr, filename,fnr)
{
    ## Print msg on stderr, prefixed by its source location, in the
    ## "filename:linenumber:msg" format that conforms to emacs
    ## expectations and GNU conventions.
    ##
    ## The location is normally (get_filename(),FNR).  If the optional
    ## argument altfilename is nonempty, (altfilename,altfnr) is
    ## reported instead.
    if (altfilename == "")
    {
	filename = get_filename()
	fnr = FNR
    }
    else
    {
	filename = altfilename
	fnr = altfnr
    }

    ## nawk has the empty string, "", for stdin, while gawk and mawk
    ## have "-": hide this difference to avoid bogus validation suite
    ## warnings.
    if (filename == "")
	filename = "-"

    print filename ":" fnr ":" msg >"/dev/stderr"
}


function year_string( parts,year)
{
    ## Take the year value from the current record, $0, and pass its
    ## first component to set_year().  The value may be a list like
    ## "1990, 1992, 1994", with components separated by blanks,
    ## commas, or semicolons; only the first component is used.
    split(value($0),parts,"[ ,;]+")
    set_year(parts[1])
}
