### ====================================================================
###  @Awk-file{
###     author          = "Nelson H. F. Beebe",
###     version         = "0.00",
###     date            = "01 November 1994",
###     time            = "09:16:04 MST",
###     filename        = "citesub.awk",
###     address         = "Center for Scientific Computing
###                        Department of Mathematics
###                        University of Utah
###                        Salt Lake City, UT 84112
###                        USA",
###     telephone       = "+1 801 581 5254",
###     FAX             = "+1 801 581 4148",
###     checksum        = "57735 244 1127 8676",
###     email           = "beebe@math.utah.edu (Internet)",
###     codetable       = "ISO/ASCII",
###     keywords        = "bibliography, BibTeX, citation label, LaTeX,
###                        TeX",
###     supported       = "yes",
###     docstring       = "This program filters a list of files, or
###                        standard input if none are specified, to
###                        standard output, carrying out BibTeX citation
###                        label substitutions specified by entries in a
###                        file given by a command-line argument.  Each
###                        line of that file contains a pair of citation
###                        labels, surrounded by an arbitrary amount of
###                        whitespace.
###
###                        Usage:
###                        	nawk -f citesub.awk file(s) >newfile
###                        	nawk -f citesub.awk -v SUBFILE=subfile
###					file(s) >newfile
###                        	nawk -f citesub.awk -v SUBFILE=subfile
###					<oldfile >newfile
###
###			   Depending on the version of nawk or gawk,
###			   the -v switch may or may not be required.
###
###			   If `SUBFILE=subfile' is omitted, the
###			   substitution filename is determined by
###			   replacing the extension of the first named
###			   file with .sub.
###
###			   A filename of `-' means standard input.
###
###			   Old citation labels are substituted only if
###			   they are preceded by a left brace, quote,
###			   comma, whitespace, or beginning of line, and
###			   followed by right brace, comma, quote,
###			   percent, whitespace, or end of line.
###
###                        This program was written to deal efficiently
###                        with large bibliographies.  Obvious citation
###                        label substitution schemes using the sed
###                        stream editor, or awk, to compare each input
###                        line with all old citation labels perform
###                        badly, because the execution time is
###                        proportional to the PRODUCT of the number of
###                        labels and the number of input lines.  Also,
###                        with more than about 100 labels, most UNIX
###                        sed implementations exceed internal table
###                        sizes, and simply refuse to run.
###
###                        This version achieves efficiency by
###                        tokenizing each input line, and looking up
###                        in an awk associative array (table) tokens
###                        that are candidates for substitution.  The
###                        array lookup takes constant time,
###                        independent of the number of patterns to be
###                        substituted, so the run time is
###                        proportional only to the number of input
###                        lines.  Nevertheless, the overhead of awk's
###                        interpretive execution makes this
###                        implementation about 50 times slower than
###                        the companion implementation in citesub.c.
###
###                        The checksum field above contains a CRC-16
###                        checksum as the first value, followed by the
###                        equivalent of the standard UNIX wc (word
###                        count) utility output of lines, words, and
###                        characters.  This is produced by Robert
###                        Solovay's checksum utility.",
###  }
### ====================================================================

BEGIN \
{
    # Destination for diagnostics written by warning().  The second
    # assignment overrides the first; comment it out for an awk that
    # does not provide /dev/stderr.
    stderr = "/dev/tty"		# nawk
    stderr = "/dev/stderr"	# gawk (and Sun nawk)

    if (SUBFILE == "")		# try to infer a substitution filename
    {
	SUBFILE = ARGV[1]
	# sub() returns the number of replacements made.  Zero means
	# the first argument has no extension to replace (this also
	# covers `-' and a missing argument); in that case leave
	# SUBFILE empty so that we stop with a usage message instead
	# of reading the input file itself as the substitution table.
	# The extension may not contain a slash, so that a dot in a
	# directory name (my.dir/file) is not mistaken for one.
	if (sub(/[.][^.\/]+$/,".sub",SUBFILE) == 0)
	    SUBFILE = ""
    }

    if (SUBFILE == "")
	usage_and_exit()

    # This string defines the set of characters which can occur
    # in citation labels.   See Nelson H. F. Beebe, "Bibliography
    # prettyprinting and syntax checking", TUGboat 14(3), 222,
    # October (1993) and TUGboat 14(4), 395--419, December (1993).
    LABELCHARS = \
		"ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
		"abcdefghijklmnopqrstuvwxyz" \
		"0123456789" \
		":-" \
		"+/" \
		".'_"

    # Membership table equivalent to LABELCHARS, used by process_line()
    for (k = 1; k <= length(LABELCHARS); ++k)
	ISLABELCHAR[substr(LABELCHARS,k,1)] = 1

    LEADING_CONTEXT  = "{,\" \t"  # what must precede a citation label
    TRAILING_CONTEXT = "},\" \t%" # what must follow a citation label

    # Read the substitution file and save the label mappings in
    # new[old-label] = new-label.  `getline < file' sets $0 and NF
    # but not FNR, so FNR is maintained by hand for warning().
    FILENAME = SUBFILE		# for warning messages
    n = 0			# number of valid substitution pairs
    FNR = 0			# for warning messages
    while ((getline < SUBFILE) > 0)
    {
	FNR++
	if (NF != 2)
	{
	    gsub(/[ \t]+/," ")	# collapse multiple spaces
	    warning("Ignoring bad substitution file line [" $0 \
		"]: expected 2 fields, but found " NF )
	}
	else
	{
	    n++
	    new[$1] = $2
	    # Validate the characters of the old label itself.  The
	    # scan must index $1, not $0: the line may begin with
	    # whitespace, so positions in $0 need not line up with
	    # positions in the label.  Only the first offending
	    # character is reported; the mapping is still recorded.
	    for (k = 1; k <= length($1); ++k)
	    {
		if (index(LABELCHARS,substr($1,k,1)) == 0)
		{
		    warning("Invalid citation label character `" \
			    substr($1,k,1) "' in [" $1 "]")
		    break
		}
	    }
	}
    }
    close(SUBFILE)		# done with the table; release the file

    if (n == 0)
	warning("No label substitutions found in file [" SUBFILE "]")
}

{
    # Efficiency experiment: instead of letting awk invoke this
    # action once per record, take the first record from $0 and then
    # pull every following record with "getline line", which stores
    # the record in a variable and so skips field splitting.  The
    # loop runs until getline stops returning a positive value, i.e.
    # until the remaining input is exhausted.  Timing experiments
    # with gawk and nawk showed no significant improvement from this
    # optimization over the simple body "process_line($0)".

    line = $0
    do
    {
	process_line(line)
    }
    while ((getline line) > 0)
}

#=======================================================================

function error(s)
{
    # Report the fatal message s through warning(), tagged with an
    # "ERROR:" prefix, then terminate the program with exit status 1.
    warning("ERROR:" s)
    exit 1
}

function process_line(line, k,n,t,prev_char,next_char)
{
    # Copy line to standard output, followed by a newline, replacing
    # each old citation label by its new label from new[].
    #
    # The line is cut into alternating tokens: maximal runs of
    # label characters (members of ISLABELCHAR[]) and maximal runs
    # of everything else.  A label-character token is replaced only
    # when it is an index of new[], the character before it is in
    # LEADING_CONTEXT, and the character after it is in
    # TRAILING_CONTEXT; beginning and end of line count as a blank.
    # All other tokens are output unchanged.
    #
    # k is the start of the current token, n its length, t its text;
    # all arguments after line are local variables.
    #
    # Timing notes (Sun SPARCstation LX, master.bib version 1.205:
    # 346KB, 10948 lines, 747 entries).  Classifying characters with
    # index(LABELCHARS,c) instead of the ISLABELCHAR[] lookup made
    # little difference:
    #
    # gawk: 203 sec (index search), 208 sec (table lookup)
    # nawk: 241 sec (index search), 236 sec (table lookup)
    #
    # Run time is independent of the number of substitutions: a nawk
    # run with 1 substitution instead of 747 took 235 sec with table
    # lookup, only 1 sec faster.  Those figures are from an early
    # version that tokenized the whole line, then substituted, then
    # output; merging the three steps into this single function, so
    # that tokens need not be stored, cut the time to 145 sec.

    k = 1
    while (k <= length(line))
    {
	n = 1			# every token has at least one character
	if (substr(line,k,1) in ISLABELCHAR)
	{
	    # candidate label: extend over the run of label characters
	    while (((k + n) <= length(line)) && \
		(substr(line,k+n,1) in ISLABELCHAR))
		n++
	    t = substr(line,k,n)
	    if (t in new)
	    {
		# fetch the neighbouring characters, with a blank
		# standing in for beginning and end of line
		if (k == 1)
		    prev_char = " "
		else
		    prev_char = substr(line,k-1,1)
		if ((k + n) > length(line))
		    next_char = " "
		else
		    next_char = substr(line,k+n,1)
		if ((index(LEADING_CONTEXT,prev_char) > 0) && \
		    (index(TRAILING_CONTEXT,next_char) > 0))
		    t = new[t]
	    }
	}
	else
	{
	    # separator text: extend over the run of non-label characters
	    while (((k + n) <= length(line)) && \
		!(substr(line,k+n,1) in ISLABELCHAR))
		n++
	    t = substr(line,k,n)
	}
	printf("%s", t)
	k += n			# advance by the ORIGINAL token length
    }
    printf("\n")
}


function usage_and_exit()
{
    # Print the command-line synopsis through error(), which reports
    # the message and terminates the program with exit status 1.
    error("Usage: nawk -f citesub.awk " \
	"-v SUBFILE=substitution-file file(s) >outfile")
}


function warning(s)
{
    # Write the diagnostic s to the stream named by the global
    # stderr, prefixed by the current FILENAME and FNR.  When the
    # global old_tag is non-empty it is shown in brackets between
    # the location and the message.
    if (old_tag != "")
	print FILENAME ":" FNR "\t[" old_tag "]\t" s >stderr
    else
	print FILENAME ":" FNR ":" s >stderr
}
