#!/bin/sh
### ====================================================================
###  @UNIX-shell-file{
###     author          = "Nelson H. F. Beebe",
###     version         = "0.06",
###     date            = "13 January 1997",
###     time            = "15:30:08 MST",
###     filename        = "bibjoin.sh",
###     address         = "Center for Scientific Computing
###                        Department of Mathematics
###                        University of Utah
###                        Salt Lake City, UT 84112
###                        USA",
###     telephone       = "+1 801 581 5254",
###     FAX             = "+1 801 581 4148",
###     checksum        = "08022 1269 4412 39408",
###     email           = "beebe@math.utah.edu (Internet)",
###     codetable       = "ISO/ASCII",
###     keywords        = "bibliography, ordering, BibTeX",
###     supported       = "yes",
###     docstring       = "This file contains the bibjoin utility, a
###                        program for joining adjacent duplicate, or
###                        similar, entries.  Such action may be
###                        necessary when bibliography entries are
###                        collected from many sources.
###
###                        bibjoin should be applied to a bibliography
###                        file only after entries have been suitably
###                        ordered so that candidates for joining
###                        appear consecutively.  This can be done
###                        mostly automatically if standardized
###                        citation labels are first generated, then
###                        the bibliography is sorted by citation
###                        labels, such as by bibsort(1).
###
###                        Only a human reader can reliably decide
###                        when two bibliography entries are truly the
###                        same.  bibjoin can help automate much of
###                        this work, but manual editing will almost
###                        certainly still be necessary.  If two
###                        entries are joined, these conditions must
###                        be satisfied:
###
###                        (1) identical citation labels;
###                        (2) identical year;
###                        (3) if ISBNs are given in both entries, the
###                            ISBN lists must be identical;
###                        (4) if ISSNs are given in both entries, the
###                            ISSN lists must be identical;
###                        (5) if an Article entry, identical volume and
###                            initial page numbers, and if number and
###                            month are available, then they must also
###                            be identical.
###
###                        When two `equal' value strings are found for
###                        the same key, one of them is normally
###                        deleted.  Otherwise, both key/value pairs are
###                        output.  Manual editing will then be required
###                        to choose between them.
###
###                        Value strings are normally considered equal
###                        if they match after all characters other than
###                        letters, digits, and plus are removed, and
###                        letter case is ignored.  (The default set of
###                        retained characters can be redefined via the
###                        -ignore-characters option described later.)  This
###                        choice helps to eliminate many match failures
###                        that arise from minor variations in
###                        punctuation, spacing, and capitalization.
###                        bibjoin has no way of determining which of
###                        the two strings should be preserved, so it
###                        normally uniformly discards the shorter one
###                        (which presumably has less `information'):
###                        this means that it will be wrong about half
###                        the time. The shorter string will be
###                        preserved if the -keep-duplicate-values
###                        option described later is used.
###
###                        Syntax errors in the input stream will
###                        cause abrupt termination with a fatal error
###                        message and a non-zero exit code.  The
###                        output will be incomplete, so you should
###                        always examine the output file before
###                        assuming that you can replace the input
###                        file with the output file.
###
###                        Usage:
###                              bibjoin
###                                     [-author]
###                                     [-check-missing]
###                                     [-copyleft]
###                                     [-copyright]
###                                     [-ignore-characters regexp]
###                                     [-keep-duplicate-values]
###                                     [-version]
###                                     [<infile or bibfile(s)] >outfile
###
###                        Switch names may be abbreviated to the
###                        minimal unique prefix.  The switch prefix
###                        character, hyphen, may be doubled for
###                        compatibility with GNU and POSIX conventions.
###
###                        If -author is specified, author information
###                        will be printed on stderr, and the program
###                        will immediately exit with a success return
###                        code.
###
###                        If -check-missing is specified, missing
###                        expected fields will be supplied, with the
###                        field name prefixed with OPT, and the value
###                        string set to a pair of question marks,
###                        e.g.  OPTvolume = "??".  The OPT prefix
###                        ensures that the key is ignored by BibTeX
###                        (and thus that the question marks will not
###                        appear in an output .bbl file), and
###                        together with the question marks,
###                        highlights the missing data.  In addition,
###                        the GNU Emacs bibtex-mode editing support
###                        has convenient functions for removing the
###                        OPT prefixes, and so does bibclean(1).
###
###                        If -copyright or -copyleft is specified,
###                        copyright information will be printed on
###                        stderr, and the program will immediately exit
###                        with a success return code.
###
###                        If -ignore-characters is specified, the next
###                        argument is taken to be a regular expression
###                        defining the set of characters to be ignored
###                        in value string comparisons.  The default is
###                        '[^A-Za-z0-9+]'.
###
###                        If -keep-duplicate-values is specified, then
###                        instead of discarding the shorter of two
###                        value strings that are considered `equal',
###                        the shorter will be preserved with a key
###                        suffixed with the letter `z', e.g. title and
###                        titlez.  If such a key already exists,
###                        additional suffixing `z' letters will be
###                        provided to make the key unique.
###
###                        If -version is specified, the bibjoin version
###                        will be displayed on stderr, and then the
###                        program will immediately exit with a success
###                        return code.
###
###                        All remaining command-line words are assumed
###                        to be input files.  Should such a filename
###                        begin with a hyphen, it must be disguised by
###                        a leading absolute or relative directory
###                        path, e.g. /tmp/-foo.bib or ./-foo.bib.
###
###                        WARNINGS:
###
###                        (1) This simple version does NOT recognize
###                        bib entries with outer parentheses instead of
###                        braces, or with line breaks between the @Name
###                        and following opening brace.  Use bibclean(1)
###                        to standardize and syntax check the
###                        bibliography entries first.
###
###                        (2) Implementation limitations in nawk or
###                        gawk may result in premature termination
###                        because of maximum string lengths being
###                        exceeded.  This can happen with long
###                        abstract or summary strings.  This problem
###                        has been seen more frequently with some
###                        UNIX nawk implementations than with the
###                        Free Software Foundation GNU Project's
###                        gawk, so we prefer to use gawk, if
###                        available.
###
###                        The checksum field above contains a CRC-16
###                        checksum as the first value, followed by the
###                        equivalent of the standard UNIX wc (word
###                        count) utility output of lines, words, and
###                        characters.  This is produced by Robert
###                        Solovay's checksum utility.",
###  }
########################################################################

# Assign default initial values
AUTHOR=0
CHECKMISSING=0
COPYRIGHT=0
FILES=
IGNORECHARACTERS=
KEEPDUPLICATEVALUES=0
VERSION=0

# Print the command-line synopsis on standard output.  Callers redirect
# it to stderr, because stdout normally carries the joined bibliography.
bibjoin_usage()
{
	echo "Usage: $0"
	echo '		[-author]'
	echo '		[-check-missing]'
	echo '		[-copyleft]'
	echo '		[-copyright]'
	echo '		[-ignore-characters regexp]'
	echo '		[-keep-duplicate-values]'
	echo '		[-version]'
	echo '		BibTeXfiles or <infile'
	echo '		>outfile'
	echo 'Try "man bibjoin" for program documentation.'
}

# Loop over the given arguments, collecting bibjoin switches and file
# names into the variables initialized above.  Switches may be
# abbreviated to a unique prefix, and may start with one or two hyphens.
# An unrecognized switch, or -ignore-characters without a following
# regular expression, prints the usage summary on stderr and exits
# with status 1.
#
# NB: file names are accumulated in one blank-separated string, so
# names containing whitespace are not supported.
bibjoin_parse_args()
{
	while [ $# -gt 0 ]
	do
		case $1 in
		--a*|-a*)
			AUTHOR=1
			;;
		--ch*|-ch*)
			CHECKMISSING=1
			;;
		--co*|-co*)
			COPYRIGHT=1
			;;
		--i*|-i*)
			# The regexp is the NEXT word; without this check, a
			# trailing -ignore-characters would shift past the end of
			# the argument list, which some shells treat as fatal.
			if [ $# -lt 2 ]
			then
				echo "$0: switch $1 requires a regexp argument" >&2
				bibjoin_usage >&2
				exit 1
			fi
			shift
			IGNORECHARACTERS=$1
			;;
		--k*|-k*)
			KEEPDUPLICATEVALUES=1
			;;
		--v*|-v*)
			VERSION=1
			;;
		--*|-*)
			bibjoin_usage >&2
			exit 1
			;;
		*)		# everything else is assumed to be a filename
			FILES="$FILES $1"
			;;
		esac
		shift		# discard this switch or filename
	done
}

bibjoin_parse_args "$@"
shift $#			# all arguments have been consumed

# We store the awk program as a (large) string constant
PROGRAM='
# Before any input is read, let initialize() set up the option values,
# the Stderr file name, and the month-number table.
BEGIN	{ initialize() }

# "@Preamble{": apply trim() and squeeze() (both defined elsewhere in
# this program) to the opening line, then copy the complete braced item
# to the output with print_braced_item().
/^[ \t]*@[ \t]*[Pp][Rr][Ee][Aa][Mm][Bb][Ll][Ee][ \t]*{/ {
	trim()
	squeeze()
	k = index($0,"{") + 1	# NOTE(review): k is not read by any code visible here -- confirm before removing
	print_braced_item()
	next
}

# "@String{": apply trim() and squeeze() (both defined elsewhere in
# this program) to the opening line, then copy the complete braced item
# to the output with print_braced_item().
/^[ \t]*@[ \t]*[sS][tT][rR][iI][nN][gG][ \t]*{/ {
	trim()
	squeeze()
	print_braced_item()
	next
}

# "@keyword{label,"
/^[ \t]*@[ \t]*[a-zA-Z0-9]*[ \t]*{/       {
    # Gather the whole entry into Key_Value_Pair[].  The entry held
    # back from the previous match (if any) is then either merged into
    # this one or printed; finally this entry becomes the held-back one.
    collect_braced_item()
    if ("__ENTRY__" in Last_Key_Value_Pair)
    {
        if (same_entry())
            join_entries()
        else
            print_entry(Last_Key_Value_Pair)
    }
    # (no "__ENTRY__" yet means this is the first entry: nothing pending)

    copy_array(Last_Key_Value_Pair,Key_Value_Pair)
    next
}

{				# every line not claimed by a rule above
    trim()
    # Print the line unless both it and the previously remembered line
    # are empty, so that runs of empty lines collapse to a single one.
    if (($0 != "") || (Last_Line != ""))
        print
    Last_Line = $0
    next
}

# After the last input line, hand over to terminate() (defined
# elsewhere in this program).
END	{ terminate() }

function add_key_abbrev_pair(key,abbrev, pad)
{
    # Store the placeholder assignment "  OPTkey = abbrev," (unquoted
    # value) for key in the global Key_Value_Pair[].
    #
    # "  OPT" (5 columns), the key, and " =" (2 columns) are padded with
    # blanks up to column 17, so that values line up.  Keys of ten or
    # more characters leave no room for padding; in that case one blank
    # is still supplied so that the value is never glued to the equals
    # sign.
    pad = substr("                 ",1,17 - (5 + length(key) + 2))
    if (pad == "")
        pad = " "
    Key_Value_Pair[key] = "  OPT" key " =" pad abbrev ","
}

function add_Key_Value_Pair(key,value, pad)
{
    # Store the placeholder assignment "  OPTkey = "value"," (quoted
    # value) for key in the global Key_Value_Pair[].
    #
    # "  OPT" (5 columns), the key, and " =" (2 columns) are padded with
    # blanks up to column 17, so that values line up.  Keys of ten or
    # more characters (e.g. howpublished, organization, institution)
    # leave no room for padding; in that case one blank is still
    # supplied so that the value is never glued to the equals sign.
    pad = substr("                 ",1,17 - (5 + length(key) + 2))
    if (pad == "")
        pad = " "
    Key_Value_Pair[key] = "  OPT" key " =" pad "\"" value "\","
}

function brace_count(s, closers,openers)
{
    # Return the number of left braces in s minus the number of right
    # braces, i.e. the net change in brace nesting caused by s.
    #
    # gsub() returns the number of substitutions made, so replacing
    # every brace by itself is simply a way of counting them.  s is a
    # scalar argument, so only a private copy is touched.
    openers = gsub(/[{]/,"{",s)
    closers = gsub(/[}]/,"}",s)
    return (openers - closers)
}

function check_article( k,n,wanted)
{
    # For an article entry: give every expected field that is absent
    # from Key_Value_Pair[] an OPT-prefixed placeholder.
    n = split("author title journal volume number pages month year",wanted," ")
    for (k = 1; k <= n; ++k)
        check_missing_key(wanted[k])
}

function check_author_editor()
{
    # Either an author or an editor satisfies the requirement; when
    # both are absent, supply a placeholder author.
    if ("author" in Key_Value_Pair)
        return
    if ("editor" in Key_Value_Pair)
        return
    add_Key_Value_Pair("author",UNKNOWN_VALUE)
}

function check_book( k,n,wanted)
{
    # For a book entry: require an author or editor, then give every
    # other expected field that is absent a placeholder.  The ISBN has
    # its own year-dependent rule in check_missing_ISBN().
    check_author_editor()
    n = split("title publisher address ISBN LCCN pages year",wanted," ")
    for (k = 1; k <= n; ++k)
    {
        if (wanted[k] == "ISBN")
            check_missing_ISBN()
        else
            check_missing_key(wanted[k])
    }
}

function check_booklet( k,n,wanted)
{
    # For a booklet entry: give every expected field that is absent
    # from Key_Value_Pair[] an OPT-prefixed placeholder.
    n = split("author title howpublished address year",wanted," ")
    for (k = 1; k <= n; ++k)
        check_missing_key(wanted[k])
}

function check_inbook( k,list,n,wanted)
{
    # For an inbook entry: require an author or editor, then give every
    # other expected field that is absent a placeholder.  The
    # publication details are only asked for when the entry has no
    # crossref field.
    check_author_editor()
    list = "title chapter"
    if (!("crossref" in Key_Value_Pair))
        list = list " publisher address ISBN LCCN"
    list = list " pages year"

    n = split(list,wanted," ")
    for (k = 1; k <= n; ++k)
    {
        if (wanted[k] == "ISBN")
            check_missing_ISBN()
        else
            check_missing_key(wanted[k])
    }
}

function check_incollection( k,list,n,wanted)
{
    # For an incollection entry: require an author or editor, then give
    # every other expected field that is absent a placeholder.  The
    # details of the containing book are only asked for when the entry
    # has no crossref field.
    check_author_editor()
    list = "title"
    if (!("crossref" in Key_Value_Pair))
        list = list " booktitle publisher address ISBN LCCN"
    list = list " pages year"

    n = split(list,wanted," ")
    for (k = 1; k <= n; ++k)
    {
        if (wanted[k] == "ISBN")
            check_missing_ISBN()
        else
            check_missing_key(wanted[k])
    }
}

function check_inproceedings( k,list,n,wanted)
{
    # For an inproceedings entry: require an author or editor, then
    # give every other expected field that is absent a placeholder.
    # The details of the containing volume are only asked for when the
    # entry has no crossref field.
    check_author_editor()
    list = "title"
    if (!("crossref" in Key_Value_Pair))
        list = list " booktitle publisher address ISBN LCCN"
    list = list " pages year"

    n = split(list,wanted," ")
    for (k = 1; k <= n; ++k)
    {
        if (wanted[k] == "ISBN")
            check_missing_ISBN()
        else
            check_missing_key(wanted[k])
    }
}

function check_manual( k,n,wanted)
{
    # For a manual entry: require an author or editor, then give every
    # other expected field that is absent a placeholder.
    check_author_editor()
    n = split("title organization address pages year",wanted," ")
    for (k = 1; k <= n; ++k)
        check_missing_key(wanted[k])
}

function check_mastersthesis( k,n,wanted)
{
    # For a mastersthesis entry: give every expected field that is
    # absent from Key_Value_Pair[] an OPT-prefixed placeholder.
    n = split("author title school address type month year",wanted," ")
    for (k = 1; k <= n; ++k)
        check_missing_key(wanted[k])
}

function check_misc( k,n,wanted)
{
    # For a misc entry: give every expected field that is absent from
    # Key_Value_Pair[] an OPT-prefixed placeholder.
    n = split("author title howpublished year",wanted," ")
    for (k = 1; k <= n; ++k)
        check_missing_key(wanted[k])
}

function check_missing()
{
    # Dispatch on the (lowercased) global Entry_Type to the routine
    # that supplies placeholders for the fields expected in that kind
    # of entry.  Unknown entry types only produce a warning.
    if (Entry_Type == "article")       { check_article();       return }
    if (Entry_Type == "book")          { check_book();          return }
    if (Entry_Type == "booklet")       { check_booklet();       return }
    if (Entry_Type == "inbook")        { check_inbook();        return }
    if (Entry_Type == "incollection")  { check_incollection();  return }
    if (Entry_Type == "inproceedings") { check_inproceedings(); return }
    if (Entry_Type == "manual")        { check_manual();        return }
    if (Entry_Type == "mastersthesis") { check_mastersthesis(); return }
    if (Entry_Type == "misc")          { check_misc();          return }
    if (Entry_Type == "periodical")    { check_periodical();    return }
    if (Entry_Type == "phdthesis")     { check_phdthesis();     return }
    if (Entry_Type == "proceedings")   { check_proceedings();   return }
    if (Entry_Type == "techreport")    { check_techreport();    return }
    if (Entry_Type == "unpublished")   { check_unpublished();   return }

    warning("unrecognized entry type [" Entry_Type "]")
}

function check_missing_ISBN( year)
{
    # Supply a placeholder ISBN only when the entry has none AND has a
    # year field whose numeric value exceeds 1971.
    # NOTE(review): the 1971 cutoff presumably reflects when ISBNs came
    # into use -- confirm.
    if ("ISBN" in Key_Value_Pair)
        return
    if (!("year" in Key_Value_Pair))
        return

    year = 0 + get_value("year",Key_Value_Pair["year"])	# coerce to number
    if (year > 1971)
        add_Key_Value_Pair("ISBN",UNKNOWN_VALUE)
}

function check_missing_key(key)
{
    # Leave an existing field alone; otherwise record an OPT-prefixed
    # placeholder whose value is UNKNOWN_VALUE.
    if (key in Key_Value_Pair)
        return
    add_Key_Value_Pair(key,UNKNOWN_VALUE)
}

function check_periodical( k,n,wanted)
{
    # For a periodical entry: give every expected field that is absent
    # from Key_Value_Pair[] an OPT-prefixed placeholder.
    n = split("key address ISSN LCCN publisher title",wanted," ")
    for (k = 1; k <= n; ++k)
        check_missing_key(wanted[k])
}

function check_phdthesis( k,n,wanted)
{
    # For a phdthesis entry: give every expected field that is absent
    # from Key_Value_Pair[] an OPT-prefixed placeholder.
    n = split("author title school address type month year",wanted," ")
    for (k = 1; k <= n; ++k)
        check_missing_key(wanted[k])
}

function check_proceedings( k,n,wanted)
{
    # For a proceedings entry: when neither author nor editor is
    # present, ask for a key field instead; then give every other
    # expected field that is absent a placeholder.  The ISBN has its
    # own year-dependent rule in check_missing_ISBN().
    if (!(("author" in Key_Value_Pair) || ("editor" in Key_Value_Pair)))
        check_missing_key("key")

    n = split("title publisher address ISBN LCCN pages year",wanted," ")
    for (k = 1; k <= n; ++k)
    {
        if (wanted[k] == "ISBN")
            check_missing_ISBN()
        else
            check_missing_key(wanted[k])
    }
}

function check_techreport( k,n,wanted)
{
    # For a techreport entry: give every expected field that is absent
    # from Key_Value_Pair[] an OPT-prefixed placeholder.
    n = split("author title institution address type month year",wanted," ")
    for (k = 1; k <= n; ++k)
        check_missing_key(wanted[k])
}

function check_unpublished( k,n,wanted)
{
    # For an unpublished entry: give every expected field that is
    # absent from Key_Value_Pair[] an OPT-prefixed placeholder.
    n = split("author title note year",wanted," ")
    for (k = 1; k <= n; ++k)
        check_missing_key(wanted[k])
}

function clear_array(array)
{
    # Remove every element of array.  Splitting an empty string stores
    # zero fields, which leaves the target array empty.
    split("",array)
}

function collect_braced_item( count,s)
{
    # Starting with the current contents of $0, collect lines until we
    # reach a zero brace count. To guard against infinite loops in the
    # event of unbalanced braces, we abruptly terminate processing if
    # an at-sign is detected in column 1.  This function is used for
    # those entry types that require key/value pair reordering.
    #
    # Globals set: Start_FNR and Start_Line (where the entry began, for
    # diagnostics), Entry_Type (lowercased entry name without the
    # at-sign), and Key_Value_Pair[], which is emptied and then
    # refilled with the opening line under the index "__ENTRY__" plus
    # one element per "key = value" assignment.

    Start_FNR = FNR
    Start_Line = $0

    # The rule that calls us accepts optional blanks and tabs before
    # the at-sign, between it and the entry name, and before the
    # opening brace, so all of them must be stripped here; otherwise
    # check_missing() would not recognize the entry type.
    Entry_Type = $0
    sub(/^[ \t]*@[ \t]*/,"",Entry_Type)
    sub(/[ \t]*{.*$/,"",Entry_Type)
    Entry_Type = tolower(Entry_Type)

    clear_array(Key_Value_Pair)
    squeeze()
    trim()
    count = brace_count($0)
    Key_Value_Pair["__ENTRY__"] = $0
    while (count != 0)
    {
        if (getline <= 0)		# end of input (or read error)
            break
        if (substr($0,1,1) == "@") # should use match($0,/^[ \t]+@/), but
                                   # this is faster, and usually correct
            error("New entry encountered before balanced braces found")
        trim()
        # NB: syntax of abbrev, entry, key, and field names taken from
        # biblex source code: see Nelson H. F. Beebe, "Bibliography
        # prettyprinting and syntax checking", TUGboat 14(3), 222,
        # October (1993) and TUGboat 14(4), 395--419, December (1993).
        # NB: in match() below, \047 is an apostrophe, which we cannot
        # use inside the sh apostrophe-delimited string containing
        # this program.
        if (match($0,/^[ \t]*[A-Za-z][---A-Za-z0-9:.+\/\047]*[ \t]*=/))
            s = collect_Key_Value_Pair()	# may read continuation lines
        else
            s = $0
        count += brace_count(s)
    }

    if (CHECKMISSING)
        check_missing()
}

function collect_Key_Value_Pair( key,s)
{
    # This function is called when a line of the form " key = ..." is met.
    #
    # The complete assignment (including any continuation lines of a
    # quoted value, joined with newlines) is stored in the global
    # Key_Value_Pair[] under the key name, and is also returned so that
    # the caller can count its braces.  A key that is already present
    # is reported with warning() and stored under the name with "-z"
    # appended as often as needed to make it unique.
    #
    # Input lines are consumed with getline, so $0 and FNR advance.
    s = $0
    match($0,/[A-Za-z][---A-Za-z0-9:.+\/\047]*/)
    key = substr($0,RSTART,RLENGTH)
    if (key in Key_Value_Pair)
    {
	warning("duplicate key [" key "]")
	while (key in Key_Value_Pair)	# append -z to get a unique sort key
	    key = key "-z"
    }
    if (match($0,/^[ \t]*[A-Za-z][---A-Za-z0-9:.+\/\047]*[ \t]*=[ \t]*\"/))
    {				# then we have key = "...value..."
	# Collect any multiline key/value assignment, using the simple
	# heuristic (guaranteeable by bibclean) that a quoted value string
	# ends with a quote, or quote comma.
	while (match($0,/\",?$/) == 0)
	{
	    if (getline <= 0)
		error("Unexpected end-of-file while collecting key/value pair")
	    if (substr($0,1,1) == "@")	# should use match($0,/^[ \t]+@/), but
					# this is faster, and usually correct
		error("New entry encountered before end of string")
	    trim()
	    s = s "\n" $0
	}
    }
    # else must be key = abbrev, which we assume takes just one line
    Key_Value_Pair[key] = s	# NB: omits final newline
    return (s)
}

function count_capitalized_words(s, cap_count,k,n,w,words)
{
    # Return how many blank-separated words of s look capitalized once
    # protective braces are removed.  A word counts when it is
    #
    #	a single capital letter, optionally followed by a period (A, B.);
    #	a capital, an apostrophe, a capital, and a lowercase letter
    #	    (O<apostrophe>Malley; \047 stands for the apostrophe); or
    #	a capital followed by a lowercase letter (Macdonald).
    cap_count = 0
    n = split(s,words," ")
    for (k = 1; k <= n; ++k)
    {
        w = words[k]
        gsub(/[{}]/,"",w)
        if ((w ~ /^[A-Z][.]?$/) || \
            (w ~ /^[A-Z]\047[A-Z][a-z]/) || \
            (w ~ /^[A-Z][a-z]/))
            cap_count++
    }
    return (cap_count)
}

function copy_array(to_array,from_array, idx)
{
    # Make to_array an exact copy of from_array: first empty it (an
    # empty string splits into zero fields), then duplicate each element.
    split("",to_array)
    for (idx in from_array)
        to_array[idx] = from_array[idx]
}

# Report msg through message() with a "??FATAL ERROR:" prefix, then
# stop the program with exit status 1.
function error(msg)
{		# print a message and terminate with failing exit code
    message("??FATAL ERROR:" msg)
    exit(1)
}

function get_pair(kv_pair,key, s)
{
    # Remove kv_pair[key] from the table and return it followed by a
    # newline.  An empty string is returned (and the table is left
    # untouched) when the key is not present.
    if (!(key in kv_pair))
        return ("")

    s = kv_pair[key] "\n"
    delete kv_pair[key]
    return (s)
}

function get_value(key,key_value, s)
{
    # Return the value part of a stored assignment to key:
    #
    #	key = "value",	-> value	(quotes and final comma removed)
    #	key = abbrev,	-> abbrev	(final comma removed)
    #	key = 1994,	-> 1994		(final comma removed)
    #
    # Unquoted numbers are legal BibTeX values, so they are accepted
    # along with abbreviations.  An empty string is returned when
    # key_value is not an assignment to key in one of these forms (for
    # example, when it was stored with an OPT prefix).
    #
    # NB: key is pasted into a regular expression as is, so regular
    # expression metacharacters in it are not taken literally.
    if (match(key_value,"^[ \t]*" key "[ \t]*=[ \t]*\""))
    {				# have key = "value"
        s = substr(key_value,RSTART+RLENGTH)
        sub(/\",$/,"",s)
        sub(/\"$/,"",s)
    }
    else if (match(key_value,"^[ \t]*" key "[ \t]*=[ \t]*[a-zA-Z0-9]"))
    {				# have key = abbrev, or key = number
        # the match includes the first character of the value: back up
        s = substr(key_value,RSTART+RLENGTH-1)
        sub(/,$/,"",s)
    }
    else
        s = ""
    ## print "DEBUG: get_value(" key ",[" key_value "]) -> [" s "]"
    return (s)
}

function initialize()
{
    # One-time setup, called from the BEGIN rule.  The option variables
    # AUTHOR, CHECKMISSING, COPYRIGHT, IGNORECHARACTERS,
    # KEEPDUPLICATEVALUES, and VERSION are expected to have been set
    # from outside this function; each informational option that is
    # nonzero prints its text on Stderr and exits at once, in the order
    # author, copyright, version.
    BIBJOIN_VERSION = "bibjoin version 0.06 [13-Jan-1997]"
    # The second assignment wins; the first is kept as a record of the
    # alternative for awk implementations without /dev/stderr.
    Stderr = "/dev/tty"	# nawk
    Stderr = "/dev/stderr"	# gawk, and some recent nawk installations

    if (AUTHOR + 0)		# coerce string to number
	print_author_and_exit(Stderr)

    CHECKMISSING += 0	# coerce string to number

    if (COPYRIGHT + 0)	# coerce string to number
	print_copyright_and_exit(Stderr)

    # Default: ignore everything except letters, digits, and plus
    if (IGNORECHARACTERS == "") IGNORECHARACTERS = "[^A-Za-z0-9+]"
    KEEPDUPLICATEVALUES += 0 # coerce string to number
    UNKNOWN_VALUE = "??"

    if (VERSION + 0)	# coerce string to number
	print_version_and_exit(Stderr)

    # Month abbreviations as they appear in bibdate values; see ISO_date()
    Number_of_Month["Jan"]  = 1
    Number_of_Month["Feb"]  = 2
    Number_of_Month["Mar"]  = 3
    Number_of_Month["Apr"]  = 4
    Number_of_Month["May"]  = 5
    Number_of_Month["Jun"]  = 6
    Number_of_Month["Jul"]  = 7
    Number_of_Month["Aug"]  = 8
    Number_of_Month["Sep"]  = 9
    Number_of_Month["Oct"]  = 10
    Number_of_Month["Nov"]  = 11
    Number_of_Month["Dec"]  = 12
}

function ISO_date(bibdate, date,parts,n)
{
    # bibdate should be one of two formats:
    #	"Wed Aug 24 17:14:45 MDT 1994"
    #	"Wed Aug 21 17:23:30 1996"
    # These are converted to something like "1996.08.21.17:23:30"
    # unless bibdate cannot be recognized, in which case an empty
    # string is returned.  The time zone is necessarily ignored,
    # since we have no sensible way of supplying an omitted one.
    #
    # The result is a fixed-width string, so two results can be ordered
    # chronologically with an ordinary string comparison.
    #
    # Reads the global Number_of_Month[] table set up by initialize().

    # Both formats differ only in the optional three-letter time zone
    # before the year, so one pattern serves for both.
    if (!match(bibdate,/^[A-Z][a-z][a-z] [A-Z][a-z][a-z] +[0-9][0-9]? [0-9][0-9]?:[0-9][0-9]:[0-9][0-9]( [A-Z][A-Z][A-Z])? [12][0-9][0-9][0-9]$/))
        return ("")

    n = split(bibdate,parts," ")	# the year is always the last word

    # An unknown month name means that the date is unrecognizable.
    # Testing with "in" also avoids creating a junk table element.
    if (!(parts[2] in Number_of_Month))
        return ("")

    date = sprintf("%s.%02d.%02d.%s", \
                   parts[n], Number_of_Month[parts[2]], \
                   0 + parts[3], \
                   parts[4])

    # A one-digit hour would give a shorter string that does not sort
    # properly, so such dates are treated as unrecognizable too.
    return ((length(date) == 19) ? date : "")
}

function join_bibdate( last_bibdate,this_bibdate)
{
    # When both the previous and the current entry carry a bibdate that
    # ISO_date() can recognize, keep only the later one: it ends up in
    # Key_Value_Pair[] and the element in Last_Key_Value_Pair[] is
    # removed (on a tie, the value from the previous entry is kept).
    # In every other case both arrays are left untouched.
    if (!("bibdate" in Key_Value_Pair))
        return
    if (!("bibdate" in Last_Key_Value_Pair))
        return

    last_bibdate = ISO_date(get_value("bibdate",Last_Key_Value_Pair["bibdate"]))
    this_bibdate = ISO_date(get_value("bibdate",Key_Value_Pair["bibdate"]))
    if ((last_bibdate == "") || (this_bibdate == ""))
        return

    if (last_bibdate >= this_bibdate)
        Key_Value_Pair["bibdate"] = Last_Key_Value_Pair["bibdate"]
    delete Last_Key_Value_Pair["bibdate"]
}

function join_entries( key)
{
    # Join entries by copying Last_Key_Value_Pair[] into Key_Value_Pair[]
    # so that on exit, Last_Key_Value_Pair[] is completely empty.
    #
    # For each key of the previous entry, one of four things happens:
    #
    #	(1) the current entry lacks the key: the pair is copied over;
    #	(2) both values are equal after reduce_string(): one value is
    #	    kept under the key and the other is handed to
    #	    save_duplicate_key() (defined elsewhere in this program);
    #	(3) the key is author or editor and same_personal_names()
    #	    accepts the two values: the combined name list is stored;
    #	(4) otherwise: the previous value is stored under a fresh
    #	    numbered key from new_key(), for later manual resolution.

    join_bibdate()
    for (key in Last_Key_Value_Pair)
    {
	if (!(key in Key_Value_Pair))
	    Key_Value_Pair[key] = Last_Key_Value_Pair[key]
	else if (reduce_string(Last_Key_Value_Pair[key]) == \
	    reduce_string(Key_Value_Pair[key]))
	{			# same key, same reduced value: save longer
	    if (length(Last_Key_Value_Pair[key]) > length(Key_Value_Pair[key]))
	    {
		save_duplicate_key(key,Key_Value_Pair[key])
		Key_Value_Pair[key] = Last_Key_Value_Pair[key]
	    }
	    else if (length(Last_Key_Value_Pair[key]) < length(Key_Value_Pair[key]))
		save_duplicate_key(key,Last_Key_Value_Pair[key])
	    else if (Last_Key_Value_Pair[key] != Key_Value_Pair[key])
	    {
		# When two values are equal ignoring letter case, save
		# the one with more capitalized words, on the grounds
		# that it has information that is lost when words are
		# converted to a single letter case.  It is common
		# in some databases to downcase the original updowncase
		# titles, thereby losing information that we prefer to
		# preserve.
		if (count_capitalized_words(get_value(key,Last_Key_Value_Pair[key])) > \
			count_capitalized_words(get_value(key,Key_Value_Pair[key])))
		{
		    save_duplicate_key(key,Key_Value_Pair[key])
		    Key_Value_Pair[key] = Last_Key_Value_Pair[key]
		}
		else
		    save_duplicate_key(key,Last_Key_Value_Pair[key])
	    }
	    # (byte-for-byte identical values need no action at all)
	}
	else if (((key == "author") || (key == "editor")) && \
	    same_personal_names(get_value(key,Last_Key_Value_Pair[key]),\
				get_value(key,Key_Value_Pair[key])))
	{			# names differ only in initials vs. full names
	    Key_Value_Pair[key] = "  " key " =       \"" Combined_Names "\","
				# global value Combined_Names is set by same_personal_names()
	}
	else			# same key, but different value
	{
	    ## print "DEBUG: join_entries [" Last_Key_Value_Pair[key] "] [" \
	    ##   Key_Value_Pair[key] "]"

	    Key_Value_Pair[new_key(Key_Value_Pair,key)] = \
		Last_Key_Value_Pair[key]
	}
	delete Last_Key_Value_Pair[key]
    }
}

function min(a,b)
{
    # Return the smaller of a and b (b when they compare equal).
    if (a < b)
        return (a)
    return (b)
}

function message(msg)
{
    # Write one diagnostic line on Stderr: the current file name and
    # line number, the message text, and (after a tab) the line number
    # and text recorded in Start_FNR and Start_Line.
    print FILENAME ":" FNR ":" msg "\tIn:" Start_FNR ":" Start_Line >Stderr
}

function max(a,b)
{
    # Return the larger of a and b (b when they compare equal).
    if (a > b)
        return (a)
    return (b)
}

function new_key(array,key, k,keynew)
{
    # Return the first of key0001, key0002, ... that is not yet an
    # index of array.  The array itself is not modified.
    k = 1
    keynew = sprintf("%s%04d",key,k)
    while (keynew in array)
        keynew = sprintf("%s%04d",key,++k)
    return (keynew)
}

function order_entry(kv_pair, item,k)
{
    # Return the complete text of the entry stored in kv_pair[]: the
    # opening line, then each key/value pair on its own line in the
    # order given by Sorted_Keys[], then a closing brace line.  The
    # pairs are removed from kv_pair[] as they are fetched by get_pair().
    #
    # NOTE(review): sort_keys() is defined elsewhere in this program;
    # it is assumed to fill the global Sorted_Keys[1..n] -- confirm.

    ## for (k in kv_pair)
    ##	  print "DEBUG:",k,":",kv_pair[k] | "sort"
    ## close("sort")

    # For the purposes of manual merging, it is best to have the keys
    # appear in strictly sorted order.  Duplicate keys with different
    # string values will then appear consecutively, because their
    # table indexes take the form "key", "key0001", "key0002", etc.
    # Once manual editing is complete, biborder(1) can be used to
    # standardize the key order.

    item = get_pair(kv_pair,"__ENTRY__")

    sort_keys(kv_pair)

    # The loop stops at the first empty (or unset) Sorted_Keys element
    for (k = 1; Sorted_Keys[k]; ++k) # output them ordered by key
	item = item get_pair(kv_pair,Sorted_Keys[k])

    if (item != "")		# NB: empty item can happen in BEGIN and END actions
	item = item "}\n"
    return (item)
}

function print_braced_item(count)
{
    # Copy the current line, and then as many following input lines as
    # it takes for the running brace count to return to zero, straight
    # to the output.  Used for @Preamble{...} and @String{...}, which
    # need no special processing.  count is a local variable; callers
    # pass no argument.
    #
    # The position and text of the opening line are recorded in
    # Start_FNR and Start_Line for diagnostics, and the last line
    # handled is remembered in Last_Line.

    Start_FNR = FNR
    Start_Line = $0
    print $0
    for (count = brace_count($0); count != 0; count += brace_count($0))
    {
        if (getline <= 0)		# end of input (or read error)
            break
        if (substr($0,1,1) == "@")
            error("New entry encountered before balanced braces found")
        print $0
    }
    Last_Line = $0
}

function print_author_and_exit(tofile)
{
    # Write the author name and address on tofile, one line at a time,
    # then stop the program with a success exit code.  Tab characters
    # are written as \t escapes so that they are visible in the source.
    print " Author:"						>tofile
    print "\tNelson H. F. Beebe"				>tofile
    print "\tCenter for Scientific Computing"			>tofile
    print "\tDepartment of Mathematics"				>tofile
    print "\tUniversity of Utah"				>tofile
    print "\tSalt Lake City, UT 84112"				>tofile
    print "\tUSA"						>tofile
    print "\tEmail: beebe@math.utah.edu (Internet)"		>tofile
    print " \tWWW URL: http://www.math.utah.edu/~beebe"		>tofile
    exit(0)
}

function print_copyright_and_exit(tofile, rule)
{
    # Write the public-domain notice, framed by two rows of asterisks
    # above and below, on tofile; then stop the program with a success
    # exit code.
    rule = "*********************************************"
    print rule							>tofile
    print rule							>tofile
    print "*** This program is in the PUBLIC DOMAIN. ***"	>tofile
    print rule							>tofile
    print rule							>tofile
    exit(0)
}

function print_entry(kv_pair)
{
    # Print the entry stored in kv_pair[] as assembled by order_entry(),
    # and remember the printed text in Last_Line.
    Last_Line = order_entry(kv_pair)
    printf("%s", Last_Line)
}

function print_version_and_exit(tofile)
{
    # Write the version string set by initialize() on tofile, then stop
    # the program with a success exit code.
    print BIBJOIN_VERSION					>tofile
    exit(0)
}

function reduce_string(s)
{
    # Return s in the canonical form used for value comparisons: every
    # match of the regular expression IGNORECHARACTERS is deleted (by
    # default, all but letters and digits and plus, the latter for
    # C++), and what remains is converted to lowercase.  s is a scalar
    # argument, so only a private copy is changed.
    gsub(IGNORECHARACTERS,"",s)
    return (tolower(s))
}

function same_entry( last_first_page,this_first_page, \
		     last_last_page,this_last_page, \
		     last_month,this_month, \
		     last_number,this_number, \
		     last_ISBN,this_ISBN, \
		     last_ISSN,this_ISSN)
{
    # Return 1 if the previous entry (Last_Key_Value_Pair[]) and the
    # current entry (Key_Value_Pair[]) are candidates for joining, and
    # otherwise 0.  All arguments are local variables; callers pass
    # none.
    #
    # The opening lines must match ignoring letter case, the years must
    # match, and ISBN and ISSN values must match whenever both entries
    # supply them.  Articles must in addition agree in volume, in month
    # and number when both entries supply them, and in first page when
    # both entries supply one.
    #
    # Side effect: when two articles agree in first page, and exactly
    # one of them has a last page consisting only of question marks,
    # its pages value is replaced by the complete one from the other
    # entry.
    #
    # The the_xxx() accessor functions are defined elsewhere in this
    # program.

    if (tolower(the_entry(Last_Key_Value_Pair)) != \
        tolower(the_entry(Key_Value_Pair)))
        return (0)
    else if (the_year(Last_Key_Value_Pair) != the_year(Key_Value_Pair))
        return (0)
    else
    {
        last_ISBN = the_ISBN(Last_Key_Value_Pair)
        this_ISBN = the_ISBN(Key_Value_Pair)
        if ((last_ISBN != "") && \
            (this_ISBN != "") && \
            (last_ISBN != this_ISBN))
            return (0)

        last_ISSN = the_ISSN(Last_Key_Value_Pair)
        this_ISSN = the_ISSN(Key_Value_Pair)
        if ((last_ISSN != "") && \
            (this_ISSN != "") && \
            (last_ISSN != this_ISSN))
            return (0)
    }

    # If we have a match so far, and this is not an article, then
    # the entries are considered to match.  Otherwise, we go on and
    # compare volume, number, pages, and month values.
    if (tolower(the_entry(Key_Value_Pair)) !~ /@article/)
        return (1)
    else if (the_volume(Last_Key_Value_Pair) != the_volume(Key_Value_Pair))
        return (0)
    else
    {
        last_month = the_month(Last_Key_Value_Pair)
        this_month = the_month(Key_Value_Pair)
        if ((last_month != "") && \
            (this_month != "") && \
            (last_month != this_month))
            return (0)

        last_number = the_number(Last_Key_Value_Pair)
        this_number = the_number(Key_Value_Pair)
        if ((last_number != "") && \
            (this_number != "") && \
            (last_number != this_number))
            return (0)

        last_first_page = the_first_page(Last_Key_Value_Pair)
        this_first_page = the_first_page(Key_Value_Pair)
        last_last_page = the_last_page(Last_Key_Value_Pair)
        this_last_page = the_last_page(Key_Value_Pair)
        if ((last_first_page == "") || (this_first_page == ""))
            return (1)
        else if (last_first_page != this_first_page)
            return (0)		# first pages must agree BEFORE any
                                # unknown last page is filled in
        else if ((last_last_page ~ /^[?][?]*$/) && (this_last_page !~ /^[?][?]*$/))
        {			# merge pages = "123--??" with "123--124"
            Last_Key_Value_Pair["pages"] = Key_Value_Pair["pages"]
            return (1)
        }
        else if ((last_last_page !~ /^[?][?]*$/) && (this_last_page ~ /^[?][?]*$/))
        {			# merge pages = "123--124" with "123--??"
            Key_Value_Pair["pages"] = Last_Key_Value_Pair["pages"]
            return (1)
        }
        else
            return (1)
    }
}

function same_personal_names(last_value,this_value, \
	k,last_persons,n_last,n_this,the_person,this_persons)
{
    # Given two author/editor values, such as
    #
    #	"W. H. Durdan and W. J. Bowhill and J. F. Brown"
    #	"William H. Durdan and W. Bowhill and J. Frederick Brown"
    #
    # return 1 if they match by word/initial, and otherwise 0.
    #
    # Also, if the return value is 1, set the global variable
    # Combined_Names to a string with the longest names (e.g. "William"
    # instead of "W.").  For the above example, it would be set to
    #
    #	"William H. Durdan and W. J. Bowhill and J. Frederick Brown"
    #
    # Because some bibliography sources drop initials, two personal
    # names are considered to match if one has extra initials AFTER the
    # first (e.g. "W. J. Bowhill" matches "W. Bowhill", but does not
    # match "J. Bowhill").
    #
    # For entries with more than three authors, the UnCover database
    # stores only the first, second, and last, so we also consider
    # values to match if one has exactly three names, the other has more
    # than three, and the first, second, and last in each match.  In
    # that case, Combined_Names holds the complete longer list, with
    # the first, second, and last names replaced by their merged forms.
    #
    # Arguments after this_value are local variables.  On a return
    # value of 0, Combined_Names may hold a partial result and must not
    # be used.

    ## print "DEBUG: same_personal_names([" last_value "],[" this_value "])"
    Combined_Names = ""
    n_last = split(last_value,last_persons,/ +and +/)
    n_this = split(this_value,this_persons,/ +and +/)
    if (n_last == n_this)	# easy case: same number of personal names
    {
	for (k = 1; k <= n_last; ++k)
	{
	    the_person = same_person(last_persons[k],this_persons[k])
	    if (the_person == "")
		return (0)	# names mismatch
	    Combined_Names = Combined_Names the_person ((k < n_last) ? " and " : "")
	}
	return (1)
    }
    else if ((min(n_last,n_this) == 3) && (max(n_last,n_this) > 3))
    {				# UnCover database special case
	if (n_last > 3)
	{			# interchange the two arguments by resplitting
	    n_last = split(this_value,last_persons,/ +and +/)
	    n_this = split(last_value,this_persons,/ +and +/)
	}
	# From here on, last_persons[] is the three-name list and
	# this_persons[] is the longer list.
	for (k = 1; k <= 2; ++k)
	{
	    the_person = same_person(last_persons[k],this_persons[k])
	    if (the_person == "")
		return (0)	# names mismatch
	    Combined_Names = Combined_Names the_person " and "
	}
	the_person = same_person(last_persons[3],this_persons[n_this])
	if (the_person == "")
	    return (0)		# names mismatch

	# Names 3 .. n_this-1 of the longer list have no counterpart in
	# the three-name list, so copy them unchanged.  The loop must
	# start at 3, not 4, or the third name would be lost.
	for (k = 3; k < n_this; ++k)
	    Combined_Names = Combined_Names this_persons[k] " and "
	Combined_Names = Combined_Names the_person

	return (1)
    }
    else			# different count of names: no match possible
	return (0)
}

function same_person(last_person,this_person, \
	k,last_words,n_this,n_last,this_words,the_person,the_word)
{
    # Compare two personal names word by word, using same_word() for
    # each pair.  Return the merged name (the result of same_word() for
    # each compared pair) if the names match, and otherwise an empty
    # string.
    #
    # The names may have different word counts, e.g. "P. D. Bach" and
    # "P. D. Q. Bach".  The leading words of the shorter name are then
    # compared with the leading words of the longer name, the final
    # words of both are compared with each other, and the unmatched
    # words of the longer name are copied into the result unchanged.
    #
    # Arguments after this_person are local variables.

    ## print "DEBUG: same_person([" last_person "],[" this_person "])"
    n_last = split(last_person,last_words,/ +/)
    n_this = split(this_person,this_words,/ +/)
    if (n_last > n_this)
    {
	# Resplit with the roles exchanged, so that last_words[] always
	# holds the name with no more words than this_words[].
	n_last = split(this_person,last_words,/ +/)
	n_this = split(last_person,this_words,/ +/)
    }

    # All words of the shorter name except its final one.
    the_person = ""
    for (k = 1; k < n_last; ++k)
    {
	the_word = same_word(last_words[k],this_words[k])
	if (the_word == "")
	    return ("")		# names differ
	the_person = the_person the_word " "
    }

    # The final words of both names (normally the family names).
    the_word = same_word(last_words[n_last],this_words[n_this])
    if (the_word == "")
	return ("")		# family names differ

    # Extra words present only in the longer name; this loop does
    # nothing when both names have the same word count.
    for (k = n_last; k < n_this; ++k)
	the_person = the_person this_words[k] " "

    return (the_person the_word)
}

function same_word(last_word,this_word)
{
    # Compare two words of a personal name.  Return the longer form if
    # they are identical, or if the shorter one is a single letter
    # (optionally followed by a period) equal to the first letter of
    # the longer one, e.g. "J." and "James" give "James".  Otherwise,
    # return an empty string.

    ## print "DEBUG: same_word([" last_word "],[" this_word "])"
    if (last_word == this_word)
	return (last_word)

    if (length(last_word) == length(this_word))
	return ("")		# lengths match, but words differ

    if (length(last_word) < length(this_word))
    {				# e.g. "J." and "James"
	sub(/[.]$/,"",last_word)
	return (((length(last_word) == 1) && \
		 (last_word == substr(this_word,1,1))) ? this_word : "")
    }

    # Only remaining case: this_word is the shorter one,
    # e.g. "James" and "J."
    sub(/[.]$/,"",this_word)
    return (((length(this_word) == 1) && \
	     (this_word == substr(last_word,1,1))) ? last_word : "")
}

function save_duplicate_key(key,value, newkey)
{
    # When the global option KEEPDUPLICATEVALUES is true, store value
    # in the global array Key_Value_Pair under a fresh key formed by
    # appending one or more "z" characters to key, and rename the
    # leading key in value to match.  When the option is false, do
    # nothing.  newkey is a local variable.

    if (!KEEPDUPLICATEVALUES)
	return

    # Append "z" until the name is not already in use.
    newkey = key
    do
	newkey = newkey "z"
    while (newkey in Key_Value_Pair)

    sub(("^ *" key),("  " newkey),value)
    Key_Value_Pair[newkey] = value
}

function sort_keys(kv_pair, k,key,m,n)
{
    # Fill the global array Sorted_Keys[1..n] with the indices of
    # kv_pair, ordered by their lowercased spelling.  Arguments after
    # kv_pair are local variables.

    clear_array(Sorted_Keys)

    # Collect the indices in whatever order awk enumerates them.
    n = 0
    for (key in kv_pair)
	Sorted_Keys[++n] = key

    # Simple exchange sort: after pass k, slot k holds the smallest
    # remaining key.
    k = 1
    while (k < n)
    {
	m = k + 1
	while (m <= n)
	{
	    if (tolower(Sorted_Keys[m]) < tolower(Sorted_Keys[k]))
	    {
		key = Sorted_Keys[k]
		Sorted_Keys[k] = Sorted_Keys[m]
		Sorted_Keys[m] = key
	    }
	    ++m
	}
	++k
    }
}

function squeeze( kbrace,kspace)
{
    # Normalize the current input record ($0) in place: remove blanks
    # and tabs around a leading @, and then remove blanks and tabs
    # between the entry name and its opening brace.  Both arguments
    # are local variables.

    sub(/^[ \t]*@[ \t]*/,"@",$0)

    kspace = match($0,"[ \t]")	# position of first blank or tab, or 0
    kbrace = index($0,"{")	# position of first open brace, or 0

    # Only squeeze when the first whitespace precedes the first brace.
    if (kbrace > kspace)
	sub(/[ \t]+{/,"{",$0)	# NB: sub(), NOT gsub(), here
}

function terminate()
{
    # Final output step: print an empty line if the global Last_Line
    # is non-empty, and then print the entry held in the global array
    # Last_Key_Value_Pair via print_entry().

    if (length(Last_Line) > 0)
	print ""

    print_entry(Last_Key_Value_Pair)
}

function the_first_page(array, pages)
{
    # Return the part of the pages value of array that precedes the
    # first "--" separator, e.g. "123" for "123--124".  If the pages
    # value is empty, the result is empty.  pages is a local array.
    split(the_pages(array),pages,"--")
    ## print "DEBUG: the_first_page() ->",pages[1]
    return (pages[1])
}

function the_last_page(array, pages)
{
    # Return the part of the pages value of array that follows the
    # first "--" separator, e.g. "124" for "123--124".  When that part
    # is missing or empty, return the placeholder "??" for an unknown
    # ending page.  pages is a local array.

    split(the_pages(array),pages,"--")
    ## print "DEBUG: the_last_page() ->",pages[2]
    return ((pages[2] == "") ? "??" : pages[2])
}

function the_entry(array)
{
    # Return the element of array stored under "__ENTRY__", or an
    # empty string if there is none.  The membership test avoids
    # creating the element as a side effect.
    if ("__ENTRY__" in array)
	return (array["__ENTRY__"])
    return ("")
}

function the_ISBN(array)
{
    # Return get_value() applied to the "ISBN" element of array, or an
    # empty string if array has no such element.
    if ("ISBN" in array)
	return (get_value("ISBN",array["ISBN"]))
    return ("")
}

function the_ISSN(array)
{
    # Return get_value() applied to the "ISSN" element of array, or an
    # empty string if array has no such element.
    if ("ISSN" in array)
	return (get_value("ISSN",array["ISSN"]))
    return ("")
}

function the_month(array)
{
    # Return get_value() applied to the "month" element of array, or
    # an empty string if array has no such element.
    if ("month" in array)
	return (get_value("month",array["month"]))
    return ("")
}

function the_number(array)
{
    # Return get_value() applied to the "number" element of array, or
    # an empty string if array has no such element.
    if ("number" in array)
	return (get_value("number",array["number"]))
    return ("")
}

function the_pages(array)
{
    # Return get_value() applied to the "pages" element of array, or
    # an empty string if array has no such element.
    if ("pages" in array)
	return (get_value("pages",array["pages"]))
    return ("")
}

function the_volume(array)
{
    # Return get_value() applied to the "volume" element of array, or
    # an empty string if array has no such element.
    if ("volume" in array)
	return (get_value("volume",array["volume"]))
    return ("")
}

function the_year(array)
{
    # Return get_value() applied to the "year" element of array, or an
    # empty string if array has no such element.
    if ("year" in array)
	return (get_value("year",array["year"]))
    return ("")
}

function trim()
{
    # Remove trailing blanks and tabs from the current input record.
    sub(/[ \t]+$/,"",$0)
}

function warning(msg)
{
    # Hand msg to message(), prefixed with the two characters "%%".
    msg = "%%" msg
    message(msg)
}
'
# Run the awk program held in $PROGRAM over the input files, passing
# the option values to it as awk variables with -v.
#
# Use GNU gawk instead of nawk: Sun Solaris 2.x nawk often complains
# "input record too long".
#
# NOTE(review): $FILES is left unquoted, so the shell splits it into
# words and expands wildcards in it.  Presumably it holds a list of
# file names collected from the command line -- confirm before
# changing the quoting, since file names with spaces will not survive.
gawk \
	-v AUTHOR="$AUTHOR" \
	-v CHECKMISSING="$CHECKMISSING" \
	-v COPYRIGHT="$COPYRIGHT" \
	-v IGNORECHARACTERS="$IGNORECHARACTERS" \
	-v KEEPDUPLICATEVALUES="$KEEPDUPLICATEVALUES" \
	-v VERSION="$VERSION" \
	"$PROGRAM" \
	$FILES
################################[The End]###############################
