#!/bin/sh
### ====================================================================
###  @UNIX-shell-file{
###     author          = "Nelson H. F. Beebe",
###     version         = "0.07",
###     date            = "25 September 1994",
###     time            = "13:12:34 MDT",
###     filename        = "bibsort.sh",
###     address         = "Center for Scientific Computing
###                        Department of Mathematics
###                        University of Utah
###                        Salt Lake City, UT 84112
###                        USA",
###     telephone       = "+1 801 581 5254",
###     FAX             = "+1 801 581 4148",
###     checksum        = "27435 515 2349 20334",
###     email           = "beebe@math.utah.edu (Internet)",
###     codetable       = "ISO/ASCII",
###     keywords        = "bibliography, sorting, BibTeX",
###     supported       = "yes",
###     docstring       = "This file contains the bibsort utility, a
###                        program for sorting BibTeX data base files by
###                        their BibTeX citation label names, or by
###                        another order determined by command-line
###                        switches, as described below.
###
###                        Usage:
###                              bibsort [-byvolume or -byyear] \
###                                      [optional sort(1) switches] \
###                                      bibfile(s) >outfile
###                        or
###                              bibsort [-byvolume or -byyear] \
###                                      [optional sort(1) switches] \
###                                      <infile >outfile
###
###                        Bibliography items are normally sorted in
###                        ascending order by citation label.  However,
###                        this sorting is modified if a -byvolume or
###                        -byyear switch, or certain other sort(1)
###                        switches, are specified.
###
###                        -byvolume and -byyear may be abbreviated to
###                        unique prefixes -byv and -byy, if desired.
###                        Switches may appear in any order; however, if
###                        both -byvolume and -byyear are specified,
###                        only the last one is used.
###
###                        The -byvolume switch is intended for use with
###                        bibliographies of single journals.  It causes
###                        entries to be sorted by journal, volume,
###                        number, page, year, and citation label, so
###                        that the entries appear in their original
###                        publication order.  The journal name is
###                        included in the sort key, so that in a
###                        bibliography with multiple journals, output
###                        entries for each journal are kept together.
###
###                        With -byvolume sorting, warnings are issued
###                        for any entry in which any of these fields
###                        are missing, and a value of the missing field
###                        is supplied that will sort higher than any
###                        printable value.
###
###                        Because -byvolume sorting is first on journal
###                        name, it is essential that there be only one
###                        form of each journal name; the best way to
###                        ensure this is to always use @String{...}
###                        abbreviations for them.  Order -byvolume is
###                        convenient for checking a bibliography
###                        against the original journal, but less
###                        convenient for a bibliography user.
###
###                        The -byyear switch causes entries to be
###                        sorted first by year, then by citation label.
###                        This is useful for keeping a bibliography in
###                        approximate chronological order, ordered by
###                        citation label within each year.
###
###                        Other command-line words beginning with a
###                        hyphen are assumed to be options to be passed
###                        to sort(1).
###
###                        All remaining command-line words are assumed
###                        to be input files.  Should such a filename
###                        begin with a hyphen, it must be disguised by
###                        a leading absolute or relative directory
###                        path, e.g. /tmp/-foo.bib or ./-foo.bib.
###
###                        The sort(1) -f (ignore letter case
###                        differences) is always supplied.  The -r
###                        switch reverses the order of the sort. The -u
###                        switch removes duplicate bibliography entries
###                        from the input stream; however, such entries
###                        must match exactly, including all white
###                        space.
###
###                        CAVEATS:
###
###                        Sorting of bibliographic entries cannot
###                        safely be done in general, because @String
###                        and @Preamble entries should come first, and
###                        in the current BibTeX, cross-referenced
###                        entries MUST come last.  This is an
###                        unfortunate, undesirable, and non-intuitive
###                        implementation limitation that I hope will be
###                        lifted in the final version of BibTeX.
###
###                        There is no simple way to detect which
###                        entries might be cross-referenced, unless
###                        cross-references to them precede them.
###                        According to btxdoc.tex, cross-references can
###                        occur
###
###                             * from @InProceedings or @Conference to
###                               @Proceedings,
###                             * from @Book, @InBook, and @InCollection
###                               to @Book, and
###                             * from @Article to @Article.
###
###                        Thus, even though we can move all
###                        @Proceedings entries to the end on the
###                        grounds that there are likely to be
###                        cross-references to them, we cannot
###                        reasonably do so for @Book and @Article
###                        entries.  A cross-referenced @Book entry
###                        needs a booktitle assignment, and that
###                        requirement is used to recognize that special
###                        case, and sort it properly.
###
###                        When an entry contains a crossref assignment,
###                        the cross-referenced citation label is saved
###                        in a list of such labels, so that subsequent
###                        entries with matching labels can be
###                        recognized as needing special handling to
###                        place them in a separate group at the end.
###
###                        We deal with these constraints by giving
###                        leading commentary, @Preamble entries, and
###                        @String entries temporary sort keys that
###                        place them before other bibliography entries,
###                        and @Proceedings entries, and entries that
###                        are cross-referenced before they are defined,
###                        temporary sort keys to place them last, so
###                        that the output order is
###
###                             (1) leading commentary,
###                             (2) @Preamble entries,
###                             (3) @String entries,
###                             (4) all other entries, except
###                                 @Proceedings entries, and explicitly
###                                 cross-referenced entries, and
###                             (5) @Proceedings entries and explicitly
###                                 cross-referenced entries.
###
###                        Since cross-references are most common
###                        between @InProceedings and @Proceedings, this
###                        heuristic will usually be correct.
###
###                        However, to be completely safe, you should
###                        only apply bibsort to a fragment of a .bib
###                        file that you know in advance can be sorted.
###
###                        Commentary BETWEEN entries will sort with the
###                        preceding entry, rather than the following
###                        one.  This is usually NOT what is desired, so
###                        the recommendation is simply to avoid
###                        commentary altogether outside of the initial
###                        commentary at the start of the file.
###
###                        WARNINGS:
###
###                        (1) This simple version does NOT recognize
###                        bib entries with outer parentheses instead of
###                        braces, or with line breaks between the @Name
###                        and following opening brace.  Use bibclean(1)
###                        to standardize and syntax check the
###                        bibliography entries first.
###
###                        (2) This program may fail on some UNIX sort
###                        implementations that cannot handle very long
###                        lines, because for sorting purposes, each
###                        complete bib entry is temporarily folded into
###                        a single line.  You may be able to overcome
###                        this problem by supplying a command-line
###                        -z nnnnn switch value to set the maximum line
###                        size to nnnnn bytes.  You must supply quotes
###                        around the -z nnnnn pair in order to prevent
###                        interpretation of nnnnn as a file name.
###
###                        (3) The UNIX sort command does not provide a
###                        stable sort: the order of records with equal
###                        sort keys is not guaranteed to be preserved.
###                        Since BibTeX raises an error for duplicate
###                        bibliography entries, this should not be a
###                        limitation.
###
###                        The checksum field above contains a CRC-16
###                        checksum as the first value, followed by the
###                        equivalent of the standard UNIX wc (word
###                        count) utility output of lines, words, and
###                        characters.  This is produced by Robert
###                        Solovay's checksum utility.",
###  }
########################################################################

# Parse the command line into the variables used by the sort pipeline:
#
#	BYVOLUME, BYYEAR	1 for the bibsort ordering that was requested
#				last on the command line, otherwise 0
#	SORTFLAGS		sort(1) key options implied by -byvolume
#	OTHERSORTFLAGS		all other switches, handed to sort(1) as is
#	FILES			input file names (empty: read standard input)
#
# The lists are plain space-separated strings that are expanded unquoted
# by the pipeline, so file names containing white space are not supported.
bibsort_parse_args()
{
	# Assign default initial values
	BYVOLUME=0
	BYYEAR=0
	FILES=
	OTHERSORTFLAGS=
	SORTFLAGS=

	# Loop over the arguments, collecting bibsort switches,
	# sort(1) switches, and file names.
	while [ $# -gt 0 ]
	do
		case $1 in
		-byy*)
			BYYEAR=1
			BYVOLUME=0
			SORTFLAGS=
			;;
		-byv*)
			BYVOLUME=1
			BYYEAR=0
			# key = <group><C-k><journal><C-k><volume><C-k><number><C-k><first-page><C-k><last-page><C-k><citation-label>
			# The C-k field separator is produced by printf so that
			# no raw control character has to live in this file, and
			# the keys use the POSIX -k form because the obsolete
			# "+pos -pos" form is rejected by current sort(1)
			# implementations.  C-k is not an IFS character, so the
			# unquoted expansion of SORTFLAGS keeps "-t<C-k>" as a
			# single word.
			SORTFLAGS="-t$(printf '\013') -k1,1 -k2,2 -k3,3n -k4,4n -k5,5n -k6,6n -k7,7"
			;;
		-*)			# all other switches are passed to sort
			OTHERSORTFLAGS="$OTHERSORTFLAGS $1"
			;;
		*)			# everything else is assumed to be a filename
			FILES="$FILES $1"
			;;
		esac
		shift			# discard this switch
	done
}

# ${1+"$@"} rather than "$@": some old shells expand "$@" to one empty
# argument when there are no positional parameters.
bibsort_parse_args ${1+"$@"}

# We store the awk program as a (large) string constant
PROGRAM='BEGIN {
	# Group prefixes: their byte values fix the order of the output
	# sections (header, @Preamble, @String, ordinary entries, and
	# finally @Proceedings plus other cross-referenced entries).
	prefix_header		= "\001"
	prefix_preamble		= "\002"
	prefix_string		= "\003"
	prefix_normal_entry	= "\004"
	prefix_cross_referenced	= "\177"

	# Framing characters that the later pipeline stages translate
	# and finally delete.
	sort_prefix		= "\005"	# starts each sort-key line
	hidden_newline		= "\006"	# terminates sort-key lines
	visible_newline		= "\007"	# line break inside a folded entry
	unknown_value		= "\377" 	# such entries sort last
	volume_key_separator	= "\013"	# C-k character

	sort_key		= sort_prefix "%%SORTKEY:"

	# Everything before the first entry belongs to the header record.
	print sort_key prefix_header hidden_newline
}

# "@Preamble{...": the key is the text after the opening brace.
/^[ \t]*@[ \t]*[Pp][Rr][Ee][Aa][Mm][Bb][Ll][Ee][ \t]*{/ {
	trim()
	squeeze()
	k = index($0,"{") + 1
	print sort_key prefix_preamble substr($0,k) hidden_newline
	print_braced_item()
	last_line = $0
	next
}

# "@String{name = ...": the key is the text between "{" and "=".
/^[ \t]*@[ \t]*[sS][tT][rR][iI][nN][gG][ \t]*{/ {
	trim()
	squeeze()
	k = index($0,"{") + 1
	m = index($0,"=")
	print sort_key prefix_string substr($0,k,m-k) hidden_newline
	print_braced_item()
	last_line = $0
	next
}

# "@Proceedings{label,": always placed in the final group.
/^[ \t]*@[ \t]*[Pp][Rr][Oo][Cc][Ee][Ee][Dd][Ii][Nn][Gg][Ss][ \t]*{/ {
	item = collect_braced_item()
	# The label must be taken from the collected item: by now $0
	# holds the LAST line of the entry, not its first.
	k = index(item,"{") + 1
	m = index(item,",")
	print_item(prefix_cross_referenced,substr(item,k,m-k),item)
	last_line = $0
	next
}

/^[ \t]*@[ \t]*[Bb][Oo][Oo][Kk][ \t]*{/ {
	# Need to do lookahead to find booktitle to decide whether to
	# sort like @Proceedings or @Article.  A cross-referenced @Book
	# must contain a booktitle assignment, which means that it
	# must be moved to the @Proceedings section of the .bib file.
	item = collect_braced_item()
	k = index(item,"{") + 1
	m = index(item,",")
	citation_key = substr(item,k,m-k)
	if (match(item,/[Bb][Oo][Oo][Kk][Tt][Ii][Tt][Ll][Ee] *=/))
	    prefix = prefix_cross_referenced	# sort like @Proceedings
	else
	    prefix = prefix_normal_entry # sort like @Article
	print_item(prefix,citation_key,item)
	last_line = $0
	next
}

# "@keyword{label,"
/^[ \t]*@[ \t]*[a-zA-Z0-9]*[ \t]*{/       {
	item = collect_braced_item()
	k = index(item,"{") + 1
	m = index(item,",")
	print_item(prefix_normal_entry,substr(item,k,m-k),item)
	last_line = $0
	next
}

{				# all other line types match this
	# Such lines are copied through and therefore stay attached to
	# whatever record was started most recently.
	trim()
	print
	last_line = $0
}

END {
	# last_line is the final input line consumed by any rule above.
	# Supply a separating line break only when the input did not
	# already end with a blank line (or was empty); otherwise each
	# run would append one more blank line to the output.
	if (last_line !~ /^[ \t]*$/)
	    print hidden_newline
	# Closing record terminator; printed through a format so that
	# the data is never interpreted as a printf format.
	printf("%s", sort_prefix)
}

function brace_count(s, k,n)
{
    # Return the net brace depth of string s: each "{" adds one and
    # each "}" subtracts one.  k and n are local work variables.
    n = 0
    k = length(s)
    while (k > 0)
    {
	n += (substr(s,k,1) == "{") - (substr(s,k,1) == "}")
	k--
    }
    return n
}

function collect_braced_item( count,item)
{
    # Gather one complete entry, beginning with the current record,
    # and return it as a single string.  The first line is followed
    # by a real newline; every further line is trimmed and followed
    # by visible_newline.  Reading stops when the running brace count
    # returns to zero or the input is exhausted.  A line with an
    # at-sign in column 1 inside the entry is taken as evidence of
    # unbalanced braces and aborts the run through error(); that
    # cheap column-1 test replaces a slower regular-expression match
    # and is usually correct.  On return, $0 holds the last line read.

    squeeze()
    trim()
    item = $0 "\n"
    for (count = brace_count($0); count != 0; count += brace_count($0))
    {
	if ((getline) <= 0)
	    break
	if (substr($0,1,1) == "@")
	    error("New entry encountered before balanced braces found")
	trim()
	item = item $0 visible_newline
    }
    return item
}

function error(message)
{
    # Fatal diagnostic: report the message through warning(), then
    # stop reading input and finish with exit status 1.  As with any
    # awk exit outside of END, the END rule is still executed.
    warning(message)
    exit 1
}

function print_braced_item(count)
{
    # Copy one braced item to the output, beginning with the current
    # record: the first line is printed normally, each further line
    # is followed by visible_newline, and copying ends when the brace
    # count reaches zero or the input runs out.  Used for
    # @Preamble{...} and @String{...}, which need no other
    # processing.  count is a local work variable.

    print $0
    for (count = brace_count($0); count != 0; count += brace_count($0))
    {
	if ((getline) <= 0)
	    break
	printf("%s%s",$0,visible_newline)
    }
}

function print_item(prefix,citation_key,item, extra_key,v)
{
    # Emit one bibliography entry: first its sort-key line, then the
    # collected entry text.
    #
    #   prefix        group prefix chosen by the calling rule
    #   citation_key  citation label of the entry
    #   item          complete entry text from collect_braced_item()
    #
    # extra_key and v are local work variables.

    if (citation_key in cross_referenced_item) # change prefix if this item
	prefix = prefix_cross_referenced # was cross-referenced earlier

    # With BYYEAR, prefer a ":yyyy:" group found in the entry text and
    # fall back on the year field; with BYVOLUME, use the composite
    # key built by volume_key().
    if (BYYEAR)
	extra_key = (match(item,/:[12][0-9][0-9x][0-9x]:/)) ? \
	    substr(item,RSTART,RLENGTH) : \
	    (":" value(item,citation_key,"year[ \t]*=[ \t]*") ":")
    else if (BYVOLUME)
	extra_key = volume_key(item,citation_key)
    else
	extra_key = ""

    print sort_key prefix extra_key citation_key hidden_newline
    printf("%s", item)

    # Check for use of crossref = "citation-key": such items must be
    # sorted last, like @Proceedings.  This will only succeed if the
    # input bibliography file follows the requirement of BibTeX 0.99
    # that cross-referenced items must follow items that
    # cross-reference them.
    #
    # value() reports a missing field as unknown_value; only a real
    # label may be recorded, so compare against that sentinel.
    v = value(item,citation_key,"crossref[ \t]*=[ \t]*")
    if (v != unknown_value)
	cross_referenced_item[v] = 1
}

function squeeze( kbrace,kspace)
{
    # Normalize the start of the current record to "@name{": remove
    # white space around the leading at-sign and, when white space
    # occurs before the first open brace, the blanks in front of a
    # brace.  kbrace and kspace are local work variables.
    sub(/^[ \t]*@[ \t]*/, "@", $0)
    kspace = match($0,"[ \t]")	# position of first blank or tab, else 0
    kbrace = index($0,"{")	# position of first open brace, else 0
    if (kbrace > kspace)
	sub(/[ \t]+{/, "{", $0)	# first occurrence only: sub(), NOT gsub()
}

function trim()
{
    # Strip trailing blanks and tabs from the current input record.
    if ($0 ~ /[ \t]$/)
	sub(/[ \t]+$/, "", $0)
}

function value(item,citation_key,keyword_pattern, s,v)
{
    # Return the value assigned to a field of a bibliography entry.
    #
    #   item             complete entry text
    #   citation_key     entry label, used only in the warning text
    #   keyword_pattern  regular expression that matches the field
    #                    name, the equals sign, and surrounding space
    #
    # Recognized forms are key = "value", key = {value}, and
    # key = number-or-abbreviation followed by a comma.  When the field
    # is absent, empty, or of some other form, unknown_value is
    # returned and, except for crossref, a warning is issued.
    # s and v are local work variables.

    # s becomes the entry text that follows the matched "key = "; an
    # absent field is made explicit instead of relying on substr()
    # with a negative start position.
    if (match(item,keyword_pattern))
	s = substr(item,RSTART+RLENGTH)
    else
	s = ""

    # The delimited forms accept an EMPTY body ("" or {}), so that the
    # RLENGTH > 2 test can reject it.  Requiring one or more characters
    # would instead let the match slide forward and return text taken
    # from the following field.
    if (substr(s,1,1) == "\"")		# have key = "value"
    {
	match(s,/["][^"]*["]/)
	v = (RLENGTH > 2) ? substr(s,RSTART+1,RLENGTH-2) : unknown_value
    }
    else if (substr(s,1,1) == "{")	# have key = {value}
    {
	match(s,/[{][^}]*[}]/)
	v = (RLENGTH > 2) ? substr(s,RSTART+1,RLENGTH-2) : unknown_value
    }
    else if (substr(s,1,1) ~ /[0-9A-Za-z]/) # have key = number, or key = abbrev,
    {
	match(s,/[^,]+,/)
	v = (RLENGTH > 1) ? substr(s,RSTART,RLENGTH-1) : unknown_value
    }
    else			# field absent, or unexpected pattern
	v = unknown_value

    if ((index(v,unknown_value) > 0) &&
	(index(keyword_pattern,"crossref") == 0))
    {				# warn about missing values
	match(keyword_pattern,/[a-zA-Z]+/)
	warning("Missing " substr(keyword_pattern,RSTART,RLENGTH) \
	    " value in " citation_key)
    }

    return (v)
}

function volume_key(item,citation_key, n,parts)
{
    # Build the extra sort key used with BYVOLUME: journal, volume,
    # number, first page, and last page, each preceded by
    # volume_key_separator, with one more separator at the end.  The
    # pages value is split at "--"; when it holds no range, the single
    # value serves as both first and last page.  n and parts are
    # local work variables.
    n = split(value(item,citation_key,"pages[ \t]*=[ \t]*"),parts,"--")
    if (n <= 1)
	parts[2] = parts[1]
    return (volume_key_separator \
	value(item,citation_key,"journal[ \t]*=[ \t]*") volume_key_separator \
	value(item,citation_key,"volume[ \t]*=[ \t]*") volume_key_separator \
	value(item,citation_key,"number[ \t]*=[ \t]*") volume_key_separator \
	parts[1] volume_key_separator \
	parts[2] volume_key_separator)
}

function warning(message)
{
    # Report a diagnostic, tagged with the current input file name and
    # line number, on standard error.  Writing to the terminal device
    # instead fails when there is no controlling terminal (batch or
    # cron use) and cannot be redirected.  The message is piped
    # through cat(1) because a "/dev/stderr" file name is not
    # understood by every awk; the pipe is closed at once so that
    # each message is delivered promptly.
    print FILENAME ":" FNR ":%%" message | "cat 1>&2"
    close("cat 1>&2")
}
'

# The bibliography sorting is implemented as a filter pipeline:
#
# Stage 1 (awk) finds bib file entries and prefixes them with a line
# containing a special customized recognizable sort key, where each such
# line begins with a Ctl-E, and the file ends with Ctl-E.  The sort key
# contains unprintable characters, so as to essentially eliminate any
# possibility of confusion with bibliography data.
#
# Stage 2 (tr) turns LF into Ctl-G and Ctl-E into LF.  This hides
# line boundaries, and makes each bibliography item a separate `line'.
#
# Stage 3 (sort) sorts `lines' (i.e. bib entries), ignoring
# letter case differences.
#
# Stage 4 (tr) turns LF into Ctl-E, and Ctl-G back into LF.  This
# restores the original line boundaries.
#
# Stage 5 (tr) deletes all Ctl-E and Ctl-F characters.
#
# Stage 6 (grep) removes the sort key lines.

# The sort keys are built from raw control bytes (\001 ... \177, \377)
# whose relative order only holds for byte-wise collation; many locales
# ignore or reorder such bytes, so force the C locale for all stages.
LC_ALL=C
export LC_ALL

# The program needs the "new awk" language (functions, getline, match).
# Use nawk where it is installed, otherwise the system awk.
if command -v nawk >/dev/null 2>&1
then
	AWK=nawk
else
	AWK=awk
fi

# Finally, here is the pipeline that does all of the work.  FILES,
# SORTFLAGS, and OTHERSORTFLAGS are deliberately left unquoted: each is
# a space-separated list that must be split into separate words.  A
# plain grep suffices for the last stage (egrep is obsolescent), since
# the pattern uses no extended regular expression syntax.

"$AWK" "$PROGRAM" BYVOLUME=$BYVOLUME BYYEAR=$BYYEAR $FILES | \
        tr '\012\005' '\007\012' | \
        sort -f $SORTFLAGS $OTHERSORTFLAGS | \
        tr '\007\012' '\012\005' | \
        tr -d '\005\006' | \
        grep -v '^%%SORTKEY:'
################################[The End]###############################
