# /u/sy/beebe/tex/bibcheck/bibcheck.awk, Tue Aug 16 10:55:58 1994
# Edit by Nelson H. F. Beebe <beebe@sunrise>
### ====================================================================
###  @Awk-file{
###     author          = "Nelson H. F. Beebe",
###     version         = "0.07",
###     date            = "25 November 1994",
###     time            = "00:16:31 MST",
###     filename        = "bibcheck.awk",
###     address         = "Center for Scientific Computing
###                        Department of Mathematics
###                        University of Utah
###                        Salt Lake City, UT 84112
###                        USA",
###     telephone       = "+1 801 581 5254",
###     FAX             = "+1 801 581 4148",
###     checksum        = "46500 378 1133 9243",
###     email           = "beebe@math.utah.edu (Internet)",
###     codetable       = "ISO/ASCII",
###     keywords        = "BibTeX, pattern matching, syntax checking",
###     supported       = "yes",
###     docstring       = "This program checks for miscellaneous
###                        formatting irregularities in one or more
###                        BibTeX files, using heuristic pattern
###                        matching to detect common problems.
###
###                        In this version, the input MUST come from
###                        biblex, or from "bibclean -no-pretty", to
###                        guarantee correct parsing of arbitrary BibTeX
###                        files, and to avoid the need for complex, and
###                        therefore, slow, input parsing in awk.
###
###                        Usage:
###                                biblex BibTeXfile(s) | nawk -f bibcheck.awk
###                                biblex <BibTeXfile | nawk -f bibcheck.awk
###
###                        lacheck(1) performs similar checks on LaTeX
###                        files, and its great utility was the
###                        inspiration for this program.
###
###                        The checksum field above contains a CRC-16
###                        checksum as the first value, followed by the
###                        equivalent of the standard UNIX wc (word
###                        count) utility output of lines, words, and
###                        characters.  This is produced by Robert
###                        Solovay's checksum utility.",
###  }
### ====================================================================

# Input records are split on tabs only, so blanks inside a token's text
# stay within a single field.
BEGIN { FS = "\t" }

# A line-number directive has the form
#	# line <nnn> "<filename>"
# The pattern is anchored at the start of the record (optional leading
# white space allowed) so that an ordinary token whose text merely
# contains "# line <nnn>" is not mistaken for a directive and then
# discarded by `next'.  do_line_number() takes the line number from the
# third blank-separated word, which is only right for such records.
/^[ \t]*# line [0-9]+/	{ do_line_number() ; next }

# biblex output contains 3 fields: <nnn><tab><type><tab><value>
# Our heuristics need to be applied only to certain field types,
# which speeds processing.

# Remember which kind of object is being read; the VALUE and ABBREV
# rules below depend on it.
$2 == "PREAMBLE"	{ type = "PREAMBLE" }

$2 == "STRING"		{ type = "STRING" }

$2 == "ENTRY"		{ type = "ENTRY" }

$2 == "SPACE"		{ next } # ignore whitespace

$2 == "NEWLINE"		{
			    # if (tokens == 0) blank_line();
			    tokens = 0
			    next
			}

# Values are checked everywhere except inside a PREAMBLE.
($2 == "VALUE") && (type != "PREAMBLE")	{ do_value($3) }

# An ABBREV token is recorded as a definition only inside a STRING.
($2 == "ABBREV") && (type == "STRING") { do_string($3) }

$2 == "KEY"		{ do_label($3) }

$2 == "AT"		{ begin_entry() }

$2 == "RBRACE"		{ end_entry() }

$2 == "FIELD"		{ do_keyword() }

# Every record that was not skipped by `next' above ends here.
			{ end_blank() }

END			{}

# Record that field NAME was seen on the current input line, warn when
# it was already seen on a different line since the keywords array was
# last cleared, and remember it in last_keyword.
#
# keywords[name] holds the line number(s) at which NAME was seen, as a
# ", "-separated list.  do_keyword() registers every field and then
# author(), editor() and pages() register the same name a second time
# for the same token.  A repeat for a line that is already the most
# recent one recorded is therefore ignored; otherwise a genuine
# duplicate would be reported twice and its line number stored twice.
function add_keyword(name)
{
    if (!(name in keywords))
	keywords[name] = LINE_NUMBER
    else if (keywords[name] !~ ("(^|, )" LINE_NUMBER "$"))
    {
	# Last recorded line differs from the current one: real duplicate.
	warning("duplicate keyword seen at line(s) " keywords[name])
	keywords[name] = keywords[name] ", " LINE_NUMBER
    }
    last_keyword = name
}

# Report an ampersand that is not preceded by a backslash, unless VALUE
# starts (after optional blanks and tabs) with a percent sign.
function ampersand(value)
{
    # ampersands are okay in comments, e.g. AT&T
    if (value ~ /^[ \t]*%/)
    {
	end_blank()
	return
    }
    warning("unbackslashed ampersand")
    end_blank()
}

# Handle an "author" field: register the keyword, then run the checks
# shared with "editor".
function author()
{
    add_keyword("author")
    author_editor()
}

# Checks shared by author and editor, applied to the current $3:
#   * a single capital letter that is neither preceded nor followed by
#     a letter, digit, brace, period or quote (and is not followed by
#     an apostrophe or backslash) is reported as an initial lacking
#     its period;
#   * a comma at brace level zero is reported;
#   * unbalanced braces are reported.
#
# The text scanned is everything after the first quotation mark in $3,
# with a trailing quote-comma (and any blanks after it) removed.
#
# NOTE(review): on the only call path visible in this file
# (FIELD rule -> do_keyword() -> author()/editor()) $3 is still the
# quoted field name, not the field's value -- confirm that this is the
# text these checks are meant to see.
#
# k, level, s, c and underflow are locals.
function author_editor( k,level,s,c,underflow)
{
    if (match($3,/[^{}0-9A-Za-z.\"][A-Z][^{}0-9A-Za-z.'\\\"]/))
	warning("period missing following initial(s)")
    k = index($3,"\"")
    s = substr($3,k+1)
    sub(/\", *$/,"",s)
    level = 0
    underflow = 0
    for (k = 1; k <= length(s); ++k)
    {
	c = substr(s,k,1)
	if (c == "{")
	    level++
	else if (c == "}")
	    level--
	else if ((c == ",") && (level == 0))
	    warning("comma at brace-level zero in author/editor names")
	# Report a surplus closing brace once only; without the latch the
	# warning would be repeated for every remaining character while
	# the level stays negative.
	if ((level < 0) && !underflow)
	{
	    warning("unbalanced closing brace in author/editor")
	    underflow = 1
	}
    }
    if (level > 0)
	warning("unbalanced opening brace in author/editor")
}

# A blank followed by a period was found: report it as misplaced
# punctuation, except when the most recent keyword is "lccn".
function bad_period()
{
    if (last_keyword == "lccn")
    {
	end_blank()
	return
    }
    bad_punctuation()
    end_blank()
}

# Report a blank immediately before a punctuation character, then run
# the pending blank-line check.
function bad_punctuation()
{
    warning("space before punctuation")
    end_blank()
}

# Hook run for every AT token.  It currently has nothing to do; the
# lone statement exists only because gawk needs a non-empty function
# body.
function begin_entry()
{
    return
}

# Count one more consecutive blank line; end_blank() inspects and
# resets the counter.
function blank_line()
{
    blanks += 1
}

# Report a single braced capital letter attached to the rest of a word,
# then run the pending blank-line check.
function braced_letter()
{
    warning("old-style bracing hinders word searches")
    end_blank()
}

# Report a colon followed by an opening `` quotation whose first
# letter is a capital.
function colon_quote()
{
    warning("...: ``...'' may need protecting brace on first word inside quotation marks")
}

# Remember the current input line number in end_line_number.
# NOTE(review): nothing in the visible file calls this function.
function comment()
{
    end_line_number = LINE_NUMBER
}

# Handle a FIELD token: $3 is the quoted field name.  The name is
# unquoted, lower-cased and registered, and the fields that have
# checks of their own are handed to their handlers (each of which
# registers the name once more itself).
#
# key is a local.
function do_keyword( key)
{
    key = tolower(unquote($3))
    add_keyword(key)

    if (key == "author")
    {
	author()
	return
    }
    if (key == "editor")
    {
	editor()
	return
    }
    if (key == "pages")
	pages()
}

# Handle a KEY token: record the citation label found in $3 together
# with the line(s) on which it was seen, warn about a repeated label,
# and warn about a label containing anything other than letters,
# digits, colons and hyphens.
#
# The parameter only serves as a local; its incoming value is replaced
# by the unquoted $3.
function do_label( this_label)
{
    this_label = unquote($3)

    if (!(this_label in citation_labels))
	citation_labels[this_label] = LINE_NUMBER
    else
    {
	warning("duplicate citation label [" this_label "] at lines " \
		citation_labels[this_label] " and " LINE_NUMBER)
	citation_labels[this_label] = \
		citation_labels[this_label] ", " LINE_NUMBER
    }

    if (this_label ~ /^[A-Za-z0-9:-]+$/)
	return
    warning("non-standard citation tag")
}
# Handle a directive of the form
#	# line <nnn> "<filename>"
# by storing <nnn> in LINE_NUMBER (as a number) and the unquoted file
# name in FILE_NAME.
#
# The file name is taken as the quoted string that ends the record, so
# that a name containing blanks is kept whole; picking the fourth
# blank-separated word would cut such a name at its first blank.  When
# the record does not end in a quoted string, the fourth word is used
# as before.
#
# n, name and words are locals.
function do_line_number( n,name,words)
{
    n = split($0,words," ")
    LINE_NUMBER = 0 + words[3]
    if (match($0,/\"[^\"]*\"[ \t]*$/))
    {
	name = substr($0,RSTART,RLENGTH)
	sub(/[ \t]+$/,"",name)	# drop blanks after the closing quote
    }
    else
	name = words[4]
    FILE_NAME = unquote(name)
}

# Handle an ABBREV token inside a STRING: record the abbreviation name
# found in $3 together with the line(s) on which it was seen, and warn
# when the same name has been recorded before.
#
# The value argument is not consulted; the name is read from $3.
# this_abbrev is a local.
function do_string(value, this_abbrev)
{
    this_abbrev = unquote($3)

    if (!(this_abbrev in abbreviations))
    {
	abbreviations[this_abbrev] = LINE_NUMBER
	return
    }

    warning("duplicate abbreviation [" this_abbrev "] at lines " \
	    abbreviations[this_abbrev] " and " LINE_NUMBER)
    abbreviations[this_abbrev] = \
	    abbreviations[this_abbrev] ", " LINE_NUMBER
}

# Run every heuristic pattern over the text of a VALUE token and call
# the matching reporter.  The tests are independent of one another; a
# value may trigger several of them, in the order written here.
function do_value(value)
{
    # three consecutive periods
    if (value ~ /[.][.][.]/)
	ellipsis()

    # ampersand not preceded by a backslash
    if (value ~ /[^\\]&/)
	ampersand(value)

    # percent sign anywhere after a first character that is not one
    if (value ~ /^[^%].*%/)
	percent()

    # tilde not preceded by a backslash
    if (value ~ /[^\\]~/)
	tie()

    # backslash-t not preceded by a backslash
    if (value ~ /[^\\]\\t/)
	tab()

    # blank, backslash, quote, quote at the very end
    if (value ~ / \\\"\"$/)
	trailing_blank()

    # any of the characters $ ^ _
    if (value ~ /[$^_]/)
	math_mode()

    # quotation mark alone inside braces
    if (value ~ /{"}/)
	quotation_mark()

    # one braced capital at the start or end of a run of letters
    if (value ~ /{[A-Z]}[a-z]|[^\\A-Za-z][A-Za-z]+{[A-Z]}/)
	braced_letter()

    # blank before a period
    if (value ~ / [.]/)
	bad_period()

    # blank before other punctuation
    if (value ~ / [,?;:!]/)
	bad_punctuation()

    # lower-case letter directly followed by a capital
    if (value ~ /[a-z][A-Z]/)
	mixed_case()

    # two dotted capitals with nothing between them
    if (value ~ /[A-Z][.][A-Z][.]/)
	initials()

    # in a title only: colon, optional blanks, `` and a capital
    if ((last_keyword == "title") && (value ~ /: *``[A-Z]/))
	colon_quote()
}

# Handle an "editor" field: register the keyword, then run the checks
# shared with "author".
function editor()
{
    add_keyword("editor")
    author_editor()
}

# Report three typed periods and suggest the \ldots{} macro instead,
# then run the pending blank-line check.
function ellipsis()
{
    warning("replace ... by \\ldots{}")
    end_blank()
}

# Close a run of blank lines: warn when two or more were counted by
# blank_line(), then start counting afresh.
function end_blank()
{
    if (blanks >= 2)
	warning("multiple blank lines")
    blanks = 0
}

# Handle an RBRACE token: reset end_line_number, run the pending
# blank-line check, and forget every keyword recorded so far so that
# the next entry starts with an empty keywords array.
function end_entry()
{
    end_line_number = 0
    end_blank()
    split("",keywords)		# splitting an empty string empties the array
}

# Report two dotted capital letters written without a blank between
# them.
function initials()
{
    warning("adjacent dotted initials perhaps need a space")
}

# Scan the current $3 (outer quotes removed) for math-mode problems:
# an odd number of unbackslashed dollar signs, and unbackslashed
# underscores or carets met while the running dollar count is even.
#
# c, dollars, k, subsup_error and value are locals.
function math_mode( c,dollars,k,subsup_error,value)
{
    # Ignore bibsource and URL value strings which may have special
    # characters, such as underscore
    if ((last_keyword == "url") || (last_keyword == "bibsource"))
	return

    value = unquote($3)
    dollars = 0
    subsup_error = 0
    for (k = 1; k <= length(value); ++k)
    {
	# A character directly after a backslash is never counted.
	if (substr(value,k-1,1) == "\\")
	    continue
	c = substr(value,k,1)
	if (c == "$")
	    dollars++
	else if (((dollars % 2) == 0) && (c ~ /[_^]/))
	    subsup_error++
    }

    if ((dollars % 2) != 0)
	warning("Unclosed math mode")
    if (subsup_error > 0)
	warning("Subscript or superscript outside math mode")
}

# In a title, warn (once) when some word has a lower-case letter
# directly followed by a capital and its first letter is reached
# without passing an opening brace.  Quotation marks are removed and
# hyphens are treated as word separators before the words are examined.
#
# k, n, value and words are locals.
function mixed_case( k,n,value,words)
{
    if (last_keyword != "title")
	return			# the warning is issued for titles only

    value = $3
    gsub(/\"/,"",value)
    gsub(/-/," ",value)		# separate hyphenated words
    n = split(value,words," ")
    for (k = 1; k <= n; ++k)
    {
	if (words[k] !~ /[a-z][A-Z]/)
	    continue
	if (words[k] ~ /^[^{a-zA-Z]*[a-zA-Z]/)
	{
	    warning("mixed-case word should be braced")
	    return		# one warning however many words qualify
	}
    }
}

# Handle a "pages" field: register the keyword and report a single
# hyphen standing between two digits in the current $3.
#
# NOTE(review): on the only call path visible in this file
# (FIELD rule -> do_keyword()) $3 is still the quoted field name, not
# the page numbers -- confirm that this is the text meant to be tested.
function pages()
{
    add_keyword("pages")
    if ($3 !~ /[0-9]-[0-9]/)
	return
    page_range()
}

# Report a page range written with a single hyphen, then run the
# pending blank-line check.
function page_range()
{
    warning("hyphen found where en-dash expected")
    end_blank()
}

# Called when a value contains a percent sign after its first
# character.  Warn when some percent sign in $3 is preceded by a
# character that is neither a backslash nor another percent sign, then
# run the pending blank-line check.
function percent()
{
    if ($3 ~ /[^\\%]%/)
	warning("Non-backslashed percent")
    end_blank()
}

# Report a quotation mark standing alone inside braces, then run the
# pending blank-line check.
function quotation_mark()
{
    warning("braced quotation mark")
    end_blank()
}

# Report a horizontal tab (seen by do_value() as backslash-t in the
# token text), then run the pending blank-line check.
function tab()
{
    warning("horizontal tab")
    end_blank()
}

# Report an explicit tie (tilde) in a value.
function tie()
{
    warning("ties (tildes) normally provided by BibTeX")
}

# Report a blank at the end of a value.
function trailing_blank()
{
    warning("trailing space")
}

# Return s without its first and last characters, i.e. reduce
# "value" to value.  A string of fewer than three characters yields
# the empty string.
#
# inner is a local.
function unquote(s, inner)
{
    inner = length(s) - 2	# characters left between the two quotes
    return substr(s,2,inner)
}

# Write one diagnostic line to /dev/stderr in the form
#	<FILE_NAME>:<LINE_NUMBER>:%%:<message>:<tab>[<current $3>]
# The message and its trailing colon are left-justified in a field of
# at least 45 characters; the text of the current token ($3) follows in
# square brackets so the offending input can be seen.
function warning(message)
{
    printf("%s:%s:%%%%:%-45s\t[%s]\n", \
	FILE_NAME, LINE_NUMBER, message ":", $3) >"/dev/stderr"
}
