### /u/sy/beebe/tex/bib/check-bbl.awk, Fri Dec 4 07:52:31 1998
### Edit by Nelson H. F. Beebe
### ========================================================================
### Check a .bbl file for words that occur both in protecting braces,
### and entirely in lowercase, sending warnings to stdout. Since there
### is no relation between the order of entries in a .bbl file and
### entries in the .bib files from which it was made, the warnings are
### sorted by exception word, in order to facilitate recognition of
### words that may need bracing.
###
### As an added feature, this program keeps a history of protected words
### in check-bbl.wds, which it uses to augment the list of protected
### words in the current .bbl file, and which it normally writes out a
### fresh sorted copy of on completion. The optional +no-update switch
### suppresses updating of the history file, check-bbl.wds. A backup
### copy is made for safety in the event of an interruption or system
### crash.
###
### As the size of the check-bbl.wds file grew (to over 10,000 entries),
### many words were flagged for possible bracing that really should not
### be, resulting in diagnostic messages that are tedious to examine, and
### simply obscure the real errors. Such words might have started a
### sentence in a title, or simply have been braced when they should not
### have been (humans add the bracing in BibTeX files, and often do so
### incorrectly). Consequently, a second exception dictionary,
### check-bbl.ign, was developed to include words from the first field of
### check-bbl.wds which should be ignored, and thus, not be output in the
### exception list. Unlike the .wds file, which is automatically updated
### with each run of this program, the .ign file is maintained by hand,
### and only gets additions which one can be confident should never
### require bracing. There is also a built-in table, Ignore[], of common
### words that augment entries in the check-bbl.ign file.
###
### Bracing in BibTeX strings should never include following punctuation,
### and such cases are checked for, and warnings issued if necessary.
###
### Normally, this program should be run twice in order to handle cases
### of a braced word following an unbraced word.
###
### Usage:
###	nawk -f check-bbl.awk [-v WIDTH=nnn] [+short] [+no-update] bblfile[s]
###
### [14-Jul-2020] -- In main word loop, assign $k to word, strip parentheses
###		     and brackets, and trailing punctuation, and then use
###		     word instead of $k in the rest of the loop.
### [10-Dec-1999] -- Change to Lisp-style comments (#, ##, ###).
###		     Add sorting code so that we can sort the output in
###		     print_results().
###		     Add support for WIDTH option [default WIDTH=80], so
###		     that the user can control the length of lists in the
###		     short form
### [04-Dec-1998] -- When punctuation is discovered inside a braced word
###		     in write_wds_file(), do not write it to the
###		     dictionary file.
### [24-Oct-1998] -- add support for check-bbl.ign dictionary, and add
###		     new global variables Ignore_File and WDS_File to
###		     avoid repetition of the dictionary file names
### [14-Oct-1997] -- add +short option
### [13-Aug-1997] -- change to formatted output for better readability
### [06-Apr-1996] -- add +no-update switch
### [05-Apr-1996] -- add secondary sort field for filename:linenumber
### [10-Feb-1996] -- add comment about need to run this program twice
### [09-Feb-1996] -- original version
### ======================================================================

BEGIN { initialize() }

## Remember the citation tag of the current entry: the final {...} group
## on a \bibitem line.  When nothing matches, RSTART is 0 and RLENGTH is
## -1, so substr() yields an empty tag.
/^\\bibitem/ {
    match($0,/{[^{}]*}$/)
    citation_tag = substr($0,RSTART+1,RLENGTH-2)
}

## Every input line (including \bibitem lines): record braced words that
## contain an uppercase letter, and report all-lowercase occurrences of
## words already recorded as braced.
{
    ## Simplify some common markup.  A regexp literal is used so that
    ## the TeX macro \slash is matched literally; the former dynamic
    ## regexp string "\\slash " was interpreted as \s followed by
    ## "lash " and never matched the macro.
    gsub(/\\slash /, " / ")

    for (k = 1; k <= NF; ++k)   # scan words in current line
    {
        word = $k
        gsub("[][()]", "", word)        # drop brackets and parentheses
        gsub("['.,:;/]+$", "", word)    # drop trailing punctuation
        gsub("^`+", "", word)           # drop leading opening quotes

        if ((word ~ /^{[^ {}]*}$/) && (word ~ /[A-Z]/))
        {   # examine braced words with at least one uppercase letter
            name = tolower(substr(word,2,length(word)-2))
            sub("^`+", "", name)    # [08-Jul-2019] discard any leading quotes
            sub("'+$", "", name)    # [08-Jul-2019] discard any final quotes
            sub("[.]$", "", name)   # [08-Jul-2019] discard any final dot
            if (name in Ignore)
                continue
            Braced_Word[name] = word
            Braced_Entry[name] = citation_tag
            Braced_File[name] = FILENAME
        }
        else if (word in Braced_Word)
        {
            ## Braced_Word[] keys are lowercase, so only words that are
            ## entirely lowercase can reach this branch.
            if (($(k+1) == "+") && (word ~ /^[ivxlcdm]+$/))
                continue    # ignore probable Roman numbers in page values
            else if ((k > 1) && ($(k-1) == "+") && (word ~ /^[ivxlcdm]+$/))
                continue    # ignore probable Roman numbers in page values
            else if (Short == 0)
                warning(word " [" citation_tag "] : " Braced_Word[word] \
                        " [" Braced_Entry[word] "] " Braced_File[word])
            else if (word in Tags_of_word)
            {
                ## Short form: accumulate citation tags until the list
                ## would exceed the line width, then append one ellipsis.
                if ((length(Tags_of_word[word]) + length(citation_tag)) < (WIDTH - 32 - 4))
                    Tags_of_word[word] = Tags_of_word[word] " " citation_tag
                else if (!match(Tags_of_word[word],/[.][.][.]$/))
                    Tags_of_word[word] = Tags_of_word[word] " " "..."
            }
            else
                Tags_of_word[word] = citation_tag
        }
    }
}

END {
    if (Short)
        print_results()
    if (Update)
        write_wds_file()
}

# ======================================================================

function do_options(    k)
{
    ## Process the +short and +no-update switches in ARGV[], setting
    ## the globals Short and Update.  Recognized switches are replaced
    ## by empty strings so that awk does not treat them as input files;
    ## all other arguments are counted in File_Count.

    Short = 0
    Update = 1
    File_Count = 0
    for (k = 1; k < ARGC; k++)  # collect +switches
    {
        if (ARGV[k] == "+no-update")
        {
            Update = 0
            ARGV[k] = ""
        }
        else if (ARGV[k] == "+short")
        {
            Short = 1
            ARGV[k] = ""
        }
        else
            File_Count++
    }
}

function initialize()
{
    ## Set up global state: output width, dictionary file names,
    ## command-line switches, the Ignore[] table, and the history of
    ## braced words from previous runs.

    if (WIDTH == "")    # no -v WIDTH=nnn supplied: use documented default
        WIDTH = 80
    WIDTH += 0          # coerce to a number
    if (WIDTH < 50)     # enforce a reasonable lowerbound
        WIDTH = 50

    Ignore_File = "check-bbl.ign"
    WDS_File = "check-bbl.wds"

    do_options()
    make_ignore_list()  # must precede read_wds_file(), which consults Ignore[]
    read_wds_file()
}

function less(a,b)
{
    ## This is the user-defined comparison function used in quicksort()
    ## (though actually called from partition()).
    ## Our lists have ellipses, which we want to sort high, so they
    ## remain at the ends of the lists.
    ##
    ## Returns 1 if a sorts before b (ignoring lettercase), else 0.

    if (a == "...")
        return 0
    else if (b == "...")
        return 1
    else
        return (tolower(a) < tolower(b))
}

function make_ignore_list(    k,n,words)
{
    ## This list of words to be ignored was constructed from the output
    ## of this program on several large bibliographies, after fixing
    ## all legitimate errors.  Use of this list helps to reduce the
    ## number of false matches in the output.

    n = split("3a a administration ai algorithm also an approach as " \
              "askfor assembler assembly at b basic beam bit brief calls " \
              "character color command commands committee composition " \
              "computer concept core cost description design device " \
              "digital display do document dose drawing driver dvi e em " \
              "essentials everything extract family font formal format " \
              "formulae free gamma graphic hyphenation iazyk identify " \
              "independent inputting interactive introduction is its k " \
              "kernel keyboard language languages laser learned library " \
              "look low macros manipulation manual map mark matching math " \
              "may mode modern news o of on open part planning prepare " \
              "printer proceedings release run s scheme scientific scripts " \
              "seismic series simple simulation slip software some " \
              "standard standards string structured summary superb support " \
              "system systems tables technical test text the third ti top " \
              "torture type typefaces typesetting typing use using v " \
              "vector version window word x", words, " ")
    for (k = 1; k <= n; ++k)
        Ignore[words[k]] = 1

    ## The Ignore_File contains a manually-maintained blank-free list of
    ## words, one per line, which should never be entered into the
    ## Braced_Word[] list.  This list simply augments words from the
    ## builtin Ignore[] list.  A missing file makes getline return -1,
    ## which simply ends the loop.

    while ((getline < Ignore_File) > 0)
        Ignore[$1] = 1
    close (Ignore_File)
}

function partition(array,left,right,    i,j,swap,v)
{
    ## This is a helper function for quicksort().  It rearranges
    ## array[left..right] around the pivot array[right], using less()
    ## for comparisons, and returns the final index of the pivot.

    i = left - 1
    j = right
    v = array[right]
    for (;;)
    {
        while (less(array[++i],v))
            ;
        while (less(v,array[--j]))
        {
            if (j == left)      # do not scan below the subarray
                break
        }
        if (i >= j)
            break
        swap = array[i]
        array[i] = array[j]
        array[j] = swap
    }
    swap = array[i]
    array[i] = array[right]
    array[right] = swap
    return (i)
}

function print_results(    sortpipe,word)
{
    ## Short-form report: one line per flagged word, followed by the
    ## sorted list of citation tags in which it occurred.  The lines
    ## are piped through sort so that they appear ordered by word.

    sortpipe = "sort"
    for (word in Tags_of_word)
        printf("%-31s\t%s\n", word, sort_list(Tags_of_word[word])) | sortpipe
    close(sortpipe)
}

function quicksort(array,left,right,    i)
{
    ## The code in partition() and quicksort() is a direct translation
    ## of the simple quicksort algorithm given in Robert Sedgewick's
    ## ``Algorithms in C'', 3rd edition, Addison-Wesley, 1998,
    ## pp. 305--307.  We need an O(N lg N) algorithm here instead of a
    ## simpler O(N^2) algorithm because the font list has thousands of
    ## entries.  There are many things that one can do to tweak
    ## quicksort() to make its worst-case behavior of O(N^2) unlikely,
    ## and to improve its performance on small sequences by switching
    ## to other sorting algorithms.  However, we do not attempt any of
    ## those refinements here.
    ##
    ## The user-defined less(a,b) function conceals the details of how
    ## array items are compared.

    if (right <= left)
        return
    i = partition(array,left,right)
    quicksort(array, left, i - 1)
    quicksort(array, i + 1, right)
}

function read_wds_file(    name)
{
    ## The WDS_File file contains a list of braced words and
    ## citation labels, one pair per line, created on previous runs of
    ## this program.  Read in and save the complete list.
    ##
    ## Each line holds: braced-word, citation-tag, file-name.  Words
    ## now present in Ignore[] are dropped from the history.

    ## Make sure that the file exists, so that getline does not fail.
    system("if test ! -f " WDS_File " ; then touch " WDS_File " ; fi")

    while ((getline < WDS_File) > 0)
    {
        if ($1 ~ /^{[^{}]*}$/)
        {
            name = tolower(substr($1,2,length($1)-2))
            if (name in Ignore)
                continue
            Braced_Word[name] = $1
            Braced_Entry[name] = $2
            Braced_File[name] = $3
        }
    }
    close(WDS_File)
}

function sort_list(list,    n,parts)
{
    ## Sort entries in a space-separated list into ascending order,
    ## and return the result.

    n = split(list,parts," ")
    quicksort(parts,1,n)
    return (table_to_list(parts,n))
}

function table_to_list(table,n,    k,list)
{
    ## Convert a table indexed by 1..n to a space-separated list, and
    ## return that list.

    list = ""
    for (k = 1; k <= n; ++k)
        list = list " " table[k]
    return (substr(list,2))     # discard leading space, if any
}

function warning(s)
{
    ## Emit one long-form diagnostic, tagged with the current input
    ## file name and line number, through a sort pipeline that awk
    ## closes (and thereby flushes) at exit.
    ##
    ## The keys "-k2,2 -k1,1" are the POSIX spelling of the obsolete
    ## "+1 -2 +0 -1", which current sort implementations reject.

    printf("%s:%06d:%s\n", FILENAME, FNR, s) | "sort -f -k2,2 -k1,1"
}

function write_wds_file(    name)
{
    ## Rewrite the history file WDS_File from Braced_Word[], after
    ## saving the previous copy as check-bbl.sav.  Words with trailing
    ## punctuation inside the braces are reported on stderr and are not
    ## written.  Finally, replace the file by a sorted, duplicate-free
    ## copy, provided that the sort produced non-empty output.

    system("if test -s " WDS_File " ; then /bin/mv " WDS_File " check-bbl.sav ; fi")

    for (name in Braced_Word)
    {
        if (Braced_Word[name] ~ "[-,:/;]}")
        {
            print Braced_File[name] " " Braced_Entry[name] \
                " Trailing punctuation inside braced word: " Braced_Word[name] >"/dev/stderr"
            delete Braced_Word[name]
        }
        else
            printf("%s\t%s\t%s\n", Braced_Word[name], \
                   Braced_Entry[name], Braced_File[name]) > WDS_File
    }
    close (WDS_File)

    ## sort is found via PATH (as in print_results()), and "-k1,1" is
    ## the POSIX spelling of the obsolete "+0 -1" key.
    ## NOTE(review): the fixed name /tmp/check-bbl.tmp is shared by all
    ## concurrent runs and users -- consider a per-run temporary file.
    system("sort -u -f -k1,1 " WDS_File " >/tmp/check-bbl.tmp")
    system("if test -s /tmp/check-bbl.tmp ; then /bin/mv /tmp/check-bbl.tmp " WDS_File " ; fi")
}