/***********************************************************************
 @C-file{
    author              = "Nelson H. F. Beebe",
    version             = "0.00",
    date                = "05 November 1994",
    time                = "10:52:28 MST",
    filename            = "citesub.c",
    address             = "Center for Scientific Computing
			   Department of Mathematics
			   University of Utah
			   Salt Lake City, UT 84112
			   USA",
    telephone           = "+1 801 581 5254",
    FAX                 = "+1 801 581 4148",
    checksum            = "52403 406 1302 10829",
    email               = "beebe@math.utah.edu (Internet)",
    codetable           = "ISO/ASCII",
    keywords            = "bibliography, BibTeX, citation label, LaTeX,
			   TeX",
    supported           = "yes",
    docstring           = "This program filters a list of files, or
			   standard input if none are specified, to
			   standard output, carrying out BibTeX citation
			   label substitutions specified by entries in a
			   file given by a command-line argument.  Each
			   line of that file contains a pair of citation
			   labels, surrounded by an arbitrary amount of
			   whitespace.

			   Usage:
				citesub [-v] file(s) >newfile
				citesub -f subfile [-v] file(s) >newfile
				citesub -f subfile [-v] <oldfile >newfile

			   If `-f subfile' is omitted, the substitution
			   filename is determined by replacing the
			   extension of the first named file with .sub.

			   A filename of `-' means standard input.

			   The -v option requests display of the
			   version number on stderr.

			   Old citation labels are substituted only if
			   they are preceded by a left brace, quote,
			   comma, whitespace, or beginning of line, and
			   followed by right brace, comma, quote,
			   percent, whitespace, or end of line.

			   This program was written to deal efficiently
			   with large bibliographies.  Obvious citation
			   label substitution schemes using the sed
			   stream editor, or awk, to compare each input
			   line with all old citation labels perform
			   badly, because the execution time is
			   proportional to the PRODUCT of the number of
			   labels and the number of input lines.  Also,
			   with more than about 100 labels, most UNIX
			   sed implementations exceed internal table
			   sizes, and simply refuse to run.

			   The GNU sed implementation (version 2.05) can
			   be rebuilt with larger tables, but proves
			   impossibly slow on the large bibliographies
			   of the BibNet project.

			   This implementation in C runs about 100 times
			   faster than the prototype implementation in
			   awk in citesub.awk, but requires about 1.6
			   times as much code, and much more attention
			   to detail.

			   The checksum field above contains a CRC-16
			   checksum as the first value, followed by the
			   equivalent of the standard UNIX wc (word
			   count) utility output of lines, words, and
			   characters.  This is produced by Robert
			   Solovay's checksum utility.",
 }
***********************************************************************/

#define CITESUB_VERSION		"citesub version 0.00 [05-Nov-1994]"

#include "os.h"
#include "xstdlib.h"
#include "xstring.h"
#include "xalloc.h"
#include "hash.h"

/* Nonzero if c may appear in a citation label.  The argument is reduced
   to unsigned char first, so that bytes >= 0x80 held in a (possibly
   signed) char cannot index outside the 256-entry islabelchar[] table. */
#define ISLABELCHAR(c)	islabelchar[(unsigned char)(c)]

#define LEADING_CONTEXT	 "{,\" \t"   /* what must precede a citation label */
#define TRAILING_CONTEXT "},\" \t%"  /* what must follow a citation label */

/***********************************************************************
This string defines the set of characters which can occur in citation
labels.  See Nelson H. F. Beebe, "Bibliography prettyprinting and
syntax checking", TUGboat 14(3), 222, October (1993) and TUGboat
14(4), 395--419, December (1993).
***********************************************************************/
#define LABELCHARS \
  "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789:-+/.'_"

#define MAXLINE		65534	/* must be big enough for longest
				   expected line in any input file */

#define SUBFILEEXT	".sub"

/* Character-class table: main() sets the entries for the characters in
   LABELCHARS to 1 and all others to 0 */
static int		islabelchar[256];
/* Single line buffer shared by read_substitution_file(), process_file(),
   and process_line() */
static char		line[MAXLINE+1];
/* Old-label -> new-label table, allocated and filled in by
   read_substitution_file(), and consulted by process_line() */
static HASH_TABLE	*substitution_table;

int		main ARGS((int argc_, char* argv_[]));
static void	process_file ARGS((FILE *fp_));
static void	process_line ARGS((void));
static void	read_substitution_file ARGS((FILE *fp_));
static void	version ARGS((void));


/***********************************************************************
Program entry point.

Command-line handling:
  -f subfile	names the substitution file (only the first -f pair is
		honored; any later "-f" is left in place and is treated
		as an input filename, as before)
  -v		prints the version on stderr; it is recognized anywhere
		on the command line, including after -f subfile
  -		as a filename, means standard input

If no -f option is given, the substitution filename is derived from the
FIRST remaining filename argument by replacing its extension with
SUBFILEEXT; if that name has no extension, no substitution file can be
inferred and an error is reported.

All remaining arguments are filtered to stdout in order; if there are
none (and no error occurred), stdin is filtered instead.

Exit status is EXIT_FAILURE if any file could not be opened, otherwise
EXIT_SUCCESS.
***********************************************************************/
#if STDC
int
main(int argc, char* argv[])
#else /* old K&R C */
int
main(argc,argv)
int argc;
char* argv[];
#endif /* STDC */
{
    int k;
    int n;
    int errors = 0;
    int seen_f = 0;		/* nonzero once a -f option was consumed */
    char *p;
    const char *q;
    FILE *fp;
    char *subfilename = (char*)NULL;

    /* Initialize the islabelchar[] table for fast token classification */
    for (k = 0; k < (int)(sizeof(islabelchar)/sizeof(islabelchar[0])); ++k)
	islabelchar[k] = 0;

    for (q = LABELCHARS; *q ; ++q)
	islabelchar[(unsigned char)(*q)] = 1;

    /* Scan ALL command-line arguments for options.  Consumed arguments
       are replaced by NULL so the filename loops below skip them. */

    for (k = 1; k < argc; ++k)
    {
	if (!seen_f && (stricmp(argv[k],"-f") == 0))
	{			/* accept only the first -f file pair */
	    seen_f = 1;
	    argv[k] = (char*)NULL;
	    if ((k + 1) < argc)	/* a trailing -f has no filename */
	    {
		++k;
		subfilename = (char*)argv[k];
		argv[k] = (char*)NULL;
	    }
	}
	else if (stricmp(argv[k],"-v") == 0)
	{
	    argv[k] = (char*)NULL;
	    version();
	}
    }

    if (subfilename == (char*)NULL)
    {		/* infer substitution filename from first input file name */
		/* by changing its extension ".xyz" to SUBFILEEXT */
	for (k = 1; k < argc; ++k)
	{
	    if (argv[k] != (char *)NULL)
	    {
		/* Allocate only when there is an extension to replace;
		   strlen()+sizeof() is enough even for a bare "." */
		if (strrchr(argv[k],'.') != (char*)NULL)
		{
		    subfilename = (char*)xmalloc(strlen(argv[k]) +
						 sizeof(SUBFILEEXT));
		    (void)strcpy(subfilename,argv[k]);
		    p = strrchr(subfilename,'.');
		    (void)strcpy(p,SUBFILEEXT);
		}
		break;		/* only the FIRST named file counts */
	    }
	}
    }

    /* Process the substitution file */

    fp = (subfilename == (char*)NULL) ? (FILE*)NULL :
	((strcmp(subfilename,"-") == 0) ? stdin : fopen(subfilename,"r"));
    if (fp == (FILE*)NULL)
    {
	errors++;
	(void)fprintf(stderr,"?Cannot open substitution file [%s]\n",
		      (subfilename == (char*)NULL) ?
		      "none given" : subfilename);
	subfilename = (char*)NULL;	/* suppresses input processing below */
    }
    else
    {
	read_substitution_file(fp);
	if (fp != stdin)	/* stdin may still be needed below */
	    (void)fclose(fp);
    }

    /* Process the remaining arguments, which are expected to be filenames */

    for (k = 1, n = 0; (subfilename != (char*)NULL) && (k < argc); ++k)
    {
	if (argv[k] != (char*)NULL)
	{
	    fp = (strcmp(argv[k],"-") == 0) ? stdin : fopen(argv[k],"r");
	    if (fp == (FILE*)NULL)
	    {
		errors++;
		(void)fprintf(stderr,"?Cannot open input file [%s]\n",argv[k]);
	    }
	    else
	    {
		n++;
		process_file(fp);
		if (fp != stdin)
		    (void)fclose(fp);
	    }
	}
    }

    if ((n == 0) && (errors == 0))	/* no filenames specified: use stdin */
	process_file(stdin);

    exit (errors ? EXIT_FAILURE : EXIT_SUCCESS);
    return (errors ? EXIT_FAILURE : EXIT_SUCCESS);	/* keeps compilers quiet */
}


/***********************************************************************
Filter one already-open input stream: each line is read into the shared
line[] buffer and handed to process_line(), until fgets() reports end of
file or an error.  The stream is not closed here.
***********************************************************************/
#if STDC
static void
process_file(FILE *fp)
#else
static void
process_file(fp)
FILE *fp;
#endif
{
    for (;;)
    {
	if (fgets(line,sizeof(line),fp) == (char*)NULL)
	    break;			/* end of file, or read error */
	process_line();
    }
}


/***********************************************************************
Copy the current contents of line[] to stdout, replacing each citation
label that has an entry in substitution_table by its new label.

The line is split into maximal runs of label characters (tokens) and
non-label characters.  Non-label characters are copied verbatim.  A
token is replaced only if it is found in the table AND it is preceded
by a LEADING_CONTEXT character (or starts the line) AND it is followed
by a TRAILING_CONTEXT character, a line terminator, or the end of the
buffer; otherwise the token is copied unchanged.
***********************************************************************/
static void
process_line(VOID_ARG)
{
    HASH_ENTRY *h;
    char *p;
    char save;
    char *token;
    const char *replacement;
    int prev_char;
    int next_char;

    for (p = &line[0]; *p; )
    {
	/* The unsigned char casts keep bytes >= 0x80 inside the table */
	while (*p && !ISLABELCHAR((unsigned char)*p))
	{				/* output non-label chars verbatim */
	    putchar(*p);
	    ++p;
	}
	for (token = p; *p && ISLABELCHAR((unsigned char)*p); ++p)
	    /* NO-OP */;
	if (*token)
	{
	    /* A token at the very start of the buffer has no predecessor:
	       pretend it is preceded by a valid leading-context character
	       instead of reading line[-1]. */
	    prev_char = (token == &line[0]) ? LEADING_CONTEXT[0] : token[-1];
	    next_char = *p;	/* OK if *p==NUL: strchr() matches the NUL */
	    save = *p;
	    *p = '\0';		/* temporarily terminate the token */
	    replacement = (const char*)NULL;
	    h = hash_lookup(token, substitution_table);
	    if ((h != (HASH_ENTRY*)NULL) &&
		(h->hash_key != (const char*)NULL) &&
		(strchr(LEADING_CONTEXT,prev_char) != (char*)NULL) &&
		/* fgets() keeps the line terminator, so end of line shows
		   up here as '\n' (or '\r' for CR LF files) */
		((next_char == '\n') || (next_char == '\r') ||
		 (strchr(TRAILING_CONTEXT,next_char) != (char*)NULL)))
		replacement = (const char*)(h->hash_data);
	    (void)fputs((replacement != (const char*)NULL) ?
			replacement : (const char*)token, stdout);
	    *p = save;		/* restore the character after the token */
	}
    }
}


/***********************************************************************
Load the substitution file from fp into substitution_table.

The file is read twice: a first pass counts lines so that the hash
table can be allocated about 2/3 full at most, then the stream is
rewound and each line is split into an old label and a new label
(whitespace-separated).  Pairs whose labels differ (ignoring letter
case) are installed in the table.

Lines lacking two tokens (including blank lines) are reported on
stderr and skipped.  An old label containing characters outside
LABELCHARS is reported; it can never match an input token.  A warning
is issued if no substitutions at all were found.  Failure to allocate
the table, or a failed install, terminates the program.

NOTE(review): the second pass relies on rewind(), so fp presumably
needs to be seekable -- confirm behavior when the substitution file
is a pipe.
***********************************************************************/
#if STDC
static void
read_substitution_file(FILE *fp)
#else
static void
read_substitution_file(fp)
FILE *fp;
#endif
{
    long label_count = 0L;
    long line_number = 0L;
    HASH_INT need;
    char *new_label;
    char *old_label;
    const char *p;

#define TOKEN_SEPARATOR " \t\f\r\n\b\a"	/* whitespace characters */

    /* Pass 1: count lines to get an upper bound on the number of labels */
    while (fgets(line,sizeof(line),fp) != (char*)NULL)
	line_number++;

    need = (HASH_INT)(3*line_number)/2;	/* want table about 2/3 full */

    if (need < 10)			/* line_number == 0 or 1 is possible */
	need = 10;

    substitution_table = hash_alloc(need,0);
    if (substitution_table == (HASH_TABLE*)NULL)
    {
	(void)fprintf(stderr,"Cannot allocate hash table of %ld entries\n",
		      (long)need);
	exit(EXIT_FAILURE);
    }

    rewind(fp);

    /* Pass 2: parse and install the label pairs */
    line_number = 0;
    while (fgets(line,sizeof(line),fp) != (char*)NULL)
    {
	line_number++;
	old_label = strtok(line,TOKEN_SEPARATOR);
	new_label = strtok((char*)NULL,TOKEN_SEPARATOR);

	/* A blank line yields old_label == NULL: do not dereference it */
	if (old_label != (char*)NULL)
	{
	    for (p = (const char*)old_label; *p; ++p)
	    {				/* make sure old label is legal */
		if (!ISLABELCHAR((unsigned char)*p))
		{
		    (void)fprintf(stderr,
		"Invalid citation label character `%c' in [%s] at line %ld\n",
				  (int)*p, old_label, line_number);
		    (void)fprintf(stderr,
				  "No substitutions possible for this label\n");
		    break;
		}
	    }
	}

	if ((old_label == (char*)NULL) || (new_label == (char*)NULL))
	    (void)fprintf(stderr,"Ignoring bad substitution file line: %ld\n",
			  line_number);
	else if (stricmp(old_label,new_label) != 0)
	{				/* save only when labels differ */
	    HASH_ENTRY *h;
	    /* line[] is reused for every line, so the table gets copies */
	    h = hash_install(STRDUP(old_label), STRDUP(new_label),
			     substitution_table);
	    if (h == (HASH_ENTRY*)NULL)
	    {				/* this CANNOT happen */
		(void)fprintf(stderr,"Catastrophic internal error: \
citation label table overflow at label [%s] line %ld\n",old_label,line_number);
		exit(EXIT_FAILURE);
	    }
	    else
		label_count++;
	}
    }
    if (label_count == 0)
	(void)fprintf(stderr,"Warning: no label substitutions found\n");
}


/***********************************************************************
Write the program version to stderr, followed (when the corresponding
preprocessor symbols are available at compile time) by a "Compiled by
<user@host> on date time" line.  The message is assembled as a
NULL-terminated list of string fragments that are emitted in order.
***********************************************************************/
#if STDC
static void
version(void)
#else
static void
version()
#endif
{
    const char **fragment;
    static const char *version_string[] =
    {
	CITESUB_VERSION,
	"\n",

#if defined(HOST) || defined(USER) || defined(__DATE__) || defined(__TIME__)
	"Compiled",

#if defined(USER)
	" by <", USER,

#if defined(HOST)
	"@", HOST,
#endif /* defined(HOST) */

	">",
#endif /* defined(USER) */

#if defined(__DATE__)
	" on ", __DATE__,
#endif /* defined(__DATE__) */

#if defined(__TIME__)
	" ", __TIME__,
#endif /* defined(__TIME__) */

	"\n",
#endif /* defined(HOST)||defined(USER)||defined(__DATE__)||defined(__TIME__) */

	(const char*)NULL,		/* end-of-list sentinel */
    };

    /* Walk the fragment list up to the NULL sentinel */
    for (fragment = &version_string[0];
	 *fragment != (const char*)NULL;
	 ++fragment)
	(void)fputs(*fragment, stderr);
}
