	/* biblex - lexical analyzer for BibTeX files according to prototype grammar */
%{
 /**********************************************************************
 @Lex-file{
    author              = "Nelson H. F. Beebe",
    version             = "1.04",
    date                = "07 May 1999",
    time                = "07:53:04 MDT",
    filename            = "biblex.l",
    address             = "Center for Scientific Computing
                           University of Utah
                           Department of Mathematics, 322 INSCC
                           155 S 1400 E RM 233
                           Salt Lake City, UT 84112-0090
                           USA",
    telephone           = "+1 801 581 5254",
    FAX                 = "+1 801 585 1640, +1 801 581 4148",
    URL                 = "http://www.math.utah.edu/~beebe",
    checksum            = "04346 616 1389 12897",
    email               = "beebe@math.utah.edu, beebe@acm.org,
                           beebe@ieee.org (Internet)",
    codetable           = "ISO/ASCII",
    keywords            = "BibTeX, bibliography, lexical analysis, lexer",
    supported           = "yes",
    docstring           = "This lex file defines a lexical analyzer for
                           a prototype BibTeX grammar.

                           The resulting program reads one or more
                           BibTeX files specified on the command line,
                           or standard input, and produces on the
                           standard output a stream of tokens of the
                           form

                           <small-integer><tab><name><tab><quoted string>

                           This is similar to the format produced by
                           bibclean with the -no-prettyprint option.

                           Such token lines are interspersed with
                           input line identifier lines of the form
                           used by the ANSI/ISO Standard C preprocessor

                           # line 3 <quoted filename>

                           so that the error diagnostics can identify
                           the location in the input file(s).

                           The output stream can be filtered by other
                           utilities, and reconstructed into a BibTeX
                           file with an associated software tool,
                           bibunlex.

                           The checksum field above contains a CRC-16
                           checksum as the first value, followed by the
                           equivalent of the standard UNIX wc (word
                           count) utility output of lines, words, and
                           characters.  This is produced by Robert
                           Solovay's checksum utility.",
 }
 **********************************************************************/

#include <config.h>

#include <stdio.h>
#include <string.h>
#include <ctype.h>

#if defined(HAVE_STDLIB_H)
#include <stdlib.h>
#endif

#include "args.h"
#include "bibyydcl.h"

#include "token.h"

/* Forward declarations of the file-local helpers defined after the rules
   section.  ARGS(()) is supplied by a project header (presumably args.h)
   and is assumed to hide the parameter list from pre-ANSI compilers --
   TODO confirm. */
static void		compact_space ARGS((void));
static void		eof_error ARGS((void));
static int		next_char ARGS((void));
static token_t		out_braced_literal ARGS((void));
static token_t		out_braced_string ARGS((void));
static token_t		out_lbrace ARGS((void));
static token_t		out_lparen ARGS((void));
static token_t		out_protected_string ARGS((token_t t));
static token_t		out_rbrace ARGS((void));
static token_t		out_rparen ARGS((void));
static token_t		out_string ARGS((void));
static token_t		out_token ARGS((token_t t_));
static void		overflow ARGS((void));

/* Scanner state shared by the helper functions below. */

/* Depth of open entry-level delimiters; raised by out_lbrace() and
   out_lparen(), lowered by out_rbrace() and out_rparen(), and also
   adjusted while out_braced_literal() scans a literal. */
static int		brace_level = 0;
/* Nonzero: out_token() prints a line for every token.  Zero: tokens are
   only returned to the caller.  Not static, so presumably set from
   another file -- TODO confirm. */
int			do_lex_output = 1;
/* Kind of the object being scanned: out_token() sets it for COMMENT,
   INCLUDE, ENTRY, PREAMBLE and STRING tokens and resets it to
   TOKEN_UNKNOWN on every TOKEN_AT. */
static token_t		last_object = TOKEN_UNKNOWN;
/* Most recent token passed through out_token(), not counting
   TOKEN_INLINE, TOKEN_SPACE and TOKEN_NEWLINE. */
static token_t		last_token = TOKEN_UNKNOWN;
long			line_number = 1L; /* global: used in lexmain() */
/* Number of open parenthesis delimiters that out_lparen() has treated
   as object delimiters (and rewritten as braces). */
static int		paren_level = 0;
const char		*the_filename = ""; /* global: used in lexmain() */

/* Reduce a possibly negative char value to 0..255 before it is handed
   to a <ctype.h> function or printed in octal. */
#define BYTE_VAL(c)	((unsigned int)((c) & 0xff))
#define EOFILE		0		/* end-of-file from lex input() */
#define ISPRINT(c)	isprint(BYTE_VAL(c)) /* ensure arg in 0..255 */
#define ISDIGIT(c)	isdigit(BYTE_VAL(c))
#define ISSPACE(c)	isspace(BYTE_VAL(c))
#define RETURN(n)	return (1000 + (n))	/* bibparse.y biases by 1000 */

/* Replace any default YYLMAX with the project's own limit.  The overflow
   checks in the helper functions below are written against this value;
   presumably it also sizes the yytext buffer -- TODO confirm for the lex
   implementation in use. */
#undef YYLMAX

#if defined(HAVE_IBMPC)
#define YYLMAX	32760
#else
#define YYLMAX	BIBYYLMAX
#endif

%}
			/* increase transition and output table sizes */
%a		3000
%o		6000
				/* abbrev, entry, key, field name syntax */
N		[A-Za-z][-A-Za-z0-9:.+/']*
					/* BibTeX entry opening delimiter */
					/* (no rule in the rules section of */
					/* this file refers to O) */
O		[({]
					/* one white space character */
					/* \013 == \v, but lex doesn't */
					/* recognize \v */
W		[ \f\r\t\013]
					/* optional `horizontal' space */
S		{W}*
%%
[@]				RETURN (out_token(TOKEN_AT));

[Cc][Oo][Mm][Mm][Ee][Nn][Tt]	{RETURN ((last_token == TOKEN_AT) ?
					out_token(TOKEN_COMMENT) : /* directly after '@' */
					out_token(TOKEN_ABBREV)); /* anywhere else */ }

[Ii][Nn][Cc][Ll][Uu][Dd][Ee]	{RETURN ((last_token == TOKEN_AT) ?
					out_token(TOKEN_INCLUDE) : /* directly after '@' */
					out_token(TOKEN_ABBREV)); /* anywhere else */ }

[Pp][Rr][Ee][Aa][Mm][Bb][Ll][Ee] { RETURN ((last_token == TOKEN_AT) ?
					out_token(TOKEN_PREAMBLE) : /* directly after '@' */
					out_token(TOKEN_ABBREV)); /* anywhere else */ }

[Ss][Tt][Rr][Ii][Nn][Gg]	{ RETURN ((last_token == TOKEN_AT) ?
					out_token(TOKEN_STRING) : /* directly after '@' */
					out_token(TOKEN_ABBREV)); /* anywhere else */ }

{N}				{
				    /* The same name syntax serves four token
				       types; the type is chosen from what
				       was scanned before the name. */
				    /* inside an @String object every name
				       is reported as an abbreviation */
				    if (last_object == TOKEN_STRING)
					RETURN(out_token(TOKEN_ABBREV));
				    /* last_token skips space, newline and
				       inline-comment tokens (see out_token) */
				    switch (last_token)
				    {
				    case TOKEN_COMMA:	/* name after a comma */
					RETURN(out_token(TOKEN_FIELD));
				    case TOKEN_LBRACE:	/* name after opening delimiter */
					RETURN(out_token(TOKEN_KEY));
				    case TOKEN_AT:	/* name after '@' */
					RETURN(out_token(TOKEN_ENTRY));
				    default:
					RETURN(out_token(TOKEN_ABBREV));
				    }
				}

[0-9]+				RETURN (out_token(TOKEN_VALUE));

[%].*[\n]{S}			RETURN (out_token(TOKEN_INLINE));

[#]				RETURN (out_token(TOKEN_SHARP));

["]				RETURN (out_string());

[{]				RETURN (out_lbrace());

[}]				RETURN (out_rbrace());

[(]				RETURN (out_lparen());

[)]				RETURN (out_rparen());

[=]				RETURN (out_token(TOKEN_EQUALS));

[,]				RETURN (out_token(TOKEN_COMMA));

[\n]				RETURN (out_token(TOKEN_NEWLINE));

{W}+				RETURN (out_token(TOKEN_SPACE));

.				RETURN (out_token(TOKEN_LITERAL));

%%
/*
 * Rewrite yytext in place so that every run of white space characters
 * (as classified by ISSPACE, which includes newlines) is replaced by a
 * single blank.  All other characters are kept unchanged.
 */
static void
compact_space(VOID)
{
    char *src = (char*)&yytext[0];	/* next character to examine */
    char *dst = src;			/* next position to fill */

    while (*src != '\0')
    {
	if (ISSPACE(*src))
	{
	    *dst++ = ' ';
	    /* ISSPACE('\0') is false, so this stops at the terminator */
	    do
		++src;
	    while (ISSPACE(*src));
	}
	else
	    *dst++ = *src++;
    }
    *dst = '\0';
}


/*
 * Report that the input ended inside a value string and terminate the
 * program with a failure status.  Does not return.
 */
static void
eof_error(VOID)
{
    (void)fputs("End-of-file in value string\n", stderr);
    exit(EXIT_FAILURE);
}


/*
 * Fetch the next input character through the lex input() primitive.
 * End of input is fatal (eof_error() exits); a newline advances the
 * global line_number.  Returns the character read.
 */
static int
next_char(VOID)
{
    int ch = input();

    switch (ch)
    {
    case EOFILE:			/* end-of-file as reported by lex */
    case EOF:				/* end-of-file as reported by flex */
	eof_error();
	break;
    case '\n':
	line_number++;
	break;
    default:
	break;
    }
    return (ch);
}


/*
 * Collect the body of an @Comment or @Include object into yytext and
 * emit it as one TOKEN_LITERAL of the form {...}.
 *
 * On entry the opening delimiter has already been scanned and the
 * caller has incremented brace_level (and paren_level as well when the
 * delimiter was a parenthesis).  Characters are read until either the
 * brace nesting returns to zero or, for a parenthesis-delimited object,
 * the matching right parenthesis is seen.  The closing delimiter is
 * always stored as '}'.
 *
 * When the object is closed by a right parenthesis, the level changes
 * made by the caller are undone here.  Previously they were left in
 * place, so the '{' of the next entry was seen at brace_level 1 and
 * the whole entry was scanned as a braced string.
 *
 * Returns the value of out_token(TOKEN_LITERAL).
 */
static token_t
out_braced_literal(VOID)
{
    int c;
    int n;
    int plevel = paren_level;		/* local parenthesis nesting */
    int outer_blevel = brace_level - 1;	/* level before opening delimiter */

    for (n = 1; brace_level > 0; )
    {
	c = next_char();
	if (c == EOF)
	    break;
	if (n > (YYLMAX - 2))		/* leave room for the final NUL */
	    overflow();
	yytext[n++] = c;
	switch (c)
	{
	case '(':
	    plevel++;
	    break;
	case ')':
	    plevel--;
	    break;
	case '{':
	    brace_level++;
	    break;
	case '}':
	    brace_level--;
	    break;
	default:
	    break;
	}
	if ((paren_level > 0) && (plevel == 0))
	{				/* closed by matching ')' */
	    paren_level--;
	    /* also discards any unbalanced '{' seen inside the literal */
	    brace_level = outer_blevel;
	    break;
	}
    }
    yytext[0] = '{';
    yytext[n-1] = '}';			/* normalize ')' to '}' */
    yytext[n] = '\0';
    return (out_token(TOKEN_LITERAL));
}


/*
 * Scan a brace-delimited value string and emit it as a TOKEN_VALUE in
 * quote-delimited form: the outer { } become " ", and any double quote
 * at the outermost brace level is wrapped in braces so that it cannot
 * end the quoted string.
 *
 *	bare quote		"	->	{"}
 *	accent, one char	\"o	->	{\"o}
 *	accent, braced arg	\"{o}	->	{\"{o}}
 *
 * The braced-argument case used to copy only the '{' into the wrapper
 * without counting it; the argument's own '}' then ended the string
 * early.  The argument group is now copied whole.  Similarly, a \"
 * immediately followed by the closing brace of the string now ends the
 * string instead of hiding that brace inside the wrapper.
 *
 * The opening brace has already been matched by the caller and sits in
 * yytext[0].  Returns the value of out_token(TOKEN_VALUE).
 */
static token_t
out_braced_string(VOID)
{					/* convert braced to quoted string */
    int blevel;
    int c;
    int glevel;
    int n;

    for (blevel = 1, n = 1; (blevel > 0); )
    {
	c = next_char();
	if (c == EOF)
	    break;
	if (n > (YYLMAX - 5))		/* room for {\"x} plus final NUL */
	    overflow();
	yytext[n++] = c;
	switch (c)
	{
	case '{':
	    blevel++;
	    break;

	case '}':
	    blevel--;
	    break;

	case '"':
	    if (blevel != 1)		/* already protected by braces */
		break;
	    if (yytext[n-2] != '\\')
	    {				/* bare quote becomes {"} */
		yytext[n-1] = '{';
		yytext[n++] = '"';
		yytext[n++] = '}';
		break;
	    }
	    /* accent control sequence \" : protect it with its argument */
	    c = next_char();
	    if (c == EOF)
		break;
	    yytext[n-2] = '{';
	    yytext[n-1] = '\\';
	    yytext[n++] = '"';
	    if (c == '}')
	    {				/* no argument: brace ends the string */
		yytext[n++] = '}';	/* close the protective group */
		yytext[n++] = c;	/* replaced by '"' after the loop */
		blevel--;
		break;
	    }
	    yytext[n++] = c;
	    if (c == '{')
	    {				/* copy the balanced argument group */
		for (glevel = 1; glevel > 0; )
		{
		    c = next_char();
		    if (c == EOF)
			break;
		    if (n > (YYLMAX - 3)) /* room for '}' and final NUL */
			overflow();
		    yytext[n++] = c;
		    if (c == '{')
			glevel++;
		    else if (c == '}')
			glevel--;
		}
	    }
	    yytext[n++] = '}';		/* close the protective group */
	    break;

	default:
	    break;
	}
    }
    yytext[0] = '"';
    yytext[n-1] = '"';			/* overwrite the closing brace */
    yytext[n] = '\0';
    return (out_token(TOKEN_VALUE));
}


/*
 * Handle a left brace.  Inside an object (brace_level nonzero) it opens
 * a braced value string.  At the top level it opens an object: the body
 * of an @Comment or @Include is collected as a single literal, anything
 * else yields TOKEN_LBRACE.
 */
static token_t
out_lbrace(VOID)
{
    if (brace_level != 0)
	return (out_braced_string());

    brace_level++;
    if ((last_object == TOKEN_COMMENT) || (last_object == TOKEN_INCLUDE))
	return (out_braced_literal());
    return (out_token(TOKEN_LBRACE));
}


/*
 * Handle a left parenthesis.  Directly after an entry name, @Preamble
 * or @String it is an object delimiter and is reported as TOKEN_LBRACE
 * with its text rewritten to '{'.  Directly after @Comment or @Include
 * it starts a literal body.  Anywhere else it is an ordinary literal.
 */
static token_t
out_lparen(VOID)
{
    int literal_body;

    if ((last_token == TOKEN_ENTRY) ||
	(last_token == TOKEN_PREAMBLE) ||
	(last_token == TOKEN_STRING))
	literal_body = 0;
    else if ((last_token == TOKEN_COMMENT) ||
	     (last_token == TOKEN_INCLUDE))
	literal_body = 1;
    else
	return (out_token(TOKEN_LITERAL));

    /* object delimiter: normalize to a brace and record the nesting */
    yytext[0] = '{';
    paren_level++;
    brace_level++;
    if (literal_body)
	return (out_braced_literal());
    return (out_token(TOKEN_LBRACE));
}


/*
 * Write yytext through the lex output() primitive as a double-quoted
 * string followed by a newline.  Quotes and backslashes are preceded by
 * a backslash, the control characters \b \f \n \r \t \v are written as
 * their C escape sequences, and any other non-printable character is
 * written as a backslash and three octal digits.  A TOKEN_VALUE whose
 * text does not begin with a quote additionally gets an escaped quote
 * at each end.  An empty yytext produces no output at all.
 *
 * Always returns TOKEN_VALUE, whatever the value of t.
 */
#if NEW_STYLE
static token_t
out_protected_string(token_t t)
#else /* K&R style */
static token_t
out_protected_string(t)
token_t t;
#endif /* NEW_STYLE */
{
    char digits[4 + 1];			/* backslash, 3 octal digits, NUL */
    const char *p = (const char*)&yytext[0];
    int add_quotes;
    char esc;
    int k;

    if (*p == (char)'\0')		/* ignore empty tokens */
	return (TOKEN_VALUE);

    /* yytext is not modified here, so decide once for both ends */
    add_quotes = (t == TOKEN_VALUE) && (yytext[0] != '"');

    output('"');
    if (add_quotes)
    {					/* supply missing quote delimiters */
	output('\\');
	output('\"');
    }

    while (*p != (char)'\0')
    {
	/* esc is the character to print after a backslash, or NUL */
	switch (*p)
	{
	case '"':
	case '\\':
	    esc = *p;
	    break;
	case '\b':
	    esc = 'b';
	    break;
	case '\f':
	    esc = 'f';
	    break;
	case '\n':
	    esc = 'n';
	    break;
	case '\r':
	    esc = 'r';
	    break;
	case '\t':
	    esc = 't';
	    break;
	case '\v':
	    esc = 'v';
	    break;
	default:
	    esc = '\0';
	    break;
	}

	if (esc != '\0')
	{
	    output('\\');
	    output(esc);
	}
	else if (ISPRINT(*p))
	{
	    output(*p);
	}
	else
	{
	    (void)sprintf(digits,"\\%03o",BYTE_VAL(*p));
	    for (k = 0; k < 4; ++k)
	    {
		output(digits[k]);
	    }
	}
	++p;
    }

    if (add_quotes)
    {					/* supply missing quote delimiters */
	output('\\');
	output('\"');
    }
    output('"');
    output('\n');
    return (TOKEN_VALUE);
}


/*
 * Handle a right brace.  At brace_level 1 it closes the current object
 * and is reported as TOKEN_RBRACE; at any other level it is passed on
 * as an ordinary literal and the level is left alone.
 */
static token_t
out_rbrace(VOID)
{
    if (brace_level != 1)
	return (out_token(TOKEN_LITERAL));

    brace_level = 0;
    return (out_token(TOKEN_RBRACE));
}


/*
 * Handle a right parenthesis.  When it matches the parenthesis that
 * opened the current object (paren_level drops from 1 to 0), its text
 * is rewritten to '}' and it is reported as TOKEN_RBRACE.  Otherwise it
 * is an ordinary literal.
 *
 * paren_level is only decremented while it is positive.  Previously a
 * stray ')' outside any parenthesis-delimited object drove the counter
 * negative, after which the closing parenthesis of every later
 * @String(...) or @Entry(...) was no longer seen as a delimiter.
 */
static token_t
out_rparen(VOID)
{
    if (paren_level > 0)
    {
	paren_level--;
	if (paren_level == 0)
	{
	    yytext[0] = '}';
	    brace_level--;
	    return (out_token(TOKEN_RBRACE));
	}
    }
    return (out_token(TOKEN_LITERAL));
}


/*
 * Scan a quote-delimited value string.  The opening quote has already
 * been matched and sits in yytext[0]; characters are appended until a
 * double quote is found outside all braces.  That closing quote is kept
 * in the text.  Returns the value of out_token(TOKEN_VALUE).
 */
static token_t
out_string(VOID)
{
    int depth = 0;			/* brace nesting inside the string */
    int len = 1;			/* characters stored in yytext */
    int closed = 0;
    int ch;

    while (!closed)
    {
	ch = next_char();
	if (ch == EOF)
	    break;
	if (len > (YYLMAX - 2))		/* leave room for the final NUL */
	    overflow();
	yytext[len++] = ch;

	if (ch == '{')
	    depth++;
	else if (ch == '}')
	    depth--;
	else if ((ch == '"') && (depth == 0))
	    closed = 1;			/* a quote inside braces does not end it */
    }
    yytext[len] = '\0';
    return (out_token(TOKEN_VALUE));
}


/*
 * Emit token t, whose text is in yytext, and update the scanner state.
 *
 * When do_lex_output is nonzero a line of the form
 *	<number> TAB <type name> TAB <quoted text>
 * is printed; TOKEN_INLINE and TOKEN_NEWLINE are followed by a
 * "# line" line giving the new line number and the file name.
 *
 * State updates (made whether or not output is enabled):
 *   - last_object is reset on TOKEN_AT and set on COMMENT, INCLUDE,
 *     ENTRY, PREAMBLE and STRING tokens;
 *   - line_number is advanced for TOKEN_INLINE and TOKEN_NEWLINE;
 *   - last_token is set for every token except INLINE, SPACE, NEWLINE.
 *
 * Returns t.
 */
#if NEW_STYLE
static token_t
out_token(token_t t)
#else /* K&R style */
static token_t
out_token(t)
token_t t;
#endif /* NEW_STYLE */
{		/* ALL token output is directed through this function */
    int n;

    if (do_lex_output)
	(void)printf("%d\t%s\t", (int)t, type_name[(int)t]);
    switch (t)
    {
    case TOKEN_AT:
	last_object = TOKEN_UNKNOWN;
	if (do_lex_output)
	    (void)printf("\"%s\"\n", yytext);
	break;

    case TOKEN_VALUE:
	if (do_lex_output)
	{
	    if (ISDIGIT(yytext[0]))
	    {				/* supply surrounding quotes */
		n = strlen((const char*)&yytext[0]);
		if ((n + 3) > YYLMAX)	/* two quotes plus the NUL */
		    overflow();
		yytext[n+2] = '\0';
		yytext[n+1] = '"';
		for (; n > 0; --n)	/* shift the digits right by one */
		    yytext[n] = yytext[n-1];
		/* The opening quote was previously never stored, which
		   left the first digit duplicated and the number
		   without a leading quote. */
		yytext[0] = '"';
	    }
	    else
		compact_space();
	    out_protected_string(t);
	}
	break;

    case TOKEN_COMMENT:
    case TOKEN_INCLUDE:
	if (do_lex_output)
	    out_protected_string(t);
	last_object = t;
	break;

    case TOKEN_ENTRY:
    case TOKEN_PREAMBLE:
    case TOKEN_STRING:
	if (do_lex_output)
	    (void)printf("\"%s\"\n", yytext);
	last_object = t;
	break;

    case TOKEN_FIELD:
    case TOKEN_KEY:
	if (do_lex_output)
	    (void)printf("\"%s\"\n", yytext);
	break;

    case TOKEN_INLINE:
    case TOKEN_NEWLINE:
	line_number++;			/* both token types end a line */
	if (do_lex_output)
	{
	    out_protected_string(t);
	    (void)printf("# line %ld \"%s\"\n", line_number, the_filename);
	}
	break;

    case TOKEN_LITERAL:
    default:
	if (do_lex_output)
	    out_protected_string(t);
	break;
    }
    if (!((t == TOKEN_INLINE) ||
	  (t == TOKEN_SPACE) ||
	  (t == TOKEN_NEWLINE)))
	last_token = t;		/* remember last non-space token type */
    return (t);
}


/*
 * Report that a token does not fit in the YYLMAX-character buffer and
 * terminate the program with a failure status.  Does not return.
 *
 * YYLMAX is cast to long to match the %ld conversion: on at least one
 * configuration it is the int constant 32760, and passing an int where
 * printf expects a long is undefined behaviour.
 */
static void
overflow(VOID)
{
    (void)fprintf(stderr,
	    "String too long for %ld-character buffer\n",(long)YYLMAX);
    exit (EXIT_FAILURE);
}


#if defined(HAVE_IBMPC)
/*
 * End-of-input hook called by the generated scanner; compiled only when
 * HAVE_IBMPC is defined.  Always returns 1, which by lex convention
 * means there is no further input to scan.
 */
int
yywrap(VOID)
{
    return 1;
}
#endif
