#ifndef lint static char rcsId[]="$Header: /cvs/gnome/gnome-libs/gtk-xmhtml/Parser.c,v 1.2 1997/12/18 00:39:19 unammx Exp $"; #endif /***** * Parser.c : htmlParserObjectClass routines * * This file Version $Revision: 1.2 $ * * Creation date: Sun Apr 13 00:58:49 GMT+0100 1997 * Last modification: $Date: 1997/12/18 00:39:19 $ * By: $Author: unammx $ * Current State: $State: Exp $ * * Author: newt * * Copyright (C) 1994-1997 by Ripley Software Development * All Rights Reserved * * This file is part of the XmHTML TWidget Library * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the Free * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * *****/ /***** * ChangeLog * $Log: Parser.c,v $ * Revision 1.2 1997/12/18 00:39:19 unammx * It compiles and links -miguel * * Revision 1.1 1997/11/28 03:38:54 gnomecvs * Work in progress port of XmHTML; No, it does not compile, don't even try -mig * * Revision 1.6 1997/10/23 00:24:36 newt * XmHTML Beta 1.1.0 release * * Revision 1.5 1997/08/31 17:31:10 newt * Removed HT_TEXTFLOW * * Revision 1.4 1997/08/30 00:26:56 newt * my_strdup -> strdup and _XmHTMLWarning changes. * * Revision 1.3 1997/08/01 12:53:09 newt * Minor bugfixes in HTML rules + state stack backtracking added. * * Revision 1.2 1997/05/28 01:30:58 newt * Fixes in HTML comment parsing. Modified the parser to properly pick up the * contents of the SCRIPT and STYLE head elements. 
*
* Revision 1.1  1997/04/29 14:19:21  newt
* Initial Revision
*
*****/
/*
* NOTE(review): the header names of the five #include directives below are
* missing in this copy of the file; restore them from the upstream source.
*/
#include
#include
#include
#include
#include	/* parser termination */

/* our private header file */
#include "ParserP.h"
#include "XmHTMLfuncs.h"

/*** External Function Prototype Declarations ***/

/*** Public Variable Declarations ***/

/*** Private Datatype Declarations ****/

/*** Private Function Prototype Declarations ****/

/** Useful macros **/

/*
* ATTR(ID) expands to parser->parser.ID, so it can only be used where a
* variable named ``parser'' is in scope.
*/
#define ATTR(ID)	parser->parser.ID

/*
* PARSER expands to the declaration of the ``parser'' argument used by the
* routines below. Its type depends on the toolkit selected with WITH_MOTIF.
*/
#ifdef WITH_MOTIF
# define PARSER	XmHTMLParserObject parser
#else
# define PARSER	GtkXmHTMLParser *parser
#endif

/* push id on state stack */
static void pushState(PARSER, htmlEnum id);

/* pop id from state stack */
static htmlEnum popState(PARSER);

/* is id on the stack? */
static Boolean onStack(PARSER, htmlEnum id);

/* clear and reset the stack */
static void clearStack(PARSER);

/* convert token to an internal id using an XmHTMLAliasTable */
static htmlEnum tokenToAlias(PARSER, String token);

/* convert token to an internal id */
static htmlEnum tokenToId(PARSER, String token, Boolean warn);

/* see if id is a terminated HTML token or not */
static Boolean getTerminatorState(htmlEnum id);

/* see if id may appear inside <BODY> */
static Boolean isBodyElement(htmlEnum id);

/* verify occurrence of current in parser state state */
static int checkOccurance(PARSER, htmlEnum current, htmlEnum state);

/* verify if current may appear as content of parser state state */
static Boolean checkContent(htmlEnum current, htmlEnum state);

/* create a new object */
static XmHTMLObject *newElement(PARSER, htmlEnum id, char *element,
	char *attributes, Boolean is_end, Boolean terminated);

/* create and insert a new element object */
static void insertElement(PARSER, String element, htmlEnum new_id,
	Boolean is_end);

/* create and store a text object */
static void storeTextElement(PARSER, char *start, char *end);

/* create and store an element object */
static String storeElement(PARSER, char *start, char *end);

/* verify presence of id, interactive checking */
static int verifyElement(PARSER, htmlEnum id, Boolean is_end);

/* verify presence of id, automatic checking */
static int verifyDefault(PARSER, htmlEnum id, Boolean is_end);

/* verify the verification and repair of the current document */
static XmHTMLObject *verifyVerification(PARSER);

/* create a valid parser tree for viewing image image_file */
static int parseIMAGE(PARSER, char *image_file);

/* parse (also progressive) using PARSER */
static int parseHTML(PARSER);

/* remove comments from given text. */
static String cutComment(PARSER, String start);

/* main parser driver routine */
static Boolean parserDriver(PARSER, String source);

/* perform parser wrapup */
static Boolean parserEndSource(PARSER);

/* create a text block for the given id */
static void makeTextBlockFromId(XmHTMLTextBlock block, htmlEnum id,
	Boolean is_end);

/* make a text block from current position in the current source text */
static void makeTextBlockFromInput(PARSER, XmHTMLTextBlock block);

/* XmNmodifyVerifyCallback driver */
static void modifyCallback(PARSER, Byte action, htmlEnum id, Boolean is_end);

/* XmNdocumentCallback driver */
static Boolean documentCallback(PARSER, Boolean verified);

/* XmNparserCallback driver */
static int parserCallback(PARSER, htmlEnum id, htmlEnum current,
	htmlEnum new_id, parserError error, Boolean is_end);

/* alphabetically sort the given alias table */
static void sortAliasTable(XmHTMLAliasTable table, int nalias);

/* add an alias to the given alias table */
static XmHTMLAliasTable addAliasToTable(PARSER, XmHTMLAliasTable table,
	int *num, String element, htmlEnum alias);

/* remove an alias from the given alias table */
static XmHTMLAliasTable removeAliasFromTable(PARSER, XmHTMLAliasTable table,
	int *num, String element, htmlEnum alias, Boolean *error);

/* copy the given alias table to a new alias table */
static XmHTMLAliasTable copyAliasTable(XmHTMLAliasTable source, int nalias,
	int *copied);

/* elements for which a closing
counterpart is optional */
#define OPTIONAL_CLOSURE(id) ((id) == HT_DD || (id) == HT_DT || \
	(id) == HT_LI || (id) == HT_P || (id) == HT_OPTION || (id) == HT_TD || \
	(id) == HT_TH || (id) == HT_TR)

/* physical/logical markup elements */
#define IS_MARKUP(id) ((id) == HT_TT || (id) == HT_I || (id) == HT_B || \
	(id) == HT_U || (id) == HT_STRIKE || (id) == HT_BIG || (id) == HT_SMALL || \
	(id) == HT_SUB || (id) == HT_SUP || (id) == HT_EM || (id) == HT_STRONG || \
	(id) == HT_DFN || (id) == HT_CODE || (id) == HT_SAMP || (id) == HT_KBD || \
	(id) == HT_VAR || (id) == HT_CITE || (id) == HT_FONT)

/*
* paragraph, heading, preformatted, address, applet, caption and anchor
* elements. checkOccurance accepts these as a valid state for text-level
* elements, next to IS_CONTAINER and IS_MARKUP states.
*/
#define IS_MISC(id) ((id) == HT_P || (id) == HT_H1 || (id) == HT_H2 || \
	(id) == HT_H3 || (id) == HT_H4 || (id) == HT_H5 || (id) == HT_H6 || \
	(id) == HT_PRE || (id) == HT_ADDRESS || (id) == HT_APPLET || \
	(id) == HT_CAPTION || (id) == HT_A)

/* text containers */
#define IS_CONTAINER(id) ((id) == HT_BODY || (id) == HT_DIV || \
	(id) == HT_CENTER || (id) == HT_BLOCKQUOTE || (id) == HT_TD || \
	(id) == HT_FORM || (id) == HT_TH || (id) == HT_DT || (id) == HT_DD || \
	(id) == HT_LI || (id) == HT_NOFRAMES)

/* all elements that may be nested */
#define NESTED_ELEMENT(id) (IS_MARKUP(id) || (id) == HT_APPLET || \
	(id) == HT_BLOCKQUOTE || (id) == HT_DIV || (id) == HT_CENTER || \
	(id) == HT_FRAMESET)

/*
* text-level elements plus plain text (HT_ZTEXT).
* NOTE(review): no use of this macro is visible in this part of the file;
* presumably it is the default content test of checkContent -- confirm.
*/
#define DEFAULT_CONTENT(id) ((id) == HT_A || (id) == HT_APPLET || \
	(id) == HT_B || (id) == HT_BIG || (id) == HT_BR || (id) == HT_CITE || \
	(id) == HT_CODE || (id) == HT_DFN || (id) == HT_EM || (id) == HT_FONT || \
	(id) == HT_I || (id) == HT_IMG || (id) == HT_INPUT || (id) == HT_KBD || \
	(id) == HT_MAP || (id) == HT_NOFRAMES || (id) == HT_SAMP || \
	(id) == HT_SCRIPT || (id) == HT_SELECT || (id) == HT_SMALL || \
	(id) == HT_STRIKE || (id) == HT_STRONG || (id) == HT_SUB || \
	(id) == HT_SUP || (id) == HT_TEXTAREA || (id) == HT_TT || (id) == HT_U || \
	(id) == HT_VAR || (id) == HT_ZTEXT)

/*** Private Variable Declarations ***/
static Boolean parser_terminated;	/* for parser termination */
static jmp_buf
parser_jmp; /* for parser termination */ /* parseHTML return codes */ #define PARSER_END 0 /* no new source added since last call */ #define PARSER_CONTINUE 1 /* parser parsed the text successfully */ #define PARSER_ERROR 2 /* parser encountered an error */ /* no translations */ /* no actions */ #ifdef WITH_MOTIF # include "Parser-motif.c" #else # include "gtk-xmhtml-parser.c" #endif /***** * Name: Destroy * Return Type: void * Description: XmHTMLParserObjectClass destroy method. * In: * w: parser to destroy * Returns: * nothing. *****/ static void Destroy(TWidget w) { XmHTMLParserObject parser = (XmHTMLParserObject)w; /* free current source */ if(ATTR(source)) free(ATTR(source)); /* clear current parser tree */ if(ATTR(head)) _XmHTMLParserFreeObjects(ATTR(head)); /* clear open state stack */ if(ATTR(stack)) clearStack(parser); /* destroy alias table */ if(ATTR(nalias)) XmHTMLParserDestroyAliasTable(ATTR(alias_table), ATTR(nalias)); } /***************************************************************************** * Chapter 2 * Private Functions *****************************************************************************/ /*** * Section 2.1 Parser state stack routines * * This set of routines maintains the parser's internal stack of HTML block * elements. This stack defines the current parsers state, which is important * for checking both occurance and contents of newly encountered HTML elements. ***/ /***** * Name: pushState * Return Type: void * Description: pushes the given id on the state stack * In: * PARSER: current parser * id: element id to push * Returns: * nothing. *****/ static void pushState(PARSER, htmlEnum id) { stateStack *tmp; tmp = (stateStack*)malloc(sizeof(stateStack)); tmp->id = id; tmp->next = ATTR(stack); ATTR(stack) = tmp; ATTR(depth) += 1; } /***** * Name: popState * Return Type: htmlEnum * Description: pops an element of the state stack * In: * PARSER: current parser * Returns: * id of element popped. 
*****/ static htmlEnum popState(PARSER) { htmlEnum id; stateStack *tmp; if(ATTR(stack)->next != NULL) { tmp = ATTR(stack); ATTR(stack) = ATTR(stack)->next; id = tmp->id; free((char*)tmp); ATTR(depth) -= 1; } else id = ATTR(stack)->id; return(id); } /***** * Name: onStack * Return Type: Boolean * Description: checks whether the given id is somewhere on the current * state stack. * In: * PARSER: current parser * id: element id to check. * Returns: * True when present, False if not. *****/ static Boolean onStack(PARSER, htmlEnum id) { stateStack *tmp = ATTR(stack); while(tmp->next != NULL && tmp->id != id) tmp = tmp->next; return(tmp->id == id); } /***** * Name: clearStack * Return Type: void * Description: clears and resets the state stack of a parser * In: * parser: XmHTMLParserObject id * Returns: * nothing *****/ static void clearStack(PARSER) { while(ATTR(stack)->next != NULL) (void)popState(parser); ATTR(depth) = 0; ATTR(base).next = NULL; ATTR(base).id = HT_DOCTYPE; ATTR(stack) = &(ATTR(base)); } /*** * Section 2.2 Token Resolvers. * * This set of routines convert HTML element identifiers to their corresponding * internal id. All these routines use a binary search thru a list of * alphabetically sorted strings. ***/ /***** * Name: tokenToAlias * Return Type: htmlEnum * Description: searches the current parser alias table to convert an * unknown html token to a known internal id. * In: * PARSER: current parser * token: unknown token * Returns: * aliased id if found, -1 if not found. 
*****/ static htmlEnum tokenToAlias(PARSER, String token) { register int mid, lo = 0, hi = ATTR(nalias); XmHTMLAliasTable alias_table = ATTR(alias_table); int cmp; /* sanity */ if(hi == 0) return(-1); while(lo <= hi) { mid = (lo + hi)/2; if((cmp = strcmp(token, alias_table[mid].element)) == 0) return(alias_table[mid].alias); else if(cmp < 0) /* in lower end of array */ hi = mid - 1; else /* in higher end of array */ lo = mid + 1; } return(-1); } /***** * Name: tokenToId * Return Type: int * Description: converts the html token passed to an internal id. * In: * PARSER: current parser * token: token for which to fetch an internal id. * warn: if true, spits out a warning for unknown tokens. * Returns: * The internal id upon success, -1 upon failure * * Note: this routine uses a binary search into an array of all possible * HTML 3.2 tokens. It is very important that _BOTH_ the array * html_tokens _AND_ the enumeration htmlEnum are *NEVER* changed. * Both arrays are alphabetically sorted. Modifying any of these two * arrays will have VERY SERIOUS CONSEQUENCES, the return value of this * function matches a corresponding htmlEnum value. * As the table currently contains about 70 elements, a match will always * be found in at most 7 iterations (2^7 = 128) *****/ static htmlEnum tokenToId(PARSER, String token, Boolean warn) { register int mid, lo = 0, hi = HT_ZTEXT-1; int cmp; while(lo <= hi) { mid = (lo + hi)/2; if((cmp = strcmp(token, html_tokens[mid])) == 0) return(mid); else if(cmp < 0) /* in lower end of array */ hi = mid - 1; else /* in higher end of array */ lo = mid + 1; } /* * Not found, invalid token passed * We don't want always have warnings. When scanning for SGML shorttags, * this routine is used to check whether we a / is right behind a token or * not. 
*/ if(warn) { int ret_val = -1; if(ATTR(nalias)) ret_val = tokenToAlias(parser, token); if(ret_val == -1) { ret_val = parserCallback(parser, HT_ZTEXT, HT_ZTEXT, HT_ZTEXT, HTML_UNKNOWN_ELEMENT, False); if(ret_val == HTML_REMOVE) return(-1); else /* * unknown element was aliased to a known one. Recheck this * element. Parser aliases are only supported for the standalone * parsers, so we assume the aliased id is a valid one. If it * isn't too bad then, its the programmers responsibility to set * up a correct parser alias table. */ return(tokenToAlias(parser, token)); } return(ret_val); } return(-1); } /*** * Section 2.3 Element property routines. * * These are probably the most important routines used by the parser. They * define the ending state of an element, verify if a new element is allowed * to appear inside the current parser state and whether an element may contain * a given element. Depending on the state of the parser, the latter two * routines also suggest which element should be inserted to allow the offending * element (this basically constitutes the parser's ability to verify and * repair HTML documents). ***/ /***** * Name: getTerminatorState * Return Type: Boolean * Description: checks if the given element has a terminating counterpart * In: * id: element to check * Returns: * True when the given element is terminated, false if not. *****/ static Boolean getTerminatorState(htmlEnum id) { switch(id) { /* Elements that are never terminated */ case HT_AREA: case HT_BASE: case HT_BASEFONT: case HT_BR: case HT_DOCTYPE: case HT_FRAME: case HT_HR: case HT_IMG: case HT_INPUT: case HT_ISINDEX: case HT_LINK: case HT_META: case HT_STYLE: case HT_TAB: case HT_ZTEXT: return(False); /* all other elements are always terminated */ default: return(True); } return(False); /* not reached */ } /***** * Name: isBodyElement * Return Type: Boolean * Description: checks whether the given id is allowed to appear inside the * tag. * In: * id: id to check. 
* Returns:
*	True when allowed, False if not.
*****/
static Boolean
isBodyElement(htmlEnum id)
{
	switch(id)
	{
		/*
		* all elements but these belong in body: document structure and
		* head elements, frames, and plain text (HT_ZTEXT).
		*/
		case HT_DOCTYPE:
		case HT_BASE:
		case HT_HTML:
		case HT_HEAD:
		case HT_LINK:
		case HT_META:
		case HT_STYLE:
		case HT_TITLE:
		case HT_ZTEXT:
		case HT_FRAMESET:
		case HT_FRAME:
			return(False);
		default:
			return(True);
	}
	return(True);	/* not reached */
}

/*****
* Name: 		checkOccurance
* Return Type: 	int
* Description: 	checks whether the appearance of the current token is
*				allowed in the current parser state.
* In: 
*	PARSER:		current parser
*	current:	HTML token to check
*	state:		parser state
* Returns:
*	When current is not allowed, the id of the element that should be
*	preceding this one. If no suitable preceding element can be deduced,
*	it returns -1. When the element is allowed, HT_ZTEXT is returned.
*****/
static int
checkOccurance(PARSER, htmlEnum current, htmlEnum state)
{
	stateStack *curr;

	switch(current)
	{
		case HT_DOCTYPE:
			return((int)HT_ZTEXT); /* always allowed */

		case HT_HTML:
			/* only allowed directly in the initial (HT_DOCTYPE) state */
			if(state == HT_DOCTYPE)
				return((int)HT_ZTEXT);
			return(-1);

		case HT_BODY:
			if(state == HT_HTML || state == HT_FRAMESET)
				return((int)HT_ZTEXT);
			else
			{
				/* try and guess an appropriate return value */
				if(state == HT_HEAD)
					return((int)HT_HEAD);
				else
					return((int)HT_HTML);
			}
			return(-1);	/* not reached */

		case HT_HEAD:
		case HT_FRAMESET:
			if(state == HT_HTML || state == HT_FRAMESET)
				return((int)HT_ZTEXT);
			else
				return((int)HT_HTML); /* obvious */
			break;

		case HT_NOFRAMES:
			if(state == HT_BODY)
				return((int)HT_ZTEXT);
			else
				return((int)HT_BODY); /* not really obvious */
			break;

		case HT_FRAME:
			if(state == HT_FRAMESET)
				return((int)HT_ZTEXT);
			else
				return((int)HT_FRAMESET); /* obvious */
			break;

		case HT_BASE:
		case HT_ISINDEX:
		case HT_LINK:
		case HT_META:
		case HT_SCRIPT:
		case HT_STYLE:
		case HT_TITLE:
			if(state == HT_HEAD)
				return((int)HT_ZTEXT); /* only allowed in the <HEAD> section */
			else
				return((int)HT_HEAD); /* obvious */
			break;

		case HT_A:
			if(state == HT_A)
				return(-1);
/* no nested anchors */ case HT_APPLET: case HT_B: case HT_BASEFONT: case HT_BIG: case HT_BR: case HT_CITE: case HT_CODE: case HT_DFN: case HT_EM: case HT_FONT: case HT_I: case HT_IMG: case HT_INPUT: case HT_KBD: case HT_MAP: case HT_SMALL: case HT_SAMP: case HT_SELECT: case HT_STRIKE: case HT_STRONG: case HT_SUB: case HT_SUP: case HT_TAB: case HT_TEXTAREA: case HT_TT: case HT_U: case HT_VAR: if(IS_CONTAINER(state) || IS_MARKUP(state) || IS_MISC(state)) return((int)HT_ZTEXT); else return(-1); /* too bad, obliterate it */ case HT_ZTEXT: return(HT_ZTEXT); /* always allowed */ case HT_AREA: /* only allowed when inside a */ if(state == HT_MAP) return((int)HT_ZTEXT); else return((int)HT_MAP); /* obvious */ break; case HT_P: if(state == HT_ADDRESS || IS_CONTAINER(state)) return((int)HT_ZTEXT); /* guess a proper return value */ switch(state) { case HT_OL: case HT_UL: case HT_DIR: case HT_MENU: return((int)HT_LI); case HT_TABLE: return((int)HT_TR); case HT_DL: default: return(-1); /* too bad, obliterate it */ } return(-1); /* not reached */ case HT_FORM: if(state == HT_FORM) return(-1); /* no nested forms */ /* fall thru */ case HT_ADDRESS: case HT_BLOCKQUOTE: case HT_CENTER: case HT_DIV: case HT_H1: case HT_H2: case HT_H3: case HT_H4: case HT_H5: case HT_H6: case HT_HR: case HT_TABLE: case HT_DIR: case HT_MENU: case HT_PRE: case HT_OL: case HT_UL: if(IS_CONTAINER(state)) return((int)HT_ZTEXT); return(-1); /* too bad, obliterate it */ case HT_LI: if(state == HT_UL || state == HT_OL || state == HT_DIR || state == HT_MENU) return((int)HT_ZTEXT); /* * Guess a return value: walk the current parser state and * see if a list is already present. If it's not, return HT_UL, * else return -1. */ for(curr = ATTR(stack); curr->next != NULL; curr = curr->next) { if(curr->id == HT_UL || curr->id == HT_OL || curr->id == HT_DIR || curr->id == HT_MENU) return(-1); } return((int)HT_UL); /* start a new list */ /* *

receives special treatment: people often use this element * out of context to get an indented outline by using the following * sequence:

..