/* netrik -- The ANTRIK Internet Viewer Copyright (C) Olaf D. Buddenhagen AKA antrik, et al (see AUTHORS) Published under the GNU GPL; see LICENSE for details. */ /* * parse-syntax.c -- this one parses the HTML file to a parse tree. * * (C) 2001 - 2004 antrik * * The generated parse tree contains all HTML elements, their attributes, and * all content. (Text inside the elements.) */ #include #include #include #include #include #include "debug.h" #include "cfg.h" #include "screen.h" #include "syntax.h" #include "load.h" static struct Element *add_element(void); /* create new element node in parse tree */ static void buf_add_char(char chr); /* append single character to text buffer */ static void insert_buf(char *ptr[]); /* insert "text_buf" into parse tree at specified position */ static int html_error(struct Resource *input_res, enum Syntax_error old_level, enum Syntax_error level, const char *err_msg, const char *handling_msg, ...); /* handle syntax error */ static struct Element *cur_el; /* element whose content is currently parsed */ static struct Element *last_el; /* last encountered sub-element (if currently parsing a tag, this is already the new element opened by that tag; if no children, this is identical to cur_el) */ static char *text_buf; /* buffer containing currently extracted content before inserted into parse tree (can also contain tag name, attribute name, or attribute value in respective parsing modes) */ static int text_buf_len; /* current length of "text_buf" (=position to append next char) */ /* create new element node in parse tree; * returns pointer to new node */ static struct Element *add_element(void) { struct Element *new_el; /* newly created element node */ /* alloc element */ new_el=malloc(sizeof(struct Element)); if(new_el==NULL) { fprintf(stderr, "memory allocation error while syntax parsing (in function add_element)\n"); exit(1); } /* insert into tree */ new_el->parent=cur_el; /* my child, so I am parent... 
*/ new_el->list_next=NULL; /* no elements yet behind the new one */ if(cur_el!=NULL) /* has parent (shouldn't happen only while creating root) -> insert (append) to existing tree */ last_el->list_next=new_el; /* no other data yet */ new_el->name.str=NULL; new_el->content=NULL; new_el->attr_count=0; new_el->attr=NULL; new_el->closed=0; return(new_el); } /* append a char to text_buf */ static void buf_add_char(chr) char chr; /* char to append */ { text_buf=realloc(text_buf, ++text_buf_len); /* resize buf */ if(text_buf==NULL) { fprintf(stderr, "memory allocation error while syntax parsing (in function buf_add_char)\n"); exit(1); } text_buf[text_buf_len-1]=chr; /* append char */ } /* insert text_buf into parse tree at specified position; * text_buf can be used for new text blocks afterwards */ static void insert_buf(ptr) char *ptr[]; /* string to be set to "text_buf" (by reference) */ { buf_add_char(0); /* 0-terminate string */ *ptr=text_buf; /* insert */ text_buf=NULL; /* free "text_buf" pointer for further use */ text_buf_len=0; } /* * Handle HTML syntax errors. * * Print error message; if "XHTML_ONLY" or if "CORRECT_HTML" mode, quit * immediately; otherwise, print how error will be handled, and return highest * error level up to now. */ static int html_error( struct Resource *input_res, /* resource descriptor of input page (may need to close input pipe) */ enum Syntax_error old_level, /* highest error level up to now */ enum Syntax_error level, /* new error level */ const char *err_msg, /* what error occured */ const char *handling_msg, /* how error will be handled */ ... 
/* additional paramters for err_msg or handling_msg */ ) { va_list arg_ptr; if(!cfg.debug) { /* handle repeated errors */ static int ignored=0; static char **ignore=NULL; if(err_msg==NULL) { /* flush ignore list */ if(ignored) fprintf(stderr, "\n(%d more errors suppressed)\n", ignored); ignored=0; free(ignore); ignore=NULL; return old_level; } else { /* normal operation -> check whether this error already occured */ int msg; if(ignore==NULL) { ignore=malloc(sizeof(char *)); if(ignore==NULL) { fprintf(stderr, "Memory allocation error while parsing syntax (in function html_error())\n"); exit(1); } ignore[0]=NULL; } for(msg=0; ignore[msg]!=NULL; ++msg) if(!strcmp(ignore[msg], err_msg)) break; if(ignore[msg]!=NULL) { /* found in list */ ++ignored; return old_level; } else { /* new error */ ignore=realloc(ignore, sizeof(char *[msg+2])); if(ignore==NULL) { fprintf(stderr, "Memory allocation error while parsing syntax (in function html_error())\n"); exit(1); } ignore[msg]=(char *)err_msg; ignore[msg+1]=NULL; } /* new error */ } /* normal operation */ } /* handle repeated errors */ /* print error message */ DMSG(("\n")); #ifdef DEBUG if(cfg.debug) set_color_raw(COLOR_WHITE|8); #endif printf("HTML %s: ", level>=SE_WORKAROUND?"error":"warning"); #ifdef DEBUG reset_color_raw(); #endif va_start(arg_ptr, handling_msg); vprintf(err_msg, arg_ptr); va_end(arg_ptr); printf("\n"); /* exit if necessary */ #ifndef XHTML_ONLY if(cfg.parser==FUSSY_HTML) /* break on all errors */ #endif { if(input_res->type==RES_PIPE) pclose(input_res->handle.stream); exit(2); } /* print handling message */ va_start(arg_ptr, handling_msg); vprintf(handling_msg, arg_ptr); va_end(arg_ptr); printf("\n"); fflush(stdout); /* return new err_level */ if(level>old_level) return level; else return old_level; } /* parse syntax of (X)HTML file; * returns pointer to top of resulting syntax tree */ struct Element *parse_syntax(input, err_level) struct Resource *input; enum Syntax_error *err_level; /* return: syntax 
errors encountered in page */ { /* mode to parse next char in (<<8 to allow simple combining with "dispatch_char") */ enum { PM_CONTENT=0x0000, /* content (text between/inside elements) */ PM_BLANK=0x0100, /* spaces/newlines in content mode */ PM_PRE=0x0200, /* inside
 element */

      PM_AMP=0x1000,    /* beginning of entity/character reference (after '&') */
      PM_REF=0x1100,    /* inside character entity reference */
      PM_REF_NUM_START=0x1200,    /* beginning of numerical character reference (after "&#") */
      PM_REF_NUM=0x1300,    /* inside numerical character reference */
      PM_REF_HEX=0x1400,    /* inside hexal character reference */

      PM_TAG_START=0x2000,    /* inside tag, before name (after '<') */
      PM_TAG_NAME=0x2100,    /* name of start tag (or single tag) */
      PM_END_TAG_START=0x2200,    /* after '/' */
      PM_END_TAG_NAME=0x2300,    /* name of end tag */
      PM_END_TAG_SPACE=0x2400,    /* (optional) space between name of end tag and '>' */
      PM_TAG=0x2500,    /* inside tag, neither name nor attribute (between attributes) */
      PM_SINGLE_TAG=0x2600,    /* end of empty element tag (after '/') */
      PM_ATTR_NAME=0x2700,
      PM_ATTR_NAME_END=0x2800,    /* (optional) space between attribute name and '=' */
      PM_ATTR_VALUE=0x2900,    /* after '=', but before actual data */
      PM_ATTR_DATA_QUOT=0x2a00,    /* inside attribute value string (after the starting '"') */
      PM_ATTR_DATA_APOS=0x2b00,    /* inside attribute value string (after the starting '\'') */
#ifndef XHTML_ONLY
      PM_ATTR_DATA_NOQUOTE=0x2c00,    /* not quoted attribute value */
#endif

      PM_EXCLAM=0x3000,    /* '!' after '<' (comment, DOCTYPE declaration or CDATA section) */
      PM_COMMENT_START=0x3100,    /* first '-' after " */


   *err_level=SE_NO;    /* no errors yet */

   text_buf=NULL; text_buf_len=0;    /* nothing extracted yet */
   cur_el=NULL;    /* no elements yet */
   last_el=cur_el=tree_top=add_element();    /* create parse tree root */

   for(;;) {    /* process file char-wise */
      int	dispatch_char;    /* input character after modification for dispatch */
      int	dispatch;    /* combined state variable used to determine action in main dispatcher */
      int	recycle=0;    /* need additional dispatch pass (sometimes necessary after switching parsing mode */

      if(input->buf_ptr==input->buf_end) {    /* buf empty */
	 load(input);    /* -> read next block */
	 input->buf_ptr=input->buf;    /* reset read pointer */
      }

      if(input->buf_ptrbuf_end) {    /* buf nonempty */
	 in=*input->buf_ptr++;    /* read char */
	 have_data=1;
      } else {    /* still empty => eof */
	 if(!have_data && !input->user_break)
	    *err_level=SE_NODATA;
	 break;
      }

      DMSG(("%c", in));

      new_cr_mode=0;
      if(tolower(in)>='a' && tolower(in)<='z')    /* letter */
	 dispatch_char='a';    /* indicated by 'a' (shared dispatch case) */
      else if(in>='0' && in<='9')    /* digit */
	 dispatch_char='0';
      else if(strchr("_.:", in)!=NULL)    /* other chars allowed in names */
	 dispatch_char='~';
      else if(strchr(" \t\r\n\f", in)!=NULL) {    /* white space character */
	 if(cr_mode && in=='\n') {    /* '\n' after '\r' -> discard */
	    new_cr_mode=0;    /* discard only one '\n' */
	    continue;
	 } else if(in=='\r') {
	    in='\n';    /* treat '\r' like '\n' */
	    new_cr_mode=1;    /* if real '\n' follows, it has to be discarded */
	 }
	 dispatch_char=' ';
      } else if(!iscntrl(in) && strchr("&<>\";=/\'-![?#", in)==NULL)    /* other normal character (not control char and no html function in any situation) */
	 dispatch_char='$';
      else    /* characters which may have html function */
	 dispatch_char=in;    /* handled each seperately in dispatcher */
      cr_mode=new_cr_mode;

      do {    /* while recycle (additional pass needed) */
	 recycle=0;
	 dispatch=dispatch_char|parse_mode;    /* dispatch on new char and current state */
	 switch(dispatch) {

	    /* normal text char */
	    case 'a'|PM_CONTENT:
	    case '0'|PM_CONTENT:
	    case '~'|PM_CONTENT:
	    case '$'|PM_CONTENT:
	    case ';'|PM_CONTENT:
	    case '='|PM_CONTENT:
	    case '/'|PM_CONTENT:
	    case '>'|PM_CONTENT:
	    case '"'|PM_CONTENT:
	    case '\''|PM_CONTENT:
	    case '-'|PM_CONTENT:
	    case '!'|PM_CONTENT:
	    case '['|PM_CONTENT:
	    case '?'|PM_CONTENT:
	    case '#'|PM_CONTENT:
	       buf_add_char(in);
	       break;

	    /* blank: don't insert blanks immediately; only switch parse mode */
	    case ' '|PM_CONTENT:
	       parse_mode=PM_BLANK;
	       break;

	    /* successive blanks -> do nothing */
	    case ' '|PM_BLANK:
	       break;

	    /* normal char after blanks (any amount) -> insert single ' ' */
	    case 'a'|PM_BLANK:
	    case '0'|PM_BLANK:
	    case '~'|PM_BLANK:
	    case '$'|PM_BLANK:
	    case ';'|PM_BLANK:
	    case '='|PM_BLANK:
	    case '/'|PM_BLANK:
	    case '>'|PM_BLANK:
	    case '"'|PM_BLANK:
	    case '\''|PM_BLANK:
	    case '-'|PM_BLANK:
	    case '!'|PM_BLANK:
	    case '['|PM_BLANK:
	    case '?'|PM_BLANK:
	    case '#'|PM_BLANK:
	    case '&'|PM_BLANK:    /* reference after blank */
	       buf_add_char(' ');
	       parse_mode=PM_CONTENT;
	       recycle=1;    /* still need to process the new char */
	       break;

	    /* normal text char inside <pre> element */
	    case 'a'|PM_PRE:
	    case '0'|PM_PRE:
	    case '~'|PM_PRE:
	    case '$'|PM_PRE:
	    case ';'|PM_PRE:
	    case '='|PM_PRE:
	    case '/'|PM_PRE:
	    case '>'|PM_PRE:
	    case '"'|PM_PRE:
	    case '\''|PM_PRE:
	    case '-'|PM_PRE:
	    case '!'|PM_PRE:
	    case '['|PM_PRE:
	    case '?'|PM_PRE:
	    case '#'|PM_PRE:
	       buf_add_char(in);
	       break;

	    case ' '|PM_PRE:    /* whitespace in 
 -> store as normal char */
	       if(in=='\n' || textarea)    /* newline (or anything in