/* * Program: Synonym * File: html_parser.c * Author: Cristian Draghici * Date: 11 Sep 2003 * * $Id: html_parser.c,v 1.2 2003/10/30 14:54:10 diciu Exp $ * * Licensed under the Modulo Consulting Software License * (see file license.txt) * */ #include #include #include #include #include #include #include "disclaimer.h" #include "html_parser.h" #define print_debug syslog /* Called by the SAX parser when it reaches the end of an element (e.g. ) */ static void disclaimer_end_element(void *ctx, const xmlChar *name) { tag_locator *locator; /* ideally added before the or if that's missing the */ if(name != NULL && !strcasecmp(name, "body")) { print_debug(LOG_DEBUG, "html_parse: End line is %d.", getLineNumber(ctx)); locator = (tag_locator *)(((xmlParserCtxtPtr)ctx)->_private); locator->lin = getLineNumber(ctx); locator->col = getColumnNumber(ctx); print_debug(LOG_DEBUG, "html_parse: Body is ending at %d, %d.", locator->lin, locator->col); } } /* Called by the SAX parser on warning, just proxies the log message to syslog */ static void disclaimer_warning(void *ctx, const char *msg, ...) { va_list args; va_start(args, msg); syslog(LOG_DEBUG, "SAX.warning: "); vsyslog(LOG_DEBUG, msg, args); va_end(args); } /* Called by the SAX parser on error, just proxies the log message to syslog */ static void disclaimer_error(void *ctx, const char *msg, ...) { va_list args; va_start(args, msg); syslog(LOG_DEBUG, "SAX.error: "); vsyslog(LOG_DEBUG, msg, args); va_end(args); } /* Called by the SAX parser on fatal error, just proxies the log message to syslog */ static void disclaimer_fatal_error(void *ctx, const char *msg, ...) { va_list args; va_start(args, msg); syslog(LOG_DEBUG, "SAX.fatalError: "); vsyslog(LOG_DEBUG, msg, args); va_end(args); } /* the XMLSAXHandler structure, with our callbacks defined */ xmlSAXHandler disclaimer_SAXHandler_struct = { NULL, /* internalSubset */ NULL, /* isStandalone */ NULL, /* hasInternalSubset */ NULL, /* hasExternalSubset */ NULL, /* resolveEntity */ NULL, /* getEntity */ NULL, /* entityDecl */ NULL, /* notationDecl */ NULL, /* attributeDecl */ NULL, /* elementDecl */ NULL, /* unparsedEntityDecl */ NULL, /* setDocumentLocator */ NULL, /* startDocument */ NULL, /* endDocument */ NULL, /* startElement */ disclaimer_end_element, /* endElement */ NULL, /* reference */ NULL, //disclaimer_characters, /* characters */ NULL, /* ignorableWhitespace */ NULL, /* processingInstruction */ NULL, /* comment */ disclaimer_warning, /* xmlParserWarning */ disclaimer_error, /* xmlParserError */ disclaimer_fatal_error, /* xmlParserError */ NULL, /* getParameterEntity */ NULL, /* cdataBlock */ NULL, /* externalSubset */ 1 }; /* function that attempts to locate the tag offset inside an input stream */ int locate_end_tag(FILE *input, char * buffer, int buffer_size, int * p_column, int * p_line, long html_chunk_size) { int bytes_read, last_readable_chunk; htmlParserCtxtPtr ctxt; tag_locator locator; if(buffer_size < 4) { print_debug(LOG_ERR, "html_parse: Go away. buffer too small!"); return 0; } if(html_chunk_size < 4) { print_debug(LOG_ERR, "html_parse: Html chunk size smaller than 4 chars!"); return 0; } print_debug(LOG_DEBUG, "html_parse: Creating html parser\n"); locator.lin = -1; locator.col = -1; bytes_read = fread(buffer, 1, 4, input); if(bytes_read > 0) { ctxt = htmlCreatePushParserCtxt(&disclaimer_SAXHandler_struct, NULL, buffer, bytes_read, NULL, 0); if(ctxt == NULL) { /* TODO: treat this error */ print_debug(LOG_DEBUG, "NULL context"); } ctxt->_private = (void *)&locator; last_readable_chunk = buffer_size + bytes_read <= html_chunk_size ? buffer_size : html_chunk_size - bytes_read; while ((bytes_read = fread(buffer, 1, last_readable_chunk, input))) { htmlParseChunk(ctxt, buffer, bytes_read, 0); last_readable_chunk = buffer_size + bytes_read <= html_chunk_size ? buffer_size : html_chunk_size - bytes_read; } } htmlParseChunk(ctxt, buffer, 0, 1 ); print_debug(LOG_DEBUG, "html_parse: All done %d %d\n", ((tag_locator *)ctxt->_private)->lin, ((tag_locator *)ctxt->_private)->col); *p_line = ((tag_locator *)ctxt->_private)->lin; *p_column = ((tag_locator *)ctxt->_private)->col; htmlFreeParserCtxt(ctxt); if(*p_column == -1 && *p_line == -1) return DISCLAIMER_FAILURE; return DISCLAIMER_SUCCESS; } /* function that attempts to locate the tag offset inside a string */ int locate_end_tag_in_string(char *input, char * buffer, int buffer_size, int * p_column, int * p_line, long html_chunk_size) { htmlParserCtxtPtr ctxt; tag_locator locator; if(buffer_size < 4) { print_debug(LOG_ERR, "html_parse: Go away. buffer too small!"); return 0; } if(html_chunk_size < 4) { print_debug(LOG_ERR, "html_parse: Html chunk size smaller than 4 chars!"); return 0; } locator.lin = -1; locator.col = -1; print_debug(LOG_DEBUG, "html_parse: Creating html parser from string"); ctxt = htmlCreatePushParserCtxt(&disclaimer_SAXHandler_struct, NULL, input, 4, NULL, 0); if(ctxt == NULL) { /* TODO: treat this error */ printf("NULL context"); } ctxt->_private = (void *)&locator; htmlParseChunk(ctxt, (input+4), html_chunk_size-4, 0); htmlParseChunk(ctxt, buffer, 0, 1 ); print_debug(LOG_DEBUG, "html_parse: All done %d %d\n", ((tag_locator *)ctxt->_private)->lin, ((tag_locator *)ctxt->_private)->col); *p_line = ((tag_locator *)ctxt->_private)->lin; *p_column = ((tag_locator *)ctxt->_private)->col; htmlFreeParserCtxt(ctxt); if(*p_column == -1 && *p_line == -1) return DISCLAIMER_FAILURE; return DISCLAIMER_SUCCESS; }