/*
* Program: Synonym
* File: html_parser.c
* Author: Cristian Draghici
* Date: 11 Sep 2003
*
* $Id: html_parser.c,v 1.2 2003/10/30 14:54:10 diciu Exp $
*
* Licensed under the Modulo Consulting Software License
* (see file license.txt)
*
*/
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
#include <string.h>
#include <stdarg.h>
#include <sys/stat.h>
#include <syslog.h>
#include "disclaimer.h"
#include "html_parser.h"
/* print_debug is a plain alias for syslog: every call passes a syslog
 * priority followed by a printf-style format and its arguments. */
#define print_debug syslog
/*
 * SAX endElement callback: remembers where a closing </body> tag was seen.
 *
 * ctx  - parser context passed back by the SAX parser; its _private member
 *        is read as a pointer to a tag_locator.
 * name - name of the element being closed (may be NULL).
 *
 * Only "body" (compared case-insensitively) is handled. Falling back to
 * </html> when </body> is missing would be ideal but is not implemented.
 */
static void disclaimer_end_element(void *ctx, const xmlChar *name)
{
    tag_locator *locator;

    if (ctx == NULL || name == NULL)
        return;

    /* xmlChar is an unsigned byte type; strcasecmp() takes plain char pointers. */
    if (strcasecmp((const char *)name, "body") != 0)
        return;

    print_debug(LOG_DEBUG, "html_parse: End line is %d.", getLineNumber(ctx));

    locator = (tag_locator *)(((xmlParserCtxtPtr)ctx)->_private);
    if (locator == NULL)
        return; /* no locator attached to this context: nowhere to store the position */

    locator->lin = getLineNumber(ctx);
    locator->col = getColumnNumber(ctx);
    print_debug(LOG_DEBUG, "html_parse: Body is ending at %d, %d.",
                locator->lin, locator->col);
}
/*
 * SAX warning callback. Sends the "SAX.warning: " prefix and then the
 * parser's printf-style message to syslog, each with its own call at
 * LOG_DEBUG priority. The parser context is not consulted.
 */
static void disclaimer_warning(void *ctx, const char *msg, ...)
{
    va_list ap;

    (void)ctx;

    syslog(LOG_DEBUG, "SAX.warning: ");

    va_start(ap, msg);
    vsyslog(LOG_DEBUG, msg, ap);
    va_end(ap);
}
/*
 * SAX error callback. Sends the "SAX.error: " prefix and then the parser's
 * printf-style message to syslog, each with its own call at LOG_DEBUG
 * priority. The parser context is not consulted.
 */
static void disclaimer_error(void *ctx, const char *msg, ...)
{
    va_list ap;

    (void)ctx;

    syslog(LOG_DEBUG, "SAX.error: ");

    va_start(ap, msg);
    vsyslog(LOG_DEBUG, msg, ap);
    va_end(ap);
}
/*
 * SAX fatal-error callback. Sends the "SAX.fatalError: " prefix and then the
 * parser's printf-style message to syslog, each with its own call at
 * LOG_DEBUG priority. The parser context is not consulted.
 */
static void disclaimer_fatal_error(void *ctx, const char *msg, ...)
{
    va_list ap;

    (void)ctx;

    syslog(LOG_DEBUG, "SAX.fatalError: ");

    va_start(ap, msg);
    vsyslog(LOG_DEBUG, msg, ap);
    va_end(ap);
}
/* the XMLSAXHandler structure, with our callbacks defined */
xmlSAXHandler disclaimer_SAXHandler_struct = {
NULL, /* internalSubset */
NULL, /* isStandalone */
NULL, /* hasInternalSubset */
NULL, /* hasExternalSubset */
NULL, /* resolveEntity */
NULL, /* getEntity */
NULL, /* entityDecl */
NULL, /* notationDecl */
NULL, /* attributeDecl */
NULL, /* elementDecl */
NULL, /* unparsedEntityDecl */
NULL, /* setDocumentLocator */
NULL, /* startDocument */
NULL, /* endDocument */
NULL, /* startElement */
disclaimer_end_element, /* endElement */
NULL, /* reference */
NULL, //disclaimer_characters, /* characters */
NULL, /* ignorableWhitespace */
NULL, /* processingInstruction */
NULL, /* comment */
disclaimer_warning, /* xmlParserWarning */
disclaimer_error, /* xmlParserError */
disclaimer_fatal_error, /* xmlParserError */
NULL, /* getParameterEntity */
NULL, /* cdataBlock */
NULL, /* externalSubset */
1
};
/*
 * Locate the closing </body> tag in HTML read from a stream.
 *
 * input           - stream the HTML is read from
 * buffer          - scratch space used for the reads
 * buffer_size     - capacity of buffer in bytes (rejected when below 4)
 * p_column/p_line - receive the position recorded by the endElement
 *                   callback, or -1/-1 when no </body> was seen
 * html_chunk_size - upper bound on the number of bytes read from input
 *                   (rejected when below 4)
 *
 * Returns 0 when a size argument is rejected (outputs are left untouched),
 * DISCLAIMER_FAILURE when nothing could be read, the parser could not be
 * created, or no </body> was found, and DISCLAIMER_SUCCESS otherwise.
 *
 * NOTE(review): the size checks return a literal 0 rather than one of the
 * DISCLAIMER_* codes; confirm against callers which of the two is meant.
 */
int locate_end_tag(FILE *input, char * buffer, int buffer_size, int * p_column, int * p_line, long html_chunk_size)
{
    htmlParserCtxtPtr ctxt;
    tag_locator locator;
    long remaining;
    size_t want, got;

    if(buffer_size < 4)
    {
        print_debug(LOG_ERR, "html_parse: Go away. buffer too small!");
        return 0;
    }
    if(html_chunk_size < 4)
    {
        print_debug(LOG_ERR, "html_parse: Html chunk size smaller than 4 chars!");
        return 0;
    }

    print_debug(LOG_DEBUG, "html_parse: Creating html parser\n");
    locator.lin = -1;
    locator.col = -1;
    *p_line = -1;
    *p_column = -1;

    /* The push parser is created from the first (up to) 4 bytes of the stream. */
    got = fread(buffer, 1, 4, input);
    if(got == 0)
    {
        /* Nothing was read, so no parser exists to finish or free. */
        print_debug(LOG_DEBUG, "html_parse: Nothing to read");
        return DISCLAIMER_FAILURE;
    }

    ctxt = htmlCreatePushParserCtxt(&disclaimer_SAXHandler_struct, NULL, buffer, (int)got, NULL, 0);
    if(ctxt == NULL)
    {
        print_debug(LOG_DEBUG, "NULL context");
        return DISCLAIMER_FAILURE;
    }
    /* The endElement callback stores the tag position through this pointer. */
    ctxt->_private = (void *)&locator;

    /*
     * Feed the rest of the HTML in buffer-sized pieces. The running total is
     * tracked so that no more than html_chunk_size bytes are ever consumed.
     */
    remaining = html_chunk_size - (long)got;
    while(remaining > 0)
    {
        want = remaining < (long)buffer_size ? (size_t)remaining : (size_t)buffer_size;
        got = fread(buffer, 1, want, input);
        if(got == 0)
            break; /* EOF or read error before html_chunk_size bytes were seen */
        htmlParseChunk(ctxt, buffer, (int)got, 0);
        remaining -= (long)got;
    }

    /* A zero-length terminating chunk tells the parser the document is complete. */
    htmlParseChunk(ctxt, buffer, 0, 1 );
    print_debug(LOG_DEBUG, "html_parse: All done %d %d\n",
                locator.lin, locator.col);
    *p_line = locator.lin;
    *p_column = locator.col;
    htmlFreeParserCtxt(ctxt);

    if(*p_column == -1 && *p_line == -1)
        return DISCLAIMER_FAILURE;
    return DISCLAIMER_SUCCESS;
}
/*
 * Locate the closing </body> tag in HTML held in memory.
 *
 * input           - start of the HTML text; html_chunk_size bytes are read
 *                   from it
 * buffer          - only passed (with length 0) to the terminating parse call
 * buffer_size     - rejected when below 4
 * p_column/p_line - receive the position recorded by the endElement
 *                   callback, or -1/-1 when no </body> was seen
 * html_chunk_size - number of bytes of input to parse (rejected when below 4)
 *
 * Returns 0 when a size argument is rejected (outputs are left untouched),
 * DISCLAIMER_FAILURE when the parser could not be created or no </body> was
 * found, and DISCLAIMER_SUCCESS otherwise.
 *
 * NOTE(review): the size checks return a literal 0 rather than one of the
 * DISCLAIMER_* codes; confirm against callers which of the two is meant.
 */
int locate_end_tag_in_string(char *input, char * buffer, int buffer_size, int * p_column, int * p_line, long html_chunk_size)
{
    htmlParserCtxtPtr ctxt;
    tag_locator locator;

    if(buffer_size < 4)
    {
        print_debug(LOG_ERR, "html_parse: Go away. buffer too small!");
        return 0;
    }
    if(html_chunk_size < 4)
    {
        print_debug(LOG_ERR, "html_parse: Html chunk size smaller than 4 chars!");
        return 0;
    }

    locator.lin = -1;
    locator.col = -1;

    print_debug(LOG_DEBUG, "html_parse: Creating html parser from string");
    /* The push parser is created from the first 4 bytes of the text. */
    ctxt = htmlCreatePushParserCtxt(&disclaimer_SAXHandler_struct, NULL, input, 4, NULL, 0);
    if(ctxt == NULL)
    {
        /* Without a context there is nothing to parse or free; report "not found". */
        print_debug(LOG_DEBUG, "NULL context");
        *p_line = -1;
        *p_column = -1;
        return DISCLAIMER_FAILURE;
    }
    /* The endElement callback stores the tag position through this pointer. */
    ctxt->_private = (void *)&locator;

    /* Everything after the first 4 bytes goes in as one chunk. */
    htmlParseChunk(ctxt, (input+4), (int)(html_chunk_size-4), 0);
    /* A zero-length terminating chunk tells the parser the document is complete. */
    htmlParseChunk(ctxt, buffer, 0, 1 );
    print_debug(LOG_DEBUG, "html_parse: All done %d %d\n",
                locator.lin, locator.col);
    *p_line = locator.lin;
    *p_column = locator.col;
    htmlFreeParserCtxt(ctxt);

    if(*p_column == -1 && *p_line == -1)
        return DISCLAIMER_FAILURE;
    return DISCLAIMER_SUCCESS;
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */