/*
 * Program: Synonym
 * File: html_parser.c
 * Author: Cristian Draghici
 * Date: 11 Sep 2003
 *
 * $Id: html_parser.c,v 1.2 2003/10/30 14:54:10 diciu Exp $
 *
 * Licensed under the Modulo Consulting Software License
 * (see file license.txt)
 * 
 */
#include <libxml/HTMLparser.h>
#include <libxml/tree.h>
#include <string.h>
#include <stdarg.h>
#include <sys/stat.h>

#include <syslog.h>
#include "disclaimer.h"
#include "html_parser.h"


#define print_debug syslog

/*
 * SAX endElement callback: remembers where the closing </body> tag was seen.
 *
 * ctx  - parser context passed back by the SAX parser; it is cast to
 *        xmlParserCtxtPtr and its _private member is read as a tag_locator.
 * name - name of the element that just ended (may be NULL).
 *
 * Only an element whose name matches "body" (case-insensitively) updates the
 * locator's lin/col fields. There is no fallback to </html> here: any other
 * element name is ignored.
 */
static void disclaimer_end_element(void *ctx, const xmlChar *name)
{
	tag_locator *locator;
	int line;

	if(ctx == NULL || name == NULL)
		return;

	/* xmlChar is an unsigned char type, so cast explicitly for strcasecmp */
	if(strcasecmp((const char *)name, "body") != 0)
		return;

	/* query the line once and reuse it for both the log and the locator */
	line = getLineNumber(ctx);
	print_debug(LOG_DEBUG, "html_parse: End line is %d.", line);

	locator = (tag_locator *)(((xmlParserCtxtPtr)ctx)->_private);
	if(locator == NULL)
	{
		/* no locator attached to this context: nowhere to store the position */
		return;
	}

	locator->lin = line;
	locator->col = getColumnNumber(ctx);
	print_debug(LOG_DEBUG, "html_parse: Body is ending at %d, %d.", 
		locator->lin, locator->col);
}

/*
 * SAX warning callback.
 *
 * Emits a "SAX.warning: " marker to syslog, followed by the parser's own
 * printf-style message; both are logged at LOG_DEBUG. ctx is not used.
 */
static void disclaimer_warning(void *ctx, const char *msg, ...)
{
    va_list ap;

    (void)ctx;

    syslog(LOG_DEBUG, "SAX.warning: ");

    va_start(ap, msg);
    vsyslog(LOG_DEBUG, msg, ap);
    va_end(ap);
}

/*
 * SAX error callback.
 *
 * Emits a "SAX.error: " marker to syslog, followed by the parser's own
 * printf-style message; both are logged at LOG_DEBUG. ctx is not used.
 */
static void disclaimer_error(void *ctx, const char *msg, ...)
{
    va_list ap;

    (void)ctx;

    syslog(LOG_DEBUG, "SAX.error: ");

    va_start(ap, msg);
    vsyslog(LOG_DEBUG, msg, ap);
    va_end(ap);
}

/*
 * SAX fatal-error callback.
 *
 * Emits a "SAX.fatalError: " marker to syslog, followed by the parser's own
 * printf-style message; both are logged at LOG_DEBUG. ctx is not used.
 */
static void disclaimer_fatal_error(void *ctx, const char *msg, ...)
{
    va_list ap;

    (void)ctx;

    syslog(LOG_DEBUG, "SAX.fatalError: ");

    va_start(ap, msg);
    vsyslog(LOG_DEBUG, msg, ap);
    va_end(ap);
}

/* the XMLSAXHandler structure, with our callbacks defined */
xmlSAXHandler disclaimer_SAXHandler_struct = {
	NULL, /* internalSubset */
	NULL, /* isStandalone */
	NULL, /* hasInternalSubset */
	NULL, /* hasExternalSubset */
	NULL, /* resolveEntity */
	NULL, /* getEntity */
	NULL, /* entityDecl */
	NULL, /* notationDecl */
	NULL, /* attributeDecl */
	NULL, /* elementDecl */
	NULL, /* unparsedEntityDecl */
	NULL, /* setDocumentLocator */
	NULL, /* startDocument */
	NULL, /* endDocument */
	NULL, /* startElement */
	disclaimer_end_element, /* endElement */
	NULL, /* reference */
	NULL, //disclaimer_characters, /* characters */
	NULL, /* ignorableWhitespace */
	NULL, /* processingInstruction */
	NULL, /* comment */
	disclaimer_warning, /* xmlParserWarning */
	disclaimer_error, /* xmlParserError */
	disclaimer_fatal_error, /* xmlParserError */
	NULL, /* getParameterEntity */
	NULL, /* cdataBlock */
	NULL, /* externalSubset */
	1
};

/*
 * Attempts to locate the </body> tag inside an input stream.
 *
 * The stream is fed to an HTML push parser: first a 4-byte seed (presumably
 * so the parser can detect the encoding - confirm against the libxml
 * documentation), then pieces of at most buffer_size bytes. No more than
 * html_chunk_size bytes in total are consumed from input; reading also stops
 * early at end of file. The stream is neither rewound nor closed here.
 *
 * input           - stream to read the HTML from
 * buffer          - caller-supplied scratch buffer used for every read
 * buffer_size     - size of buffer in bytes; must be at least 4
 * p_column        - receives the column recorded at </body>, or -1
 * p_line          - receives the line recorded at </body>, or -1
 * html_chunk_size - total number of bytes of HTML to parse; must be at least 4
 *
 * Returns DISCLAIMER_SUCCESS when a position was recorded, and
 * DISCLAIMER_FAILURE when none was (this includes an empty stream and a
 * parser that could not be created). When buffer_size or html_chunk_size is
 * below 4 the function returns 0 and leaves *p_line / *p_column untouched.
 * NOTE(review): that literal 0 is kept for compatibility with existing
 * callers; confirm it does not collide with DISCLAIMER_SUCCESS.
 */
int locate_end_tag(FILE *input, char * buffer, int buffer_size, int * p_column, int * p_line, long html_chunk_size)
{
	htmlParserCtxtPtr ctxt;
	tag_locator locator;
	long consumed, remaining;
	int bytes_read, want;
	
	if(buffer_size < 4)
	{
		print_debug(LOG_ERR, "html_parse: Go away. buffer too small!");
		return 0;
	}
	if(html_chunk_size < 4)
	{
		print_debug(LOG_ERR, "html_parse: Html chunk size smaller than 4 chars!");
		return 0;
	}

	print_debug(LOG_DEBUG, "html_parse: Creating html parser\n");
	locator.lin = -1;
	locator.col = -1;
	
	bytes_read = (int)fread(buffer, 1, 4, input);
	if(bytes_read <= 0)
	{
		/* nothing to parse: no parser context exists, so report "not found" */
		*p_line = -1;
		*p_column = -1;
		return DISCLAIMER_FAILURE;
	}
	consumed = bytes_read;

	ctxt = htmlCreatePushParserCtxt(&disclaimer_SAXHandler_struct, NULL, buffer, bytes_read, NULL, 0);
	if(ctxt == NULL)
	{
		print_debug(LOG_ERR, "NULL context");
		*p_line = -1;
		*p_column = -1;
		return DISCLAIMER_FAILURE;
	}
	
	/* the endElement callback stores the </body> position through _private */
	ctxt->_private = (void *)&locator;
	
	/* feed the rest, never requesting more than what is left of html_chunk_size */
	while(consumed < html_chunk_size)
	{
		remaining = html_chunk_size - consumed;
		want = remaining < (long)buffer_size ? (int)remaining : buffer_size;

		bytes_read = (int)fread(buffer, 1, want, input);
		if(bytes_read <= 0)
			break;

		htmlParseChunk(ctxt, buffer, bytes_read, 0);
		consumed += bytes_read;
	}

	/* zero-length chunk with terminate=1 tells the parser the input is complete */
	htmlParseChunk(ctxt, buffer, 0, 1 );
	
	print_debug(LOG_DEBUG, "html_parse: All done %d %d\n", 
		locator.lin, 
		locator.col);
	
	*p_line = locator.lin;
	*p_column = locator.col;
	
	htmlFreeParserCtxt(ctxt);
	
	if(*p_column == -1 && *p_line == -1)
		return DISCLAIMER_FAILURE;
	
	return DISCLAIMER_SUCCESS;
}

/*
 * Attempts to locate the </body> tag inside an in-memory string.
 *
 * The first 4 bytes of input seed the HTML push parser, the following
 * html_chunk_size - 4 bytes are fed as a single chunk, and the parse is then
 * terminated. The caller must therefore make at least html_chunk_size bytes
 * readable at input.
 *
 * input           - start of the HTML data
 * buffer          - only passed along with the zero-length terminating chunk
 * buffer_size     - must be at least 4 (checked, otherwise unused here)
 * p_column        - receives the column recorded at </body>, or -1
 * p_line          - receives the line recorded at </body>, or -1
 * html_chunk_size - number of bytes of HTML at input; must be at least 4
 *
 * Returns DISCLAIMER_SUCCESS when a position was recorded, and
 * DISCLAIMER_FAILURE when none was (this includes a parser that could not be
 * created). When buffer_size or html_chunk_size is below 4 the function
 * returns 0 and leaves *p_line / *p_column untouched.
 * NOTE(review): that literal 0 is kept for compatibility with existing
 * callers; confirm it does not collide with DISCLAIMER_SUCCESS.
 */
int locate_end_tag_in_string(char *input, char * buffer, int buffer_size, int * p_column, int * p_line, long html_chunk_size)
{
	htmlParserCtxtPtr ctxt;
	tag_locator locator;

	if(buffer_size < 4)
	{
		print_debug(LOG_ERR, "html_parse: Go away. buffer too small!");
		return 0;
	}
	if(html_chunk_size < 4)
	{
		print_debug(LOG_ERR, "html_parse: Html chunk size smaller than 4 chars!");
		return 0;
	}

	locator.lin = -1;
	locator.col = -1;
	
	print_debug(LOG_DEBUG, "html_parse: Creating html parser from string");
	ctxt = htmlCreatePushParserCtxt(&disclaimer_SAXHandler_struct, NULL, input, 4, NULL, 0);
	if(ctxt == NULL)
	{
		/* report through syslog like the rest of the file, then fail cleanly */
		print_debug(LOG_ERR, "NULL context");
		*p_line = -1;
		*p_column = -1;
		return DISCLAIMER_FAILURE;
	}

	/* the endElement callback stores the </body> position through _private */
	ctxt->_private = (void *)&locator;

	/* NOTE(review): html_chunk_size is a long; verify it fits the size argument of htmlParseChunk */
	htmlParseChunk(ctxt, (input+4), html_chunk_size-4, 0);
	
	/* zero-length chunk with terminate=1 tells the parser the input is complete */
	htmlParseChunk(ctxt, buffer, 0, 1 );
	print_debug(LOG_DEBUG, "html_parse: All done %d %d\n", 
		locator.lin, 
		locator.col);
	
	*p_line = locator.lin;
	*p_column = locator.col;
	
	htmlFreeParserCtxt(ctxt);
	
	if(*p_column == -1 && *p_line == -1)
		return DISCLAIMER_FAILURE;
	
	return DISCLAIMER_SUCCESS;
}


/* syntax highlighted by Code2HTML, v. 0.9.1 */