#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "ml.h"

/*
 * Test driver: load a fixed file, strip its HTML markup and print the
 * resulting text on stdout.
 *
 * The arguments are not used (envp is a common but non-standard third
 * parameter, kept so the signature's parameter list is unchanged).
 *
 * Returns 0 on success, 1 if the file could not be loaded.
 */
int main(int argc,char **argv, char **envp)
{
  char *path = "/u/max/foo";
  Binary_Buffer *b;
  char *ret;

  (void) argc;
  (void) argv;
  (void) envp;

  /* load_binary_file() returns NULL on any failure - do not dereference */
  b = load_binary_file(path);
  if(b == NULL) {
    fprintf(stderr,"cannot load %s\n",path);
    return 1;
  }

  /* strip_html() returns NULL for empty input; passing that to %s is
     undefined behaviour, so only print a real result */
  ret = strip_html((char *) b->data);
  if(ret != NULL)
    printf("%s\n",ret);

  return 0;
}

/* Every fs_get() call below this line is a plain malloc(): the result may
   be NULL and is released with free(). */
#define fs_get malloc

TagPair HTMLtags[] = {
  "<P>", "\n",
  "<BR>", "\n",
  "<LI>", "\n\t",
  "<UL>", "\n",
  "<TR>", "\n",
  "</BLOCKQUOTE>", "\n-----\n",
  NULL, NULL
};

TagPair AMPtags[] = {
  "&nbsp;", " ",
  "&amp;", "&",
  "&lt;", "<",
  "&gt;", ">",
  "&copy", "\251",
  NULL, NULL
};
  


/*
 * strip_html - produce a plain-text rendering of an HTML string.
 *
 *   src   NUL-terminated HTML text; it is read but not modified.
 *
 * Returns a newly allocated (fs_get) NUL-terminated string, or NULL when
 * src is NULL or empty or the allocation fails.  The caller owns the
 * result.
 *
 * What the conversion does:
 *   - outside <PRE>, a newline becomes a single space;
 *   - tags listed in HTMLtags and entities listed in AMPtags are replaced
 *     by their table value;
 *   - <A ... HREF=url ...> becomes <url>;
 *   - <BLOCKQUOTE ...> becomes a "-----" rule and <P ...> a newline;
 *   - between <PRE> and </PRE> every character, including newlines, '<'
 *     and '&', is copied unchanged;
 *   - every other tag is dropped;
 *   - an unknown but well-formed entity (&name; or &#nn;) is dropped, while
 *     an '&' that does not start one is kept as ordinary text.
 *
 * The output buffer is sized from strlen(src): each replacement emitted
 * here is no longer than the input it consumes.  New table entries must
 * keep that property (value not longer than tag).
 */
#ifdef __STDC__
char * strip_html(char *src)
#else
char *strip_html(src)
char *src;
#endif
{

  char *ret = NULL;
  char *sptr = src;
  char *rptr = NULL;
  char *eptr;
  int tptr;
  int matched;
  int preformatted = 0;
  size_t len;

  if((src == NULL) || (*src == NUL_TERM))
    return ret;

  rptr = ret = fs_get(strlen(src) + FILEBUFFLEN);
  if(ret == NULL)
    return ret;

  while(*sptr != NUL_TERM) {

    if(preformatted) {
      /* inside <PRE>: copy everything verbatim - newlines included -
         until the closing tag shows up */
      if((strncasecmp(sptr,"</PRE>",6)) != STRMATCH) {
        *rptr++ = *sptr++;
        continue;
      }
      preformatted = 0;
      /* sptr is at the '<' of </PRE>; the tag code below discards it */
    }
    else if(*sptr == '\n') {
      *rptr++ = ' ';
      sptr ++;
      continue;
    }
    else if(*sptr != '<' && *sptr != '&') {
      *rptr++ = *sptr++;
      continue;
    }

    if(*sptr == '&') {
      matched = 0;
      for(tptr = 0; AMPtags[tptr].tag != NULL; tptr ++) {
        len = strlen(AMPtags[tptr].tag);
        if((len > 0)
           && ((strncasecmp(sptr,AMPtags[tptr].tag,len)) == STRMATCH)) {
          strcpy(rptr,AMPtags[tptr].value);
          rptr += strlen(AMPtags[tptr].value);
          /* consume exactly the entity, plus its ';' when the table
             entry was written without one */
          sptr += len;
          if((AMPtags[tptr].tag[len - 1] != ';') && (*sptr == ';'))
            sptr ++;
          matched = 1;
          break;
        }
      }
      if(matched)
        continue;

      /* Not in the table.  Drop it only when it really is a reference
         (&name; or &#nn;); otherwise the '&' is ordinary text and must
         not swallow whatever follows up to the next ';'. */
      eptr = sptr + 1;
      while(isalnum((unsigned char) *eptr) || (*eptr == '#'))
        eptr ++;
      if((eptr > sptr + 1) && (*eptr == ';'))
        sptr = eptr + 1;
      else
        *rptr++ = *sptr++;
      continue;
    }

    /* only '<' can reach this point */

    for(tptr = 0; HTMLtags[tptr].tag != NULL; tptr ++) {
      len = strlen(HTMLtags[tptr].tag);
      if((strncasecmp(sptr,HTMLtags[tptr].tag,len)) == STRMATCH) {
        strcpy(rptr,HTMLtags[tptr].value);
        rptr += strlen(HTMLtags[tptr].value);
        sptr ++; /* advance past tag start so no more matches */
        break;
      }
    }
    /* fall through to find tag end */

    /* special cases: */

    /* link - retain the href */

    if((strncasecmp(sptr,"<A ",3)) == STRMATCH) {
      while((*sptr != NUL_TERM) && (*sptr != '>')) {
        if((strncasecmp(sptr," HREF=",6)) == STRMATCH) {
          sptr += 6;
          *rptr++ = '<';
          /* the value runs to the next blank or the end of the tag;
             any quotes around it are copied as they are */
          while((*sptr != NUL_TERM) && (*sptr != ' ') && (*sptr != '>'))
            *rptr++ = *sptr++;
          *rptr++ = '>';
          continue;
        }
        else
          sptr++;
      }
      if(*sptr) sptr ++;
      continue; /* do not fall through */
    }

    /* blockquote - pretty it up a bit */

    if((strncasecmp(sptr,"<BLOCKQUOTE",11)) == STRMATCH) {
      strcpy(rptr,"\n-----\n");
      rptr += 7;
      /* fall through to find tag end */
    }

    /* paragraph tags with args, very common */

    if((strncasecmp(sptr,"<P ",3)) == STRMATCH)
      *rptr++ = '\n';
    /* fall through to find tag end */

    /* preformatted text - flag it for special handling */

    if(((strncasecmp(sptr,"<PRE>",5)) == STRMATCH)
       || ((strncasecmp(sptr,"<PRE ",5)) == STRMATCH))
      preformatted = 1;

    /* catch fall throughs and all other unrecognized tags */
    while((*sptr != NUL_TERM) && (*sptr != '>'))
      sptr ++;
    if(*sptr) sptr ++;
  }

  *rptr = NUL_TERM;
  return ret;

}


/*
 * load_binary_file - read a whole file into memory.
 *
 *   filename   path of the file to read.
 *
 * Returns a Binary_Buffer allocated with fs_get whose length is the file
 * size reported by stat() and whose data holds that many bytes followed
 * by one extra NUL_TERM byte, so text files can be used as C strings.
 *
 * Returns NULL when filename is NULL, the file cannot be stat'ed or
 * opened, is empty, memory runs out, or fewer bytes than expected could
 * be read.  Nothing is left allocated or open on failure.
 *
 * The caller owns the result.  Both the struct and its data come from
 * fs_get, which is malloc in this file, so they are released with free().
 */
#ifdef __STDC__
Binary_Buffer *load_binary_file(char *filename)
#else
Binary_Buffer *load_binary_file(filename)
     char *filename;
#endif
{
  FILE *fp;
  Binary_Buffer *binary_buffer = NULL;
  struct stat st;
  unsigned long bytes_read  = 0L;
  unsigned long bytes_total = 0L;
  unsigned long bytes_wanted;
  char *ptr;

  /* fopen comes last so no stream is open when an earlier test fails;
     "rb" keeps the bytes untranslated on systems that distinguish text
     from binary streams */
  if((filename == NULL)
     || ((stat(filename,&st)) != SYSCALL_SUCCESS)
     || (st.st_size <= 0)
     || ((fp = fopen(filename,"rb")) == NULL))
    return(NULL);

  bytes_wanted = (unsigned long) st.st_size;

  binary_buffer = (Binary_Buffer *) fs_get(sizeof(Binary_Buffer));
  if(binary_buffer == NULL) {
    fclose(fp);
    return(NULL);
  }

  binary_buffer->length = bytes_wanted;
  binary_buffer->data = (unsigned char *) fs_get(bytes_wanted + 1);
  if(binary_buffer->data == NULL) {
    free(binary_buffer);
    fclose(fp);
    return(NULL);
  }

  ptr = (char *) binary_buffer->data;

  /* fread may deliver less than asked for; keep going until the whole
     file is in or no more data arrives */
  do {
    bytes_read = fread(ptr, 1, bytes_wanted - bytes_total, fp);
    bytes_total += bytes_read;
    ptr += bytes_read;
  } while ((bytes_read != 0) && (bytes_total < bytes_wanted));

  fclose(fp);

  if(bytes_total != bytes_wanted) {
    /* short read: hand nothing back and leak nothing */
    free(binary_buffer->data);
    free(binary_buffer);
    return(NULL);
  }

  binary_buffer->data[bytes_wanted] = NUL_TERM;
  return(binary_buffer);
}




/* syntax highlighted by Code2HTML, v. 0.9.1 */