#include <stdio.h>
#include <stdlib.h>
#include "ml.h"
/*
 * Test driver: loads an HTML file, strips its markup with strip_html()
 * and prints the resulting plain text on stdout.
 *
 * The file to convert is taken from argv[1] when one is given; with no
 * argument the historical hard-coded path "/u/max/foo" is used.
 *
 * Returns 0 on success, 1 when the file could not be loaded or when
 * strip_html() returned NULL.
 */
int main(int argc, char **argv, char **envp)
{
    char *path = (argc > 1) ? argv[1] : "/u/max/foo";
    Binary_Buffer *b;
    char *ret;

    (void) envp;    /* unused */

    /* load_binary_file() returns NULL on any failure; do not dereference */
    b = load_binary_file(path);
    if (b == NULL) {
        fprintf(stderr, "cannot load %s\n", path);
        return 1;
    }

    /* strip_html() returns NULL for empty input; "%s" with NULL is undefined */
    ret = strip_html((char *) b->data);
    if (ret == NULL) {
        fprintf(stderr, "strip_html produced no output for %s\n", path);
        return 1;
    }

    printf("%s\n", ret);
    return 0;
}
/* Allocation routine used by the functions below; mapped straight onto malloc here. */
#define fs_get malloc
TagPair HTMLtags[] = {
"<P>", "\n",
"<BR>", "\n",
"<LI>", "\n\t",
"<UL>", "\n",
"<TR>", "\n",
"</BLOCKQUOTE>", "\n-----\n",
NULL, NULL
};
TagPair AMPtags[] = {
" ", " ",
"&", "&",
"<", "<",
">", ">",
"©", "\251",
NULL, NULL
};
/*
 * strip_html - produce a plain-text rendering of an HTML string.
 *
 *   src   NUL-terminated HTML text; it is read but not modified.
 *
 * Returns a newly allocated (fs_get) NUL-terminated string which the
 * caller is responsible for releasing, or NULL when src is NULL, src is
 * empty, or the allocation fails.
 *
 * Conversion rules:
 *   - newlines outside <PRE> become single spaces;
 *   - text between <PRE ...> and </PRE> is copied verbatim, newlines
 *     included;
 *   - tags listed in HTMLtags are replaced by their table value;
 *   - "<A ...>" keeps each HREF value, written as "<value>";
 *   - "<BLOCKQUOTE..." emits "\n-----\n" and "<P ..." emits a newline;
 *   - every other tag is dropped up to and including its '>';
 *   - entities listed in AMPtags are replaced by their table value,
 *     other well-formed "&name;" entities are dropped, and an '&' that
 *     does not start an entity is copied as-is.
 */
#ifdef __STDC__
char *strip_html(char *src)
#else
char *strip_html(src)
char *src;
#endif
{
    char *ret = NULL;
    char *sptr = src;
    char *rptr = NULL;
    char *eptr;
    int tptr;
    int preformatted = 0;

    if ((src == NULL) || (*src == NUL_TERM))
        return ret;

    /* no rule emits more characters than it consumes, so the source
       length plus FILEBUFFLEN of slack is enough for the output */
    rptr = ret = fs_get(strlen(src) + FILEBUFFLEN);
    if (ret == NULL)
        return NULL;

    while (*sptr != NUL_TERM) {
        /* preformatted text: copy everything untouched until </PRE>.
           This test must come before the newline rule, otherwise the
           line breaks that <PRE> exists to preserve would be lost. */
        if (preformatted) {
            if ((strncasecmp(sptr, "</PRE>", 6)) != STRMATCH) {
                *rptr++ = *sptr++;
                continue;
            }
            preformatted = 0;
            /* sptr is on the '<' of </PRE>; the tag code below skips it */
        }

        /* source line breaks carry no meaning outside <PRE> */
        if (*sptr == '\n') {
            *rptr++ = ' ';
            sptr++;
            continue;
        }

        if (*sptr == '&') {
            for (tptr = 0; AMPtags[tptr].tag != NULL; tptr++) {
                if ((strncasecmp(sptr,
                                 AMPtags[tptr].tag,
                                 strlen(AMPtags[tptr].tag))) == STRMATCH) {
                    strcpy(rptr, AMPtags[tptr].value);
                    rptr += strlen(AMPtags[tptr].value);
                    break;
                }
            }

            if (AMPtags[tptr].tag == NULL) {
                /* Not in the table.  Only treat it as an entity when it
                   has the shape "&name;" - a stray '&' in running text
                   must not swallow everything up to the next ';'. */
                eptr = sptr + 1;
                while (((*eptr >= 'a') && (*eptr <= 'z'))
                       || ((*eptr >= 'A') && (*eptr <= 'Z'))
                       || ((*eptr >= '0') && (*eptr <= '9'))
                       || (*eptr == '#'))
                    eptr++;
                if ((eptr == sptr + 1) || (*eptr != ';')) {
                    *rptr++ = *sptr++;      /* literal ampersand */
                    continue;
                }
            }

            /* advance to end of symbol */
            while ((*sptr != NUL_TERM) && (*sptr != ';'))
                sptr++;
            if (*sptr)
                sptr++;
            continue;
        }

        if (*sptr == '<') {
            for (tptr = 0; HTMLtags[tptr].tag != NULL; tptr++) {
                if ((strncasecmp(sptr,
                                 HTMLtags[tptr].tag,
                                 strlen(HTMLtags[tptr].tag))) == STRMATCH) {
                    strcpy(rptr, HTMLtags[tptr].value);
                    rptr += strlen(HTMLtags[tptr].value);
                    sptr++;     /* step past '<' so the special cases
                                   below cannot match this tag again */
                    break;
                }
            }

            /* special cases: */

            /* link - retain the href */
            if ((strncasecmp(sptr, "<A ", 3)) == STRMATCH) {
                while ((*sptr != NUL_TERM) && (*sptr != '>')) {
                    if ((strncasecmp(sptr, " HREF=", 6)) == STRMATCH) {
                        sptr += 6;
                        *rptr++ = '<';
                        while ((*sptr != NUL_TERM)
                               && (*sptr != ' ')
                               && (*sptr != '>'))
                            *rptr++ = *sptr++;
                        *rptr++ = '>';
                    } else {
                        sptr++;
                    }
                }
                if (*sptr)
                    sptr++;
                continue;       /* tag end already consumed */
            }

            /* blockquote - pretty it up a bit */
            if ((strncasecmp(sptr, "<BLOCKQUOTE", 11)) == STRMATCH) {
                strcpy(rptr, "\n-----\n");
                rptr += 7;
            }

            /* paragraph tags with args, very common */
            if ((strncasecmp(sptr, "<P ", 3)) == STRMATCH)
                *rptr++ = '\n';

            /* preformatted text - flag it for special handling */
            if (((strncasecmp(sptr, "<PRE>", 5)) == STRMATCH)
                || ((strncasecmp(sptr, "<PRE ", 5)) == STRMATCH))
                preformatted = 1;

            /* every path above, and any unrecognized tag, ends here:
               discard the remainder of the tag */
            while ((*sptr != NUL_TERM) && (*sptr != '>'))
                sptr++;
            if (*sptr)
                sptr++;
            continue;
        }

        /* ordinary character */
        *rptr++ = *sptr++;
    }

    *rptr = NUL_TERM;
    return ret;
}
/*
 * load_binary_file - read an entire file into memory.
 *
 *   filename   path of the file to read.
 *
 * Returns a newly allocated Binary_Buffer whose `length` is the file size
 * reported by stat() and whose `data` holds the file contents followed by
 * one extra NUL_TERM byte, so text content can be used as a C string.
 *
 * Returns NULL when filename is NULL, the file cannot be stat'ed or
 * opened, the file is empty, an allocation fails, or fewer bytes than
 * the reported size could be read.  Nothing is leaked on failure.
 *
 * Both the buffer and its data are obtained with fs_get, which this file
 * maps onto malloc, so they are released here with free().
 */
#ifdef __STDC__
Binary_Buffer *load_binary_file(char *filename)
#else
Binary_Buffer *load_binary_file(filename)
char *filename;
#endif
{
    FILE *fp;
    Binary_Buffer *binary_buffer = NULL;
    struct stat st;
    unsigned long want;
    unsigned long bytes_read = 0L;
    unsigned long bytes_total = 0L;
    char *ptr;

    if (filename == NULL)
        return NULL;

    if ((stat(filename, &st) != SYSCALL_SUCCESS) || (st.st_size == 0))
        return NULL;

    /* "rb": the contents are raw bytes and the byte count must match
       st_size, so no text-mode translation may take place */
    if ((fp = fopen(filename, "rb")) == NULL)
        return NULL;

    want = (unsigned long) st.st_size;

    binary_buffer = (Binary_Buffer *) fs_get(sizeof(Binary_Buffer));
    if (binary_buffer == NULL) {
        fclose(fp);
        return NULL;
    }

    /* one extra byte for the trailing NUL_TERM */
    binary_buffer->data = (unsigned char *) fs_get(want + 1);
    if (binary_buffer->data == NULL) {
        free(binary_buffer);
        fclose(fp);
        return NULL;
    }
    binary_buffer->length = want;

    /* fread may return short counts; keep going until the whole file is
       in, or until a read yields nothing (EOF or error) */
    ptr = (char *) binary_buffer->data;
    do {
        bytes_read = fread(ptr, 1, want - bytes_total, fp);
        bytes_total += bytes_read;
        ptr += bytes_read;
    } while ((bytes_read != 0) && (bytes_total < want));
    fclose(fp);

    if (bytes_total != want) {
        /* short read: hand back nothing rather than a partial buffer */
        free(binary_buffer->data);
        free(binary_buffer);
        return NULL;
    }

    binary_buffer->data[want] = NUL_TERM;
    return binary_buffer;
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */