/*
                                  NETWOX
                             Network toolbox
                Copyright(c) 1999-2006 Laurent Constantin
                                  -----

  Main server    : http://www.laurentconstantin.com/
  Backup servers : http://go.to/laurentconstantin/
                   http://laurentconstantin.est-la.com/
                   http://laurentconstantin.free.fr/
                   http://membres.lycos.fr/lauconstantin/
  [my current email address is on the web servers]

                                  -----
  This file is part of Netwox.

  Netwox is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License version 2
  as published by the Free Software Foundation.

  Netwox is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details (http://www.gnu.org/).

------------------------------------------------------------------------
*/

/*-------------------------------------------------------------*/
#include "../../netwox.h"

/*-------------------------------------------------------------*/
/* Allocate and initialise one URL-list item.

   supportreplacing : stored in the item; when true, the suppresstag and
                      suppressattribute flags are cleared and the
                      replacingvalue buffer is initialised as well.
   ppitem           : receives the newly allocated item.

   Returns NETWIB_ERR_OK on success. On failure, everything acquired so
   far is released before the error is returned, so nothing is leaked;
   the content of *ppitem must not be used in that case.
*/
netwib_err netwox_htmlfile_urllist_item_create(netwib_bool supportreplacing,
                                               netwib_ptr *ppitem)
{
  netwox_htmlfile_urllist_item *pitem;
  netwib_err ret;

  netwib_er(netwib_ptr_malloc(sizeof(netwox_htmlfile_urllist_item), ppitem));
  pitem = *ppitem;

  ret = netwib_buf_init_mallocdefault(&pitem->value);
  if (ret != NETWIB_ERR_OK) {
    /* best-effort cleanup: report the original failure */
    (void)netwib_ptr_free(ppitem);
    return(ret);
  }

  pitem->supportreplacing = supportreplacing;
  if (supportreplacing) {
    pitem->suppresstag = NETWIB_FALSE;
    pitem->suppressattribute = NETWIB_FALSE;
    ret = netwib_buf_init_mallocdefault(&pitem->replacingvalue);
    if (ret != NETWIB_ERR_OK) {
      /* undo the two successful acquisitions, in reverse order */
      (void)netwib_buf_close(&pitem->value);
      (void)netwib_ptr_free(ppitem);
      return(ret);
    }
  }

  return(NETWIB_ERR_OK);
}

/*-------------------------------------------------------------*/
/* Release one URL-list item created by
   netwox_htmlfile_urllist_item_create.

   The value buffer is closed, then the replacingvalue buffer (only when
   the item was created with supportreplacing set), and finally the item
   structure itself is freed.
*/
netwib_err netwox_htmlfile_urllist_item_erase(netwib_ptr pitem)
{
  netwox_htmlfile_urllist_item *purlitem;

  purlitem = (netwox_htmlfile_urllist_item *)pitem;

  netwib_er(netwib_buf_close(&purlitem->value));
  if (purlitem->supportreplacing) {
    /* this buffer only exists for items supporting replacement */
    netwib_er(netwib_buf_close(&purlitem->replacingvalue));
  }
  netwib_er(netwib_ptr_free(&pitem));

  return(NETWIB_ERR_OK);
}
/*-------------------------------------------------------------*/ netwib_err netwox_htmlfile_urllist_ring_display(netwib_ring *pring) { netwib_ring_index *pringindex; netwox_htmlfile_urllist_item *pitem; netwib_err ret; netwib_er(netwib_ring_index_init(pring, &pringindex)); ret = NETWIB_ERR_OK; while(NETWIB_TRUE) { ret = netwib_ring_index_next(pringindex, (netwib_ptr*)&pitem); if (ret != NETWIB_ERR_OK) { if (ret == NETWIB_ERR_DATAEND) ret = NETWIB_ERR_OK; break; } netwib_er(netwib_fmt_display("%{uint32} [%{uint32}-%{uint32}] [%{uint32}-%{uint32}] [%{uint32}-%{uint32}] : %{buf}\n", pitem->elmtattr, pitem->tagbegin, pitem->tagend, pitem->attributebegin, pitem->attributeend, pitem->valuebegin, pitem->valueend, &pitem->value)); if (pitem->supportreplacing) { netwib_er(netwib_fmt_display(" tag=%{bool:t} att=%{bool:t} val=%{buf}\n", pitem->suppresstag, pitem->suppressattribute, &pitem->replacingvalue)); } } netwib_er(netwib_ring_index_close(&pringindex)); return(ret); } /*-------------------------------------------------------------*/ typedef struct { netwib_ring *pring; netwib_bool supportreplacing; netwib_buf buf; netwib_uint32 fileoffset; } netwox_htmlfile_ctx; /*-------------------------------------------------------------*/ static netwib_err netwox_htmlfile_attribute(netwox_htmlfile_ctx *pctx, netwib_constbuf *ptag, netwox_htmltag_elmt elmt, netwib_uint32 elementendoffset) { netwib_buf att, val; netwib_data data, datasave, pcattbegin, pcattend, pcvalbegin, pcvalend; netwib_data pcfullvalbegin, pcfullvalend; netwib_uint32 datasize; netwox_htmltag_elmtattr elmtattr; netwox_htmlfile_urllist_item *pitem; netwib_byte delim; netwib_bool cansaverefresh; /* obtain pointers */ data = netwib__buf_ref_data_ptr(ptag); datasave = data; datasize = netwib__buf_ref_data_size(ptag); /* skip element */ data += elementendoffset; datasize -= elementendoffset; /* decode "att=val att="val" att= 'val' att" */ #define netwox__attribute_skip_char() { data++; datasize--; if (datasize == 0) 
return(NETWIB_ERR_DATANOTAVAIL); } #define netwox__attribute_skip_spaces() { while(netwib_c2_isspace(*data)) { netwox__attribute_skip_char() } } #define netwox__attribute_check_gt() { if (*data == '>') break; } cansaverefresh = NETWIB_FALSE; while(NETWIB_TRUE) { netwox__attribute_skip_spaces(); netwox__attribute_check_gt(); /* skip attribute name */ pcattbegin = data; while(!netwib_c2_isspace(*data) && *data != '=' && *data != '>') { netwox__attribute_skip_char(); } pcattend = data; netwox__attribute_skip_spaces(); netwox__attribute_check_gt(); /* check for '=', then value */ if (*data == '=') { netwox__attribute_skip_char(); netwox__attribute_skip_spaces(); netwox__attribute_check_gt(); /* no value : ignore */ /* check for value */ pcfullvalbegin = data; if (*data == '\'' || *data == '"') { delim = *data; netwox__attribute_skip_char(); pcvalbegin = data; while(*data != delim) { netwox__attribute_skip_char(); } pcvalend = data; netwox__attribute_skip_char(); } else { pcvalbegin = data; while(!netwib_c2_isspace(*data) && *data != '>') { netwox__attribute_skip_char(); } pcvalend = data; } pcfullvalend = data; /* now, check if this attribute is interesting */ netwib_er(netwib_buf_init_ext_arrayfilled(pcattbegin, pcattend - pcattbegin, &att)); netwib_er(netwox_htmltag_elmtattr_init_buf(elmt, &att, &elmtattr)); if (elmtattr == NETWOX_HTMLTAG_ELMTATTR_META_REFRESHCONTENT) { /* We save content only if http-equiv was seen : */ if (!cansaverefresh) { /* setting to unknown will not save it below */ elmtattr = NETWOX_HTMLTAG_ELMTATTR_UNKNOWN; } } if (elmtattr == NETWOX_HTMLTAG_ELMTATTR_META_HTTPEQUIV) { if (pcvalend - pcvalbegin == 7) { if (!netwib_c_memcasecmp(pcvalbegin, (netwib_constdata)"refresh", 7)) { /* http-equiv was seen, and contain "refresh" */ cansaverefresh = NETWIB_TRUE; } } } else if (elmtattr != NETWOX_HTMLTAG_ELMTATTR_UNKNOWN) { /* add it in the ring */ netwib_er(netwox_htmlfile_urllist_item_create(pctx->supportreplacing, (netwib_ptr*)&pitem)); 
netwib_er(netwib_buf_init_ext_arrayfilled(pcvalbegin, pcvalend - pcvalbegin, &val)); #define NETWOX_HTMLFILE_COMPATIBILITY_TRICK 1 #if NETWOX_HTMLFILE_COMPATIBILITY_TRICK == 1 /* Now, be compatible with Internet Explorer and Mozilla, but not with HTML specification. If HTML contains for example, the leading and ending spaces, tabulation and newlines are ignored. */ { netwox_htmltag_elmtattr_uriformat uriformat; netwib_data valdata; netwib_uint32 valdatasize, i; netwib_er(netwox_htmltag_uriformat_init_elmtattr(elmtattr, &uriformat)); if (uriformat == NETWOX_HTMLTAG_ELMTATTR_URIFORMAT_ONE) { valdata = netwib__buf_ref_data_ptr(&val); valdatasize = netwib__buf_ref_data_size(&val); for (i = 0; i < valdatasize; i++) { if (valdata[i] != ' ' && valdata[i] != '\t' && valdata[i] != '\r' && valdata[i] != '\n' ) { break; } } if (i == valdatasize) { val.endoffset = val.beginoffset; } else { val.beginoffset += i; for (i = valdatasize-1; i != 0/* stop one before is ok */; i--) { if (valdata[i] != ' ' && valdata[i] != '\t' && valdata[i] != '\r' && valdata[i] != '\n' ) { break; } } val.endoffset -= valdatasize - i - 1; } } } #endif netwib_er(netwox_html_data_decode_best(&val, &pitem->value)); pitem->elmtattr = elmtattr; pitem->tagbegin = pctx->fileoffset; pitem->tagend = pctx->fileoffset + netwib__buf_ref_data_size(ptag); pitem->attributebegin = pctx->fileoffset + pcattbegin - datasave; pitem->attributeend = pctx->fileoffset + pcattend - datasave; pitem->valuebegin = pctx->fileoffset + pcfullvalbegin - datasave; pitem->valueend = pctx->fileoffset + pcfullvalend - datasave; netwib_er(netwib_ring_add_last(pctx->pring, (netwib_ptr)pitem)); } } } /* skip entire TAG */ pctx->fileoffset += netwib__buf_ref_data_size(ptag); pctx->buf.beginoffset += netwib__buf_ref_data_size(ptag); return(NETWIB_ERR_OK); } /*-------------------------------------------------------------*/ static netwib_err netwox_htmlfile_tag_end(netwox_htmlfile_ctx *pctx, netwib_uint32 *pelementendoffset, netwib_uint32 
*ptagendoffset) { netwib_data data, datasave; netwib_uint32 datasize; netwib_byte delim; data = netwib__buf_ref_data_ptr(&pctx->buf); datasave = data; datasize = netwib__buf_ref_data_size(&pctx->buf); #define netwox__tag_end_skip_char() { data++; datasize--; if (datasize == 0) return(NETWIB_ERR_DATANOTAVAIL); } #define netwox__tag_end_skip_spaces() { while(netwib_c2_isspace(*data)) { netwox__tag_end_skip_char() } } #define netwox__tag_end_check_gt() { if (*data == '>') { *ptagendoffset = data - datasave + 1; return(NETWIB_ERR_OK); } } /* find element end */ netwox__tag_end_skip_char(); /* leading '<' */ while(!netwib_c2_isspace(*data) && *data != '>') { netwox__tag_end_skip_char(); } *pelementendoffset = data - datasave; /* find tag end */ while(NETWIB_TRUE) { netwox__tag_end_skip_spaces(); netwox__tag_end_check_gt(); /* skip attribute name */ while(!netwib_c2_isspace(*data) && *data != '=' && *data != '>') { netwox__tag_end_skip_char(); } netwox__tag_end_skip_spaces(); netwox__tag_end_check_gt(); /* check for '=', then value */ if (*data == '=') { netwox__tag_end_skip_char(); netwox__tag_end_skip_spaces(); netwox__tag_end_check_gt(); /* no value : ignore */ /* check for value */ if (*data == '\'' || *data == '"') { delim = *data; netwox__tag_end_skip_char(); while(*data != delim) { netwox__tag_end_skip_char(); } netwox__tag_end_skip_char(); } else { while(!netwib_c2_isspace(*data) && *data != '>') { netwox__tag_end_skip_char(); } } } } return(NETWIB_ERR_DATANOTAVAIL); } /*-------------------------------------------------------------*/ static netwib_err netwox_htmlfile_tag(netwox_htmlfile_ctx *pctx) { netwib_buf element, tag; netwib_data data, pc; netwib_uint32 datasize, tmpsize, elementendoffset, tagendoffset; netwox_htmltag_elmt elmt; datasize = netwib__buf_ref_data_size(&pctx->buf); if (datasize < 3) { /* no sufficient data because smallest chunk is "" */ return(NETWIB_ERR_DATANOTAVAIL); } data = netwib__buf_ref_data_ptr(&pctx->buf); /* The second char determine 
tag type : <[a-Z] : starting tag ' (ex : ), so skipping 1 char ("<") will not perturb the searching tag algorithm. - a comment can contain :