/*
NETWOX
Network toolbox
Copyright(c) 1999-2006 Laurent Constantin
-----
Main server : http://www.laurentconstantin.com/
Backup servers : http://go.to/laurentconstantin/
http://laurentconstantin.est-la.com/
http://laurentconstantin.free.fr/
http://membres.lycos.fr/lauconstantin/
[my current email address is on the web servers]
-----
This file is part of Netwox.
Netwox is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
Netwox is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
General Public License for more details (http://www.gnu.org/).
------------------------------------------------------------------------
*/
/*-------------------------------------------------------------*/
#include "../../netwox.h"
/*-------------------------------------------------------------*/
/*
  Allocate one netwox_htmlfile_urllist_item and initialize its buffers.
  Parameters :
    supportreplacing : stored in the item; when true, suppresstag and
                       suppressattribute are set to NETWIB_FALSE and the
                       replacingvalue buffer is initialized
    ppitem : receives the address of the allocated item
  Returns :
    NETWIB_ERR_OK on success.
    On failure, the error of the failing call. Everything acquired so far
    is released before returning, so *ppitem must not be used by the
    caller in that case.
  Note : the other fields of the item are not initialized here.
*/
netwib_err netwox_htmlfile_urllist_item_create(netwib_bool supportreplacing,
                                               netwib_ptr *ppitem)
{
  netwox_htmlfile_urllist_item *pitem;
  netwib_err ret;

  netwib_er(netwib_ptr_malloc(sizeof(netwox_htmlfile_urllist_item), ppitem));
  pitem = *ppitem;

  ret = netwib_buf_init_mallocdefault(&pitem->value);
  if (ret != NETWIB_ERR_OK) {
    /* best effort cleanup : the original error is the one reported */
    (void)netwib_ptr_free(ppitem);
    return(ret);
  }

  pitem->supportreplacing = supportreplacing;
  if (supportreplacing) {
    pitem->suppresstag = NETWIB_FALSE;
    pitem->suppressattribute = NETWIB_FALSE;
    ret = netwib_buf_init_mallocdefault(&pitem->replacingvalue);
    if (ret != NETWIB_ERR_OK) {
      /* undo the first buffer and the item itself, in reverse order */
      (void)netwib_buf_close(&pitem->value);
      (void)netwib_ptr_free(ppitem);
      return(ret);
    }
  }

  return(NETWIB_ERR_OK);
}
/*-------------------------------------------------------------*/
/*
  Release one URL-list item : close its buffers, then free the item.
  Parameter :
    pitem : generic pointer to a netwox_htmlfile_urllist_item
  Returns :
    NETWIB_ERR_OK, or the error of the first failing call.
*/
netwib_err netwox_htmlfile_urllist_item_erase(netwib_ptr pitem)
{
  netwox_htmlfile_urllist_item *pentry = (netwox_htmlfile_urllist_item *)pitem;

  netwib_er(netwib_buf_close(&pentry->value));

  /* replacingvalue is only closed for items flagged supportreplacing */
  if (pentry->supportreplacing) {
    netwib_er(netwib_buf_close(&pentry->replacingvalue));
  }

  netwib_er(netwib_ptr_free(&pitem));

  return(NETWIB_ERR_OK);
}
/*-------------------------------------------------------------*/
/*
  Display every item of a URL-list ring, one line per item, plus a second
  line with the replacement fields for items flagged supportreplacing.
  Parameter :
    pring : ring whose items are netwox_htmlfile_urllist_item
  Returns :
    NETWIB_ERR_OK when the whole ring was displayed, otherwise the error
    of the failing iteration or display call.
  The ring index is closed on every path taken after its creation,
  including display failures.
*/
netwib_err netwox_htmlfile_urllist_ring_display(netwib_ring *pring)
{
  netwib_ring_index *pringindex;
  netwox_htmlfile_urllist_item *pitem;
  netwib_err ret;

  netwib_er(netwib_ring_index_init(pring, &pringindex));

  while(NETWIB_TRUE) {
    ret = netwib_ring_index_next(pringindex, (netwib_ptr*)&pitem);
    if (ret != NETWIB_ERR_OK) {
      /* reaching the end of the ring is the normal way out */
      if (ret == NETWIB_ERR_DATAEND) ret = NETWIB_ERR_OK;
      break;
    }
    /* on display error, leave through the common exit so that the
       index is closed instead of being leaked by an early return */
    ret = netwib_fmt_display("%{uint32} [%{uint32}-%{uint32}] [%{uint32}-%{uint32}] [%{uint32}-%{uint32}] : %{buf}\n", pitem->elmtattr, pitem->tagbegin, pitem->tagend, pitem->attributebegin, pitem->attributeend, pitem->valuebegin, pitem->valueend, &pitem->value);
    if (ret != NETWIB_ERR_OK) {
      break;
    }
    if (pitem->supportreplacing) {
      ret = netwib_fmt_display(" tag=%{bool:t} att=%{bool:t} val=%{buf}\n", pitem->suppresstag, pitem->suppressattribute, &pitem->replacingvalue);
      if (ret != NETWIB_ERR_OK) {
        break;
      }
    }
  }

  netwib_er(netwib_ring_index_close(&pringindex));
  return(ret);
}
/*-------------------------------------------------------------*/
/* Parsing state passed to the static helpers of this file. */
typedef struct {
/* ring to which netwox_htmlfile_attribute appends the items it creates */
netwib_ring *pring;
/* forwarded to netwox_htmlfile_urllist_item_create for each new item */
netwib_bool supportreplacing;
/* data being parsed : tags are scanned from its beginning, and its
   beginoffset is advanced past each tag once handled */
netwib_buf buf;
/* added to in-tag positions to compute the positions stored in items,
   and advanced by the tag size after each tag (presumably the file
   offset of the first byte of buf - to be confirmed against the caller) */
netwib_uint32 fileoffset;
} netwox_htmlfile_ctx;
/*-------------------------------------------------------------*/
/*
  Scan the attributes of one tag, and append to pctx->pring one item for
  each attribute recognized by netwox_htmltag_elmtattr_init_buf for this
  element.
  Parameters :
    pctx : parsing context. Its ring receives the items. On success, its
           fileoffset and buf.beginoffset are advanced by the size of ptag.
    ptag : buffer containing the tag to scan
    elmt : element type, used to classify each attribute name
    elementendoffset : number of leading bytes of ptag to skip before
                       looking for attributes
  Returns :
    NETWIB_ERR_OK when a '>' ending the scan was found.
    NETWIB_ERR_DATANOTAVAIL when the last byte of ptag is consumed
    before such a '>' is met.
    Otherwise, the error of a failing called function.
  NOTE(review) : if netwib_er returns early on error (as its use here
  suggests), an item created below is not released when a later call
  fails before netwib_ring_add_last - to be confirmed.
*/
static netwib_err netwox_htmlfile_attribute(netwox_htmlfile_ctx *pctx,
netwib_constbuf *ptag,
netwox_htmltag_elmt elmt,
netwib_uint32 elementendoffset)
{
netwib_buf att, val;
netwib_data data, datasave, pcattbegin, pcattend, pcvalbegin, pcvalend;
netwib_data pcfullvalbegin, pcfullvalend;
netwib_uint32 datasize;
netwox_htmltag_elmtattr elmtattr;
netwox_htmlfile_urllist_item *pitem;
netwib_byte delim;
netwib_bool cansaverefresh;
/* obtain pointers (datasave keeps the tag start, to compute offsets) */
data = netwib__buf_ref_data_ptr(ptag);
datasave = data;
datasize = netwib__buf_ref_data_size(ptag);
/* skip element */
data += elementendoffset;
datasize -= elementendoffset;
/* decode "att=val att="val" att= 'val' att" */
/* skip_char leaves the function with NETWIB_ERR_DATANOTAVAIL as soon as
   no byte remains ; check_gt uses break, which exits the main loop below */
#define netwox__attribute_skip_char() { data++; datasize--; if (datasize == 0) return(NETWIB_ERR_DATANOTAVAIL); }
#define netwox__attribute_skip_spaces() { while(netwib_c2_isspace(*data)) { netwox__attribute_skip_char() } }
#define netwox__attribute_check_gt() { if (*data == '>') break; }
/* becomes true once an http-equiv attribute equal to "refresh" is seen */
cansaverefresh = NETWIB_FALSE;
while(NETWIB_TRUE) {
netwox__attribute_skip_spaces();
netwox__attribute_check_gt();
/* skip attribute name */
pcattbegin = data;
while(!netwib_c2_isspace(*data) && *data != '=' && *data != '>') {
netwox__attribute_skip_char();
}
pcattend = data;
netwox__attribute_skip_spaces();
netwox__attribute_check_gt();
/* check for '=', then value (an attribute without '=' is just skipped) */
if (*data == '=') {
netwox__attribute_skip_char();
netwox__attribute_skip_spaces();
netwox__attribute_check_gt(); /* no value : ignore */
/* check for value :
   pcfullvalbegin/pcfullvalend include the quotes, if any
   pcvalbegin/pcvalend delimit the value without its quotes */
pcfullvalbegin = data;
if (*data == '\'' || *data == '"') {
/* quoted value : ends at the next occurrence of the same quote */
delim = *data;
netwox__attribute_skip_char();
pcvalbegin = data;
while(*data != delim) {
netwox__attribute_skip_char();
}
pcvalend = data;
netwox__attribute_skip_char();
} else {
/* unquoted value : ends at the first space or '>' */
pcvalbegin = data;
while(!netwib_c2_isspace(*data) && *data != '>') {
netwox__attribute_skip_char();
}
pcvalend = data;
}
pcfullvalend = data;
/* now, check if this attribute is interesting */
netwib_er(netwib_buf_init_ext_arrayfilled(pcattbegin,
pcattend - pcattbegin, &att));
netwib_er(netwox_htmltag_elmtattr_init_buf(elmt, &att, &elmtattr));
if (elmtattr == NETWOX_HTMLTAG_ELMTATTR_META_REFRESHCONTENT) {
/* This attribute is saved only if an http-equiv attribute containing
   "refresh" was seen before it in this tag (the example markup which
   followed this sentence is missing from the original comment).
*/
if (!cansaverefresh) {
/* setting to unknown will not save it below */
elmtattr = NETWOX_HTMLTAG_ELMTATTR_UNKNOWN;
}
}
if (elmtattr == NETWOX_HTMLTAG_ELMTATTR_META_HTTPEQUIV) {
/* the value must be exactly "refresh" (7 bytes, case insensitive) */
if (pcvalend - pcvalbegin == 7) {
if (!netwib_c_memcasecmp(pcvalbegin,
(netwib_constdata)"refresh", 7)) {
/* http-equiv was seen, and contain "refresh" */
cansaverefresh = NETWIB_TRUE;
}
}
} else if (elmtattr != NETWOX_HTMLTAG_ELMTATTR_UNKNOWN) {
/* add it in the ring */
netwib_er(netwox_htmlfile_urllist_item_create(pctx->supportreplacing,
(netwib_ptr*)&pitem));
netwib_er(netwib_buf_init_ext_arrayfilled(pcvalbegin,
pcvalend - pcvalbegin,
&val));
#define NETWOX_HTMLFILE_COMPATIBILITY_TRICK 1
#if NETWOX_HTMLFILE_COMPATIBILITY_TRICK == 1
/* Now, be compatible with Internet Explorer and Mozilla, but not
   with HTML specification.
   If the attribute value starts or ends with spaces, tabulations,
   carriage returns or newlines, they are ignored (the example markup
   is missing from the original comment).
   This is only done for attributes whose uriformat is
   NETWOX_HTMLTAG_ELMTATTR_URIFORMAT_ONE. */
{
netwox_htmltag_elmtattr_uriformat uriformat;
netwib_data valdata;
netwib_uint32 valdatasize, i;
netwib_er(netwox_htmltag_uriformat_init_elmtattr(elmtattr,
&uriformat));
if (uriformat == NETWOX_HTMLTAG_ELMTATTR_URIFORMAT_ONE) {
valdata = netwib__buf_ref_data_ptr(&val);
valdatasize = netwib__buf_ref_data_size(&val);
/* i = number of leading blank bytes */
for (i = 0; i < valdatasize; i++) {
if (valdata[i] != ' ' && valdata[i] != '\t' &&
valdata[i] != '\r' && valdata[i] != '\n' ) {
break;
}
}
if (i == valdatasize) {
/* only blanks (or empty) : the value becomes empty */
val.endoffset = val.beginoffset;
} else {
val.beginoffset += i;
/* i = index of the last non blank byte. Index 0 needs no test :
   reaching it means all following bytes are blank, and since the
   value is not entirely blank, byte 0 is then not blank. */
for (i = valdatasize-1; i != 0/* stop one before is ok */; i--) {
if (valdata[i] != ' ' && valdata[i] != '\t' &&
valdata[i] != '\r' && valdata[i] != '\n' ) {
break;
}
}
/* remove the blank bytes located after index i */
val.endoffset -= valdatasize - i - 1;
}
}
}
#endif
/* store the decoded value, then the positions : pointers into the tag
   are converted to offsets from datasave and shifted by fileoffset */
netwib_er(netwox_html_data_decode_best(&val, &pitem->value));
pitem->elmtattr = elmtattr;
pitem->tagbegin = pctx->fileoffset;
pitem->tagend = pctx->fileoffset + netwib__buf_ref_data_size(ptag);
pitem->attributebegin = pctx->fileoffset + pcattbegin - datasave;
pitem->attributeend = pctx->fileoffset + pcattend - datasave;
pitem->valuebegin = pctx->fileoffset + pcfullvalbegin - datasave;
pitem->valueend = pctx->fileoffset + pcfullvalend - datasave;
netwib_er(netwib_ring_add_last(pctx->pring, (netwib_ptr)pitem));
}
}
}
/* skip entire TAG */
pctx->fileoffset += netwib__buf_ref_data_size(ptag);
pctx->buf.beginoffset += netwib__buf_ref_data_size(ptag);
return(NETWIB_ERR_OK);
}
/*-------------------------------------------------------------*/
/*
  Find where the element name ends, and where the tag ends, for the tag
  located at the beginning of the data of pctx->buf.
  Parameters :
    pctx : parsing context ; only its buf is read here
    pelementendoffset : receives the offset (from the beginning of the
                        data) of the first space or '>' found after the
                        leading character
    ptagendoffset : receives the offset of the byte following the '>'
                    which closes the tag
  Returns :
    NETWIB_ERR_OK when the closing '>' was found. A '>' located inside a
    quoted attribute value does not close the tag.
    NETWIB_ERR_DATANOTAVAIL when the last byte of the data is consumed
    before the closing '>' is met. In this case, *pelementendoffset may
    already have been set.
*/
static netwib_err netwox_htmlfile_tag_end(netwox_htmlfile_ctx *pctx,
netwib_uint32 *pelementendoffset,
netwib_uint32 *ptagendoffset)
{
netwib_data data, datasave;
netwib_uint32 datasize;
netwib_byte delim;
/* datasave keeps the beginning of data, to compute offsets */
data = netwib__buf_ref_data_ptr(&pctx->buf);
datasave = data;
datasize = netwib__buf_ref_data_size(&pctx->buf);
/* skip_char returns NETWIB_ERR_DATANOTAVAIL as soon as no byte remains ;
   check_gt stores the tag end offset and returns NETWIB_ERR_OK on '>' */
#define netwox__tag_end_skip_char() { data++; datasize--; if (datasize == 0) return(NETWIB_ERR_DATANOTAVAIL); }
#define netwox__tag_end_skip_spaces() { while(netwib_c2_isspace(*data)) { netwox__tag_end_skip_char() } }
#define netwox__tag_end_check_gt() { if (*data == '>') { *ptagendoffset = data - datasave + 1; return(NETWIB_ERR_OK); } }
/* find element end */
netwox__tag_end_skip_char(); /* leading '<' */
while(!netwib_c2_isspace(*data) && *data != '>') {
netwox__tag_end_skip_char();
}
*pelementendoffset = data - datasave;
/* find tag end : this loop is only left through the returns hidden
   in the macros */
while(NETWIB_TRUE) {
netwox__tag_end_skip_spaces();
netwox__tag_end_check_gt();
/* skip attribute name */
while(!netwib_c2_isspace(*data) && *data != '=' && *data != '>') {
netwox__tag_end_skip_char();
}
netwox__tag_end_skip_spaces();
netwox__tag_end_check_gt();
/* check for '=', then value */
if (*data == '=') {
netwox__tag_end_skip_char();
netwox__tag_end_skip_spaces();
netwox__tag_end_check_gt(); /* no value : ignore */
/* check for value */
if (*data == '\'' || *data == '"') {
/* quoted value : skip up to the matching quote, ignoring any '>' */
delim = *data;
netwox__tag_end_skip_char();
while(*data != delim) {
netwox__tag_end_skip_char();
}
netwox__tag_end_skip_char();
} else {
/* unquoted value : ends at the first space or '>' */
while(!netwib_c2_isspace(*data) && *data != '>') {
netwox__tag_end_skip_char();
}
}
}
}
/* not reached : the loop above contains no break */
return(NETWIB_ERR_DATANOTAVAIL);
}
/*-------------------------------------------------------------*/
static netwib_err netwox_htmlfile_tag(netwox_htmlfile_ctx *pctx)
{
netwib_buf element, tag;
netwib_data data, pc;
netwib_uint32 datasize, tmpsize, elementendoffset, tagendoffset;
netwox_htmltag_elmt elmt;
datasize = netwib__buf_ref_data_size(&pctx->buf);
if (datasize < 3) {
/* no sufficient data because smallest chunk is "" */
return(NETWIB_ERR_DATANOTAVAIL);
}
data = netwib__buf_ref_data_ptr(&pctx->buf);
/* The second char determine tag type :
<[a-Z] : starting tag
: ending tag
'
(ex : ), so skipping 1 char ("<") will not
perturb the searching tag algorithm.
- a comment can contain :