/*	WebDownloader for X-Window
 *	Copyright (C) 1999-2002 Koshelev Maxim
 *	This Program is free but not GPL!!! You can't modify it
 *	without agreement with author. You can't distribute modified
 *	program but you can distribute unmodified program.
 *
 *	This program is distributed in the hope that it will be useful,
 *	but WITHOUT ANY WARRANTY; without even the implied warranty of
 *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 */

#include "html.h"
#include "locstr.h"
#include "var.h"
/* NOTE(review): this copy of the file carried six system #include
 * directives whose header names were lost. The three below are the ones
 * the code visible here needs; restore the remaining ones from the
 * original sources.
 */
#include <stdio.h>   /* printf, SEEK_CUR, NULL */
#include <strings.h> /* strcasecmp, index */
#include <ctype.h>   /* isspace */

/* One row of the tag/attribute table:
 *   tag   - tag name
 *   field - attribute name (an entry of HTML_FIELDS_NAMES) or NULL
 *   mod   - 0 or one of HTML_FIELDS_TYPES (0 == HF_TYPE_LINK)
 */
struct tHtmlTegTable{
	char *tag,*field;
	int mod;
};

/* Values stored in tHtmlTegTable::mod. */
enum HTML_FIELDS_TYPES{
	HF_TYPE_LINK,
	HF_TYPE_BASE,
	HF_TYPE_BASE_CLOSE,
	HF_TYPE_META
};

/* Indexes into HTML_FIELDS_NAMES; both lists must stay in the same order. */
enum HTML_FIELDS{
	HF_PROFILE,
	HF_STYLE,
	HF_BACKGROUND,
	HF_HREF,
	HF_SRC,
	HF_USEMAP,
	HF_LONGDESC,
	HF_LOWSRC,
	HF_CONTENT,
	HF_CITE,
	HF_DATA,
	HF_HT,
	HF_CODEBASE,
	HF_ACTION,
	HF_HTTP_EQUIV
};

/* Attribute names, indexed by HTML_FIELDS. */
char *HTML_FIELDS_NAMES[]={
	"profile",
	"style",
	"background",
	"href",
	"src",
	"usemap",
	"longdesc",
	"lowsrc",
	"content",
	"cite",
	"data",
	"ht",
	"codebase",
	"action",
	"http-equiv"
};

/* Table of (tag, attribute, mod) triples known to the parser.
 * Rows that are commented out (all of them "style" rows) are disabled.
 */
tHtmlTegTable HTML_TEGS[]={
	{"head",	HTML_FIELDS_NAMES[HF_PROFILE],	0},
	{"body",	HTML_FIELDS_NAMES[HF_BACKGROUND],	0},
//	{"body",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"table",	HTML_FIELDS_NAMES[HF_BACKGROUND],	0},
//	{"table",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"th",		HTML_FIELDS_NAMES[HF_BACKGROUND],	0},
//	{"th",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"td",		HTML_FIELDS_NAMES[HF_BACKGROUND],	0},
//	{"td",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"col",		HTML_FIELDS_NAMES[HF_BACKGROUND],	0},
//	{"col",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"thead",	HTML_FIELDS_NAMES[HF_BACKGROUND],	0},
//	{"thead",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"tfoot",	HTML_FIELDS_NAMES[HF_BACKGROUND],	0},
//	{"tfoot",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"tbody",	HTML_FIELDS_NAMES[HF_BACKGROUND],	0},
//	{"tbody",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"a",		HTML_FIELDS_NAMES[HF_HREF],	0},
//	{"a",		HTML_FIELDS_NAMES[HF_STYLE],	0},
//	{"address",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"area",	HTML_FIELDS_NAMES[HF_HREF],	0},
//	{"area",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"img",		HTML_FIELDS_NAMES[HF_SRC],	0},
	{"img",		HTML_FIELDS_NAMES[HF_LOWSRC],	0},
	{"img",		HTML_FIELDS_NAMES[HF_LONGDESC],	0},
//	{"img",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"img",		HTML_FIELDS_NAMES[HF_USEMAP],	0},
	{"link",	HTML_FIELDS_NAMES[HF_HREF],	0},
//	{"link",	HTML_FIELDS_NAMES[HF_STYLE],	0},
//	{"input",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"input",	HTML_FIELDS_NAMES[HF_SRC],	0},
	{"input",	HTML_FIELDS_NAMES[HF_USEMAP],	0},
	{"applet",	HTML_FIELDS_NAMES[HF_CODEBASE],	0},
//	{"applet",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"iframe",	HTML_FIELDS_NAMES[HF_LONGDESC],	0},
//	{"iframe",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"iframe",	HTML_FIELDS_NAMES[HF_SRC],	0},
	{"frame",	HTML_FIELDS_NAMES[HF_LONGDESC],	0},
//	{"frame",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"frame",	HTML_FIELDS_NAMES[HF_SRC],	0},
	{"sound",	HTML_FIELDS_NAMES[HF_SRC],	0},
//	{"sound",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"bgsound",	HTML_FIELDS_NAMES[HF_SRC],	0},
//	{"bgsound",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"script",	HTML_FIELDS_NAMES[HF_SRC],	0},
	{"embed",	HTML_FIELDS_NAMES[HF_SRC],	0},
//	{"embed",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"fig",		HTML_FIELDS_NAMES[HF_SRC],	0},
//	{"fig",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"overlay",	HTML_FIELDS_NAMES[HF_SRC],	0},
//	{"overlay",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"layer",	HTML_FIELDS_NAMES[HF_SRC],	0},
//	{"layer",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"ins",		HTML_FIELDS_NAMES[HF_CITE],	0},
//	{"ins",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"del",		HTML_FIELDS_NAMES[HF_CITE],	0},
//	{"del",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"q",		HTML_FIELDS_NAMES[HF_CITE],	0},
//	{"q",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	/* was misspelled "blockqute", which no real tag name can match */
	{"blockquote",	HTML_FIELDS_NAMES[HF_CITE],	0},
//	{"blockquote",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"span",	HTML_FIELDS_NAMES[HF_HREF],	0},
//	{"span",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"div",		HTML_FIELDS_NAMES[HF_HREF],	0},
//	{"div",		HTML_FIELDS_NAMES[HF_STYLE],	0},
//	{"object",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"object",	HTML_FIELDS_NAMES[HF_USEMAP],	0},
	{"object",	HTML_FIELDS_NAMES[HF_DATA],	0},
/*
	{"center",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"h1",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"h2",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"h3",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"h4",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"h5",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"h6",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"hr",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"isindex",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"p",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"pre",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"noscript",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"dir",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"dl",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"dt",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"dd",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"li",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"menu",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"ol",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"ul",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"caption",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"colgroup",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"optgroup",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"form",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"fieldset",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"button",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"legend",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"label",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"select",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"option",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"textarea",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"bdo",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"br",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"font",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"map",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"sub",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"sup",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"abbr",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"acronym",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"cite",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"code",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"dfn",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"em",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"samp",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"strong",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"var",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"b",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"big",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"i",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"s",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"small",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"strike",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"tt",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"u",		HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"csobj",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"noframes",	HTML_FIELDS_NAMES[HF_STYLE],	0},
	{"frameset",	HTML_FIELDS_NAMES[HF_STYLE],	0},
*/
	{"base",	HTML_FIELDS_NAMES[HF_HREF],	HF_TYPE_BASE},
	{"/base",	NULL,				HF_TYPE_BASE_CLOSE},
	{"meta",	HTML_FIELDS_NAMES[HF_HTTP_EQUIV],	HF_TYPE_META}
};

/* Number of rows in HTML_TEGS. */
const int HTML_TEGS_NUM=sizeof(HTML_TEGS)/sizeof(tHtmlTegTable);

/* Creates an empty field: no name, no value, 'saved' cleared. */
tHtmlTagField::tHtmlTagField(){
	name=value=NULL;
	saved=0;
}

/* Debug helper: prints name and value to stdout ("*NULL*" when unset). */
void tHtmlTagField::print(){
	printf("Name:\t%s\n",name==NULL?"*NULL*":name);
	printf("Value:\t%s\n",value==NULL?"*NULL*":value);
}

/* Releases name and value, which are owned as new[]-allocated strings. */
tHtmlTagField::~tHtmlTagField(){
	if (name) delete[] name;
	if (value) delete[] value;
}

/* Creates an empty tag: no name, no description, no field list. */
tHtmlTag::tHtmlTag(){
	descr=name=NULL;
	fields=NULL;
}

/* Debug helper: prints the tag name to stdout ("*NULL*" when unset). */
void tHtmlTag::print(){
	printf("Name:\t%s\n",name==NULL?"*NULL*":name);
}

/* Looks up a field of this tag by name, ignoring case.
 *   n - field name to search for
 * Returns the first matching field met while walking the list with
 * first()/prev(), or NULL when there is no match, no field list has been
 * built yet, or n is NULL.
 */
tHtmlTagField *tHtmlTag::find_field(const char *n){
	/* fields stays NULL until the parser has collected the attributes */
	if (fields==NULL || n==NULL) return NULL;
	tHtmlTagField *fld=(tHtmlTagField *)(fields->first());
	while(fld){
		/* a field inserted at end of input may have no name at all */
		if (fld->name && strcasecmp(fld->name,n)==0) return fld;
		fld=(tHtmlTagField *)(fields->prev());
	}
	return NULL;
}

/* Releases the name, the description and the field list. */
tHtmlTag::~tHtmlTag(){
	if (name) delete[] name;
	if (descr) delete[] descr;
	if (fields) delete(fields);
}

/********************************************************/

/* Creates a URL entry without a description. */
tHtmlUrl::tHtmlUrl(){
	descr=NULL;
}

/* Intentionally prints nothing. */
void tHtmlUrl::print(){
}

/* Releases the description string. */
tHtmlUrl::~tHtmlUrl(){
	if (descr) delete[] descr;
}

/* HTML is parsed under the following assumption:
 * NOTE(review): the text that followed this sentence appears to be missing
 * from this copy of the file.
 */

/* Re-reads text that has just been consumed from WL.
 *   len   - number of characters to return
 *   shift - extra characters consumed after the wanted text
 * Seeks back len+shift characters from the current position and reads len
 * characters into a new[]-allocated, NUL-terminated buffer (caller releases
 * it with delete[]). The buffer is zero-filled first so that a short read
 * still yields a terminated string.
 */
char *tHtmlParser::get_string_back(int len,int shift){
	WL->shift(-(len+shift),SEEK_CUR);
	char *temp=new char[len+1]();
	WL->read(temp,len);
	temp[len]=0;
	return temp;
}

/* Reads one word starting at the current position of WL.
 * The word ends at the first whitespace character or '>'; the delimiter is
 * left unread. Returns a new[]-allocated copy of the word, or NULL when the
 * input ends before a delimiter is seen.
 */
char *tHtmlParser::get_word(){
	int i=0;
	char p;
	while(WL->read(&p,sizeof(p))>0){
		/* cast: isspace() is undefined for negative char values */
		if (isspace(static_cast<unsigned char>(p)) || p=='>'){
			return(get_string_back(i,1));
		}
		i++;
	}
	return NULL;
}
char *tHtmlParser::get_word_o(int shift){ int i=shift; char p; while(WL->read(&p,sizeof(p))>0){ if (isspace(p) || p=='>'){ return(get_string_back(i,1)); }; i++; }; return NULL; }; char *tHtmlParser::get_word(int shift){ int i=shift; char p; while(WL->read(&p,sizeof(p))>0){ if (isspace(p) || p=='=' || p=='>'){ return(get_string_back(i,1)); }; i++; }; return NULL; }; /* compact_string(char *str) before exec: str="\naa\nbb\ncc" after execution: str="aabbcc" */ void tHtmlParser::compact_string(char *str){ char *a=str,*b; while (*a){ if (*a=='\n' || *a=='\r') break; a+=1; }; b=a; while (*a){ *a=*b; if (*a!='\n' && *a!='\r') b+=1; a+=1; }; *b=0; }; char *tHtmlParser::get_word_icommas(){ int i=1; char p; while(WL->read(&p,sizeof(p))>0){ if (p=='>'){ return(get_string_back(i,1)); }; i++; if (p=='\"'){ return(get_string_back(i,0)); }; }; return NULL; }; char *tHtmlParser::get_word_icommas2(){ int i=1; char p; while(WL->read(&p,sizeof(p))>0){ if (p=='>'){ return(get_string_back(i,1)); }; i++; if (p=='\''){ return(get_string_back(i,0)); }; }; return NULL; }; void tHtmlParser::get_fields(tHtmlTag *tag){ char p; tag->fields=new tQueue; tag->fields->init(0); while(WL->read(&p,sizeof(p)>0)){ if (p=='>') return; if (!isspace(p)){ tHtmlTagField *field=new tHtmlTagField; field->name=get_word(1); tag->fields->insert(field); int eqsign=0; while(WL->read(&p,sizeof(p)>0)){ if (p=='>') return; if (p=='=') eqsign=1; if (!isspace(p) && p!='='){ if (!eqsign){ WL->shift(-1,SEEK_CUR); break; }; switch(p){ case '\"':{ field->value=get_word_icommas(); break; }; case '\'':{ field->value=get_word_icommas2(); break; }; default: field->value=get_word_o(1); }; if (field->value) compact_string(field->value); break; }; }; }; }; }; char *tHtmlParser::extract_from_icommas(char *str){ if (str==NULL) return NULL; char *temp=str; char *end=NULL; switch (*temp){ case '\'':{ temp+=1; end=index(temp,'\''); break; }; case '\"':{ temp+=1; end=index(temp,'\"'); break; }; }; if (end) 
return(copy_string2(temp,end-temp)); return(copy_string(temp)); }; void tHtmlParser::get_tag_descr(tHtmlTag *tag){ if (strcasecmp(tag->name,"img")==0){ //get descr from ALT or TITLE field tHtmlTagField *fld=tag->find_field("alt"); if (!fld) fld=tag->find_field("title"); if (fld) tag->descr=copy_string(fld->value); }else if (strcasecmp(tag->name,"a")==0){ //get descr from content of tag char *str=new char[51]; char *p=str; while(p-str<50 && WL->read(p,sizeof(char))>0){ if (*p=='<'){ WL->shift(-1,SEEK_CUR); break; }; p++; }; *p=0; WL->shift(str-p,SEEK_CUR); tag->descr=str; }; }; tHtmlTag *tHtmlParser::get_tag(){ char p; tHtmlTag *rvalue=NULL; while(WL->read(&p,sizeof(p))>0){ if (p=='<'){ char *name=NULL; if ((name=get_word())){ if (name && equal(name,"!--")){ if (out_fd>=0){ f_wstr(out_fd,"