/* $Id: url.c,v 1.7 2003/04/05 18:07:24 ossi Exp $ *
 *
 * puf 0.9  Copyright (C) 2000-2003 by Oswald Buddenhagen
 * based on puf 0.1.x (C) 1999,2000 by Anders Gavare
 *
 * You may modify and distribute this code under the terms of the GPL.
 * There is NO WARRANTY of any kind. See COPYING for details.
 *
 * url.c - parse and manage urls
 *
 */

#include "puf.h"

int economize_dns;

url_t *urllist;
proxy_t *proxylist;


/*
 * Calculate the hash code for the given url and look for a duplicate
 * in urllist.
 *
 * path/len - local part of the url (need not be NUL-terminated)
 * hinfo    - resolved host the url lives on
 * port     - port number
 * hashp    - receives the hash code; only written when no duplicate exists
 *
 * Returns 1 if the url is already in the chain, otherwise 0.
 */
int
find_url(char *path, int len, hinfo_t *hinfo, u_short port, int *hashp)
{
    url_t *u;
    int hash;

    hash = calc_hash(path, len) ^ (int)hinfo ^ port;
    for (u = urllist; u; u = u->next) {
        /*
         * Compare the lengths before the contents: a stored path shorter
         * than len must not be handed to memcmp() with a count of len,
         * as that would read past the end of its allocation.
         */
        if (hash == u->url_hash &&
            u->host->info == hinfo &&
            u->port == port &&
            strlen(u->local_part) == (size_t)len &&
            !memcmp(u->local_part, path, len))
        {
            dbg(URL, ("http://%s/%.*s already in chain, not adding\n",
                      hinfo->name, len, path));
            return 1;
        }
    }
    *hashp = hash;
    return 0;
}


/*
 * Return 1 if url and referer are in the same directory, otherwise 0.
 *
 * path/len - local part of the candidate url
 * referer  - url the candidate was found in
 * is_req   - non-zero selects the follow_src option, zero follow_href
 *
 * Everything is accepted when the selected follow option is at least
 * HOST_RECURSIVE. Otherwise the candidate must lie below the referer's
 * disposition directory (if it has one) and must not exceed max_depth
 * directory levels (if max_depth is non-negative).
 */
int
same_dir(char *path, int len, url_t *referer, int is_req)
{
    int d, p = referer->disp_pathoff;
    char *lp = referer->local_part;

    if ((is_req ? referer->parm->opt->follow_src
                : referer->parm->opt->follow_href) >= HOST_RECURSIVE)
        return 1;

    /* check, if in same top-level disposition directory as referer */
    if (p != -1) {
        if (len < p)
            goto notsub;
        /*
         * Advance to the end of the disposition directory name. Stop at
         * the terminating NUL as well, so a referer path without a
         * trailing slash cannot make the scan run off the string.
         */
        while (lp[p] && lp[p] != '/')
            p++;
        if (len < p)
            goto notsub;
        if (memcmp(path, lp, p))
            goto notsub;
        if (len > p && path[p] != '/')
            goto notsub;
    }

    /* now check, if max directory nesting level reached */
    if (referer->parm->opt->max_depth >= 0) {
        /* p++ skips the slash found above, or turns -1 into 0 */
        for (p++, d = 0; p < len; p++) {
            if (path[p] == '/' && ++d > referer->parm->opt->max_depth) {
                dbg(URL, ("not added '/%.*s' (directories to deeply nested)\n",
                          len, path));
                return 0;
            }
        }
    }

    return 1;

  notsub:
    dbg(URL, ("not added '/%.*s' (different top-dir)\n", len, path));
    return 0;
}


#define PR_NOURL -2
#define PR_BAD -1
#define PR_UNK 0
#define PR_HTTP 1

/*
 * Parse the "remote" part of a url (protocol, optional "user:pass@",
 * host and optional port).
 *
 * url/len     - text to parse; need not be NUL-terminated
 * guess_proto - if non-zero, text without a usable protocol prefix is
 *               treated as a protocol-less http url
 * pp          - on PR_HTTP: offset of the first character after the
 *               host[:port] part; on PR_UNK: length of the protocol name
 * auth,
 * auth_len    - receive the "user:pass" span inside url, or NULL and 0
 * port        - receives the port number; 80 if none is given
 * hostbuf     - receives the lower-cased host name; must hold SHORTSTR
 *               bytes, longer names are truncated
 * hnlp        - receives the length of the name stored in hostbuf
 *
 * Returns PR_HTTP on success, PR_UNK for a non-http protocol, PR_BAD for
 * a malformed http url and PR_NOURL if the text is no url at all.
 */
static int
parse_url(char *url, int len, int guess_proto, int *pp,
          char **auth, int *auth_len, u_short *port,
          char *hostbuf, int *hnlp)
{
    int p, po, ho, noho, hnl, i;
    long portval;

    checken("parse_url (top)");

    /* get protocol */
    for (p = 0; ; p++) {
        if (p >= len) {
          brken:
            /* no protocol. impossible in good url. */
            if (guess_proto) {
              rhttp:
                p = 0;
                break;
            }
            return PR_NOURL;
        }
        if (url[p] == ':') {
            /*
             * "//" must follow the colon. Both url[p + 1] and url[p + 2]
             * are inspected, so at least p + 3 characters are required.
             */
            noho = len < p + 3 || url[p + 1] != '/' || url[p + 2] != '/';
            if (noho && guess_proto)
                goto rhttp;
            if (len < 7 || strncasecmp(url, "http:", 5)) {
                *pp = p; /* for message */
                return PR_UNK;
            }
            if (noho)
                return PR_BAD; /* no host. */
            p += 3;
            break;
        }
        /* cast: the <ctype.h> functions require unsigned char values */
        if (!isalpha((unsigned char)url[p]))
            goto brken;
    }

    /* get host & port */
    *auth = NULL;
    *auth_len = 0;
  reho:
    ho = p;
    po = 0;
    for (; p < len && url[p] != '/'; p++) {
        if (url[p] == ':') {
            po = p;
        } else if (url[p] == '@') {
            /* what was scanned so far are credentials; restart after '@' */
            *auth = url + ho;
            *auth_len = p - ho;
            p++;
            goto reho;
        }
    }
    if (p == ho)
        return PR_BAD; /* empty host. */

    if (po) {
        /*
         * Convert the port by hand: the buffer is length-delimited, so
         * atoi() could read beyond it, and it would silently truncate
         * values that do not fit into u_short.
         */
        portval = 0;
        for (i = po + 1; i < p && isdigit((unsigned char)url[i]); i++) {
            portval = portval * 10 + (url[i] - '0');
            if (portval > 65535)
                return PR_BAD; /* invalid port. */
        }
        if (!portval)
            return PR_BAD; /* invalid port. */
        *port = (u_short)portval;
    } else {
        *port = 80;
        po = p;
    }

    for (hnl = 0; ho < po && hnl < SHORTSTR - 1; ho++, hnl++)
        hostbuf[hnl] = tolower((unsigned char)url[ho]);
    hostbuf[hnl] = '\0';

    *hnlp = hnl;
    *pp = p;

    checken("parse_url (end)");
    return PR_HTTP;
}


/*
 * Parse a proxy url and prepend the resulting proxy structure to
 * proxylist.
 *
 * proxy - NUL-terminated proxy url; the protocol prefix may be omitted
 * ratio - stored in the ratio field of the new proxy
 *
 * Returns the new proxy, or 0 if the url is unusable, the host is cached
 * without address info, or an allocation or lookup fails.
 */
proxy_t *
parse_proxy(char *proxy, int ratio)
{
    char *auth, *pt;
    proxy_t *prox;
    host_t *host;
    int hnl, auth_len, p, lp, len, lplen;
    u_short port;
    char hostbuf[SHORTSTR];

    checken("parse_proxy (top)");

    dbg(URL, ("parse_proxy '%s', ratio %d\n", proxy, ratio));

    len = strlen(proxy);
    switch (parse_url(proxy, len, 1, &p, &auth, &auth_len,
                      &port, hostbuf, &hnl))
    {
    default:
        prx(ERR, "unsupported proxy protocol %.*s\n", p, proxy);
        /* fallthrough */
    case PR_BAD:
    case PR_NOURL:
        return 0;
    case PR_HTTP:
        break;
    }

    /* get path in local part */
    if (p < len)
        lp = p + 1;
    else
        lp = len;
    lplen = len - lp;

    if ((host = host_lookup_fast(hostbuf, hnl)) != NULL) {
        if (!host->info)
            return 0;
    }

    /* the cgi path and the encoded credentials are stored behind the struct */
    if (!(prox = mmalloc(sizeof(*prox) + lplen + 1 +
                         (auth ? len_enc_auth(auth_len) : 0))))
        return 0;

    if (!host) {
        if (!(host = host_lookup_full(hostbuf, hnl, 0, prox))) {
            free(prox);
            return 0;
        }
        waiting_proxies++;
        prox->ready = 0;
    } else {
        prox->ready = 1;
    }

    memcpy(prox->cgi_path, proxy + lp, lplen);
    pt = prox->cgi_path + lplen;
    *pt++ = '\0';
    dbg(URL, (" cgi_path '%s'\n", prox->cgi_path));

    if (auth) {
        encode_auth(pt, auth, auth_len);
        dbg(URL, (" has auth.\n"));
        prox->have_auth = 1;
    } else {
        prox->have_auth = 0;
    }

    prox->host = host;
    prox->port = port;
    prox->score = 0;
    prox->ratio = ratio;

    prox->next = proxylist;
    proxylist = prox;

    checken("parse_proxy (end)");
    return prox;
}


/*
 * Parse the complete url string and create a url_t for it.
 *
 * url/len    - url text; need not be NUL-terminated
 * referer    - url this one was found in, or 0. Must be non-null if parm
 *              is 0 or istopdir is 0.
 * parm       - parameter block to use; if 0, the referer's one is used
 * istopdir   - zero for urls found while recursing. A value above 1 also
 *              enables protocol guessing; 2 is treated like 1 when
 *              compared against the inhibit_cgiget option.
 * isreq      - non-zero selects the follow_src option, zero follow_href
 * relocs,
 * link_depth - stored in the new url
 *
 * Returns 1 if a url was created (and passed to add_url() if its host is
 * already resolved), 0 if the url was rejected or an error occurred and
 * -1 if the text is no url at all.
 */
int
parse_add_url(char *url, int len, url_t *referer, url_parm_t *parm,
              int istopdir, int isreq, int relocs, int link_depth)
{
    char hostbuf[SHORTSTR], *auth;
    /*
     * hash is only computed by find_url(), i.e. when the host is already
     * cached. Start from a defined value so url_hash is never assigned
     * from an indeterminate variable.
     * NOTE(review): presumably the hash is recalculated once the host
     * lookup has finished - confirm in the resolver code.
     */
    int hash = 0, hnl, auth_len, p, lp, dp, fp;
    u_short port;
    url_t *u;
    host_t *host;

    checken("parse_add_url (top)");

    if (!parm)
        parm = referer->parm;
    /*
     * XXX should be done later.
     * NOTE(review): the reference taken here is not dropped on the early
     * return paths below; left untouched because callers are not visible
     * from this file.
     */
    parm->ref_count++;
    checken_updgen(parm);

    switch (parse_url(url, len, istopdir > 1, &p, &auth, &auth_len,
                      &port, hostbuf, &hnl))
    {
    case PR_NOURL:
        dbg(URL, ("'%.*s' is no URL\n", len, url));
        return -1;
    case PR_UNK:
        prx(WRN, "unsupported protocol %.*s\n", p, url);
        /* fallthrough */
    case PR_BAD:
        return 0;
    }

    if (!(host = host_lookup_fast(hostbuf, hnl))) {
        if (!istopdir && economize_dns) {
            dbg(URL, ("not adding '%.*s' (non-cached hostname)\n", len, url));
            return 0;
        }
    } else {
        if (!host->info) {
            /* host is cached without address info: count as failed, silently */
            num_urls++;
            num_urls_fail++;
            return 0;
        }
        if (referer &&
            (host->info != referer->host->info || port != referer->port) &&
            (isreq ? referer->parm->opt->follow_src
                   : referer->parm->opt->follow_href) <= HOST_RECURSIVE)
        {
            dbg(URL, ("not adding '%.*s' (different host)\n", len, url));
            return 0;
        }
    }

    /*
     * get path in local part:
     *   lp - start of the local part (after the slash following the host)
     *   fp - start of the last path component
     *   dp - offset within the local part of the last component that is
     *        followed by a slash; -1 if there is none
     */
    dp = -1;
    if (p < len) {
        lp = p + 1;
      repath:
        fp = ++p;
        if (p + 2 <= len && url[p] == '.' && url[p + 1] == '.' &&
            (p + 2 == len || url[p + 2] == '/'))
        {
            prx(WRN, "'..' in URL? We hit an evil site ...\n");
            return 0;
        }
        for (; p < len; p++) {
            if (url[p] == '/') {
                dp = fp - lp;
                goto repath;
            }
            if (url[p] == '?') {
                if ((istopdir == 2 ? 1 : istopdir) <=
                    parm->opt->inhibit_cgiget)
                {
                    dbg(URL, ("not adding ?-URL '%.*s'\n", len, url));
                    return 0;
                }
                break;
            }
        }
    } else {
        lp = fp = len;
    }

    if (host && find_url(url + lp, len - lp, host->info, port, &hash))
        return 0;

    if (referer &&
        (port != referer->port ||
         !same_dir(url + lp, len - lp, referer, isreq)))
        return 0;

    /* the local part is stored directly behind the struct */
    if (!(u = mmalloc(sizeof(*u) + (len - lp + 1))))
        return 0;

    if (!host && !(host = host_lookup_full(hostbuf, hnl, u, 0))) {
        free(u);
        return 0;
    }

    u->local_part[len - lp] = '\0';
    memcpy(u->local_part, url + lp, len - lp);
#ifdef USE_MAGIC
    u->len = (int)(&u->local_part[len - lp + 1] - (char *)&(u->len));
#endif
    u->url_hash = hash;
    u->referer = referer;
    u->parm = parm;
    u->host = host;
    u->port = port;
    u->path_len = fp - lp;
    u->is_top_dir = istopdir != 0;
    u->is_requisite = isreq;
    u->relocs = relocs;
    u->link_depth = link_depth;
    u->disp_pathoff = istopdir ? dp : referer->disp_pathoff;

    if (auth) {
        /*
         * The credentials belong to this url only, so it needs a private
         * parameter block. If cloning fails, bail out instead of writing
         * the credentials into the block shared with other urls.
         */
        if (!clone_parm(u)) {
            free_url(u);
            return 0;
        }
        if (!(u->parm->http_auth = mmalloc(len_enc_auth(auth_len)))) {
            free_url(u);
            return 0;
        }
        encode_auth(u->parm->http_auth, auth, auth_len);
        checken_updgen(u->parm);
    }

    checken("parse_add_url (pre-end)");

    dbg(URL, ("'%.*s' => '%.*s' @ %s : %i / '%.*s' '%.*s' %i\n",
              len, url, auth_len, auth ? auth : "", host->name, port,
              fp - lp, url + lp, len - fp, url + fp, dp));

    /* urls whose host has no address info yet are not queued here */
    if (host->info)
        add_url(u);

    return 1;
}


/*
 * Append a url to the queue_urls_connect queue.
 * Returns 1 on success, 0 if the queue entry could not be allocated.
 */
int
queue_url(url_t *u)
{
    wurl_t *wu;

    if (!(wu = mmalloc(sizeof(*wu))))
        return 0;
    wu->url = u;
    cq_append(queue_urls_connect, wu);
    return 1;
}


/* Reset the attempt counter, prepend the url to urllist and count it. */
static void
do_add_url(url_t *u)
{
    u->attempt = 0;

    u->next = urllist;
    urllist = u;

    num_urls++;

    checken_updurl(u);

    dbg(URL, ("added http://%s/%s\n", u->host->name, u->local_part));
}


/*
 * Add a url to the url chain and enqueue it for processing.
 * If it cannot be queued, the url structure is freed.
 */
void
add_url(url_t *u)
{
    checken("add_url (top)");

    if (queue_url(u))
        do_add_url(u);
    else
        free(u);

    checken("add_url (end)");
}


/*
 * Free a url and drop its reference to the parameter block; the block is
 * freed as well once its reference count falls below 1.
 */
void
free_url(url_t *u)
{
    checken_gen(u->parm, "parm checksum failure in free_url()");
    if (--u->parm->ref_count < 1)
        free(u->parm);
    else
        checken_updgen(u->parm);
    free(u);
}


/*
 * Make sure the url owns its parameter block exclusively: if the block
 * has more than one reference, replace it with a private byte-wise copy.
 *
 * Returns 1 on success, 0 if the copy could not be allocated (the url
 * then still refers to the shared block).
 */
int
clone_parm(url_t *u)
{
    url_parm_t *parm;

    checken_gen(u->parm, "parm checksum failure in clone_parm()");
    if (u->parm->ref_count > 1) {
        if (!(parm = mmalloc(sizeof(*parm))))
            return 0;
        memcpy(parm, u->parm, sizeof(*parm));
        parm->ref_count = 1;
        u->parm->ref_count--;
        checken_updgen(u->parm);
        u->parm = parm;
        checken_updgen(parm);
    }
    return 1;
}