/* $Id: url.c,v 1.7 2003/04/05 18:07:24 ossi Exp $ *
*
* puf 0.9 Copyright (C) 2000-2003 by Oswald Buddenhagen <puf@ossi.cjb.net>
* based on puf 0.1.x (C) 1999,2000 by Anders Gavare <gavare@hotmail.com>
*
* You may modify and distribute this code under the terms of the GPL.
* There is NO WARRANTY of any kind. See COPYING for details.
*
* url.c - parse and manage urls
*
*/
#include "puf.h"
/* flag read by parse_add_url(): when non-zero, a non-top-level url whose
   host is not found by host_lookup_fast() is not added */
int economize_dns;
/* head of the singly linked chain of known urls; do_add_url() prepends
   to it and find_url() scans it for duplicates */
url_t *urllist;
/* head of the singly linked list of proxies; parse_proxy() prepends to it */
proxy_t *proxylist;
/*
 * Calculate the hash code for the given url and look for a duplicate.
 *
 * path/len - local part of the url (not necessarily NUL-terminated)
 * hinfo    - host info the url belongs to
 * port     - port of the url
 * hashp    - receives the hash code, but only if no duplicate was found
 *
 * Returns 1 if an url with the same host info, port and path is already
 * in urllist, otherwise 0.
 */
int
find_url(char *path, int len, hinfo_t *hinfo, u_short port, int *hashp)
{
    url_t *u;
    int hash;

    /* NOTE(review): the pointer-to-int cast truncates on LP64 targets;
       kept as is so the value stays consistent with any hash computed
       elsewhere - confirm before changing */
    hash = calc_hash(path, len) ^ (int)hinfo ^ port;
    for (u = urllist; u; u = u->next) {
        /* Compare lengths before the contents: memcmp() over len bytes
           must not run past the end of a shorter stored path. Assumes
           paths contain no embedded NUL bytes. */
        if (hash == u->url_hash &&
            u->host->info == hinfo &&
            u->port == port &&
            strlen(u->local_part) == (size_t)len &&
            !memcmp(u->local_part, path, len))
        {
            dbg(URL, ("http://%s/%.*s already in chain, not adding\n",
                      hinfo->name, len, path));
            return 1;
        }
    }
    *hashp = hash;
    return 0;
}
/*
 * Decide whether a link found in referer may be followed with respect to
 * its directory.
 *
 * path/len - local part of the candidate url
 * referer  - url the link was found in
 * is_req   - non-zero selects the follow_src option, zero follow_href
 *
 * Returns 1 if the selected follow option is >= HOST_RECURSIVE, or if the
 * path lies in the same top-level disposition directory as the referer and
 * does not exceed the max_depth nesting limit; otherwise 0.
 */
int
same_dir(char *path, int len, url_t *referer, int is_req)
{
    int d, p = referer->disp_pathoff;
    char *lp = referer->local_part;

    if ((is_req ?
         referer->parm->opt->follow_src : referer->parm->opt->follow_href)
        >= HOST_RECURSIVE)
        return 1;

    /* check, if in same top-level disposition directory as referer */
    if (p != -1) {
        if (len < p)
            goto notsub;
        /* advance to the end of the directory component; also stop at
           the terminator so a referer path without a further '/' cannot
           make the scan run off the end of the string */
        while (lp[p] != '\0' && lp[p] != '/')
            p++;
        if (len < p)
            goto notsub;
        if (memcmp(path, lp, p))
            goto notsub;
        /* the common prefix must end on a component boundary */
        if (len > p && path[p] != '/')
            goto notsub;
    }

    /* now check, if max directory nesting level reached */
    if (referer->parm->opt->max_depth >= 0)
        /* p is -1 without a disposition directory, so counting starts
           at offset 0 in that case */
        for (p++, d = 0; p < len; p++)
            if (path[p] == '/')
                if (++d > referer->parm->opt->max_depth) {
                    dbg(URL,
                        ("not added '/%.*s' (directories to deeply nested)\n",
                         len, path));
                    return 0;
                }
    return 1;

  notsub:
    dbg(URL, ("not added '/%.*s' (different top-dir)\n", len, path));
    return 0;
}
/* result codes of parse_url() */
/* no protocol prefix found and protocol guessing is disabled */
#define PR_NOURL -2
/* http url without host part, with empty host or with invalid port */
#define PR_BAD -1
/* protocol prefix present, but not http */
#define PR_UNK 0
/* url parsed successfully as http */
#define PR_HTTP 1
/*
 * Parse the "remote" part of a url and return the protocol.
 *
 * url/len     - string to parse (need not be NUL-terminated at len)
 * guess_proto - if non-zero, a string without "proto://" prefix is
 *               treated as an http url starting with the host
 * pp          - on PR_HTTP: offset of the first char after host[:port];
 *               on PR_UNK: offset of the ':' ending the protocol name
 * auth/auth_len - receive the "user:pass" part preceding '@' (pointing
 *               into url), or 0/0 if there is none
 * port        - receives the port; 80 if none is given
 * hostbuf     - receives the lower-cased, NUL-terminated host name,
 *               truncated to SHORTSTR - 1 characters
 * hnlp        - receives the length of the host name in hostbuf
 *
 * Returns PR_HTTP, PR_UNK, PR_BAD or PR_NOURL. The output parameters
 * other than pp are only meaningful for PR_HTTP.
 */
static int
parse_url(char *url, int len, int guess_proto, int *pp,
          char **auth, int *auth_len,
          u_short *port, char *hostbuf, int *hnlp)
{
    int p, po, ho, noho, hnl, i;
    long portnum;

    checken("parse_url (top)");

    /* get protocol */
    for (p = 0; ; p++) {
        if (p >= len) {
          brken: /* no protocol. impossible in good url. */
            if (guess_proto) {
              rhttp:
                /* assume http and restart parsing at the host */
                p = 0;
                break;
            }
            return PR_NOURL;
        }
        if (url[p] == ':') {
            /* "//" must fit completely inside the string; url[p + 2] is
               only examined if it lies below len */
            noho = len < p + 3 || url[p + 1] != '/' || url[p + 2] != '/';
            if (noho && guess_proto)
                goto rhttp;
            if (len < 7 || strncasecmp(url, "http:", 5)) {
                *pp = p; /* for message */
                return PR_UNK;
            }
            if (noho)
                return PR_BAD; /* no host. */
            p += 3;
            break;
        }
        /* <ctype.h> functions need a value in unsigned char range */
        if (!isalpha((unsigned char)url[p]))
            goto brken;
    }

    /* get host & port */
    *auth = 0;
    *auth_len = 0;
  reho:
    ho = p;
    po = 0;
    for (; p < len && url[p] != '/'; p++) {
        if (url[p] == ':')
            po = p;
        else if (url[p] == '@') {
            /* what was scanned so far is authentication info; the host
               starts behind the '@' */
            *auth = url + ho;
            *auth_len = p - ho;
            p++;
            goto reho;
        }
    }
    if (p == ho)
        return PR_BAD; /* empty host. */
    if (po) {
        /* Parse the leading decimal digits between ':' and the end of
           the host part. Unlike atoi() this never reads beyond p and
           rejects values that do not fit into a port number instead of
           silently truncating them. */
        portnum = 0;
        for (i = po + 1; i < p && url[i] >= '0' && url[i] <= '9'; i++) {
            portnum = portnum * 10 + (url[i] - '0');
            if (portnum > 65535)
                return PR_BAD; /* invalid port. */
        }
        if (!portnum)
            return PR_BAD; /* invalid port. */
        *port = (u_short)portnum;
    } else {
        *port = 80;
        po = p;
    }
    for (hnl = 0; ho < po && hnl < SHORTSTR - 1; ho++, hnl++)
        hostbuf[hnl] = tolower((unsigned char)url[ho]);
    hostbuf[hnl] = '\0';
    *hnlp = hnl;

    *pp = p;
    checken("parse_url (end)");
    return PR_HTTP;
}
/*
 * Parse a proxy url and prepend a new proxy structure to proxylist.
 *
 * proxy - NUL-terminated proxy url; a missing protocol is guessed as http
 * ratio - stored in the ratio field of the new proxy
 *
 * Returns the new proxy structure, or 0 if the url is unusable, the host
 * is known to have no info, or an allocation or host lookup fails.
 */
proxy_t *
parse_proxy(char *proxy, int ratio)
{
    char hostname[SHORTSTR];
    char *cred, *tail;
    proxy_t *px;
    host_t *hp;
    int hostlen, credlen, pos, pathoff, pathlen, total, proto;
    u_short portno;

    checken("parse_proxy (top)");

    dbg(URL, ("parse_proxy '%s', ratio %d\n", proxy, ratio));

    total = strlen(proxy);
    proto = parse_url(proxy, total, 1, &pos, &cred, &credlen, &portno,
                      hostname, &hostlen);
    if (proto == PR_BAD || proto == PR_NOURL)
        return 0;
    if (proto != PR_HTTP) {
        /* anything else is a protocol we cannot speak to a proxy */
        prx(ERR, "unsupported proxy protocol %.*s\n", pos, proxy);
        return 0;
    }

    /* the path starts behind the '/' that follows host[:port], if any */
    pathoff = pos < total ? pos + 1 : total;
    pathlen = total - pathoff;

    hp = host_lookup_fast(hostname, hostlen);
    if (hp != NULL && !hp->info)
        return 0;

    /* cgi path and (optional) encoded auth are stored behind the struct */
    px = mmalloc(sizeof(*px) + pathlen + 1 +
                 (cred ? len_enc_auth(credlen) : 0));
    if (!px)
        return 0;

    if (hp) {
        px->ready = 1;
    } else {
        hp = host_lookup_full(hostname, hostlen, 0, px);
        if (!hp) {
            free(px);
            return 0;
        }
        waiting_proxies++;
        px->ready = 0;
    }

    memcpy(px->cgi_path, proxy + pathoff, pathlen);
    px->cgi_path[pathlen] = '\0';
    tail = px->cgi_path + pathlen + 1;
    dbg(URL, (" cgi_path '%s'\n", px->cgi_path));

    if (!cred) {
        px->have_auth = 0;
    } else {
        /* encoded auth goes right behind the terminated cgi path */
        encode_auth(tail, cred, credlen);
        dbg(URL, (" has auth.\n"));
        px->have_auth = 1;
    }

    px->host = hp;
    px->port = portno;
    px->score = 0;
    px->ratio = ratio;

    /* link in at the head of the proxy list */
    px->next = proxylist;
    proxylist = px;

    checken("parse_proxy (end)");
    return px;
}
/*
 * Parse the complete url string and, if it is acceptable, create a url
 * structure for it.
 *
 * url/len    - the url string
 * referer    - url the link was found in, or 0
 * parm       - parameters for the new url; if 0, referer->parm is used
 *              (so parm and referer must not both be 0)
 * istopdir   - non-zero for top-level urls; values > 1 additionally
 *              enable protocol guessing
 * isreq      - non-zero if the url is a requisite (follow_src applies
 *              instead of follow_href)
 * relocs     - stored in the relocs field of the new url
 * link_depth - stored in the link_depth field of the new url
 *
 * Returns 1 if a url structure was created, 0 if the url was rejected or
 * an error occurred, and -1 if the string is no url at all.
 *
 * The structure is handed to add_url() only if the host info is already
 * available; otherwise it has been passed to host_lookup_full().
 */
int
parse_add_url(char *url, int len, url_t *referer, url_parm_t *parm,
              int istopdir, int isreq, int relocs, int link_depth)
{
    char hostbuf[SHORTSTR], *auth;
    /* hash stays 0 if the host is not known yet: find_url() is skipped
       in that case and the value is stored in the url below */
    int hash = 0, hnl, auth_len, p, lp, dp, fp;
    u_short port;
    url_t *u;
    host_t *host;

    checken("parse_add_url (top)");

    if (!parm)
        parm = referer->parm;
    /* NOTE(review): the reference is taken before the url is known to be
       acceptable and is not dropped on the early returns below */
    parm->ref_count++; /* XXX should be done later */
    checken_updgen(parm);

    switch (parse_url(url, len, istopdir > 1, &p, &auth, &auth_len, &port,
                      hostbuf, &hnl)) {
    case PR_NOURL:
        dbg(URL, ("'%.*s' is no URL\n", len, url));
        return -1;
    case PR_UNK:
        prx(WRN, "unsupported protocol %.*s\n", p, url);
        /* fall through */
    case PR_BAD:
        return 0;
    default:
        break;
    }

    if (!(host = host_lookup_fast(hostbuf, hnl))) {
        if (!istopdir && economize_dns) {
            dbg(URL, ("not adding '%.*s' (non-cached hostname)\n", len, url));
            return 0;
        }
    } else {
        if (!host->info) {
            /* host is already known to have no info: count as failed */
            num_urls++;
            num_urls_fail++;
            return 0;
        }
        if (referer &&
            (host->info != referer->host->info || port != referer->port) &&
            (isreq ?
             referer->parm->opt->follow_src : referer->parm->opt->follow_href)
            <= HOST_RECURSIVE)
        {
            dbg(URL, ("not adding '%.*s' (different host)\n", len, url));
            return 0;
        }
    }

    /* get path in local part */
    /* lp: start of the local part, fp: start of its last component,
       dp: offset (relative to lp) of the last directory component,
       -1 if the path contains no directory */
    dp = -1;
    if (p < len) {
        lp = p + 1;
      repath:
        fp = ++p;
        /* refuse a ".." path component */
        if (p + 2 <= len && url[p] == '.' && url[p + 1] == '.' &&
            (p + 2 == len || url[p + 2] == '/')) {
            prx(WRN, "'..' in URL? We hit an evil site ...\n");
            return 0;
        }
        for (; p < len; p++) {
            if (url[p] == '/') {
                dp = fp - lp;
                goto repath;
            }
            if (url[p] == '?') {
                if ((istopdir == 2 ? 1 : istopdir)
                    <= parm->opt->inhibit_cgiget)
                {
                    dbg(URL, ("not adding ?-URL '%.*s'\n", len, url));
                    return 0;
                }
                break;
            }
        }
    } else
        lp = fp = len;

    if (host && find_url(url + lp, len - lp, host->info, port, &hash))
        return 0;

    /* NOTE(review): a differing port is rejected here regardless of the
       follow options - confirm that this is intended */
    if (referer &&
        (port != referer->port ||
         !same_dir(url + lp, len - lp, referer, isreq)))
        return 0;

    /* the local part is stored behind the structure */
    if (!(u = mmalloc(sizeof(*u) + (len - lp + 1))))
        return 0;

    if (!host && !(host = host_lookup_full(hostbuf, hnl, u, 0))) {
        free(u);
        return 0;
    }

    u->local_part[len - lp] = '\0';
    memcpy(u->local_part, url + lp, len - lp);
#ifdef USE_MAGIC
    u->len = (int)(&u->local_part[len - lp + 1] - (char *)&(u->len));
#endif
    u->url_hash = hash;
    u->referer = referer;
    u->parm = parm;
    u->host = host;
    u->port = port;
    u->path_len = fp - lp;
    u->is_top_dir = istopdir != 0;
    u->is_requisite = isreq;
    u->relocs = relocs;
    u->link_depth = link_depth;
    u->disp_pathoff = istopdir ? dp : referer->disp_pathoff;

    if (auth) {
        /* the auth info must go into a private copy of the parameters;
           without one it would be written into the shared structure */
        if (!clone_parm(u)) {
            free_url(u);
            return 0;
        }
        if (!(u->parm->http_auth = mmalloc(len_enc_auth(auth_len)))) {
            free_url(u);
            return 0;
        }
        encode_auth(u->parm->http_auth, auth, auth_len);
        checken_updgen(u->parm);
    }

    checken("parse_add_url (pre-end)");

    dbg(URL, ("'%.*s' => '%.*s' @ %s : %i / '%.*s' '%.*s' %i\n", len, url,
              auth_len, auth ? auth : "", host->name, port,
              fp - lp, url + lp, len - fp, url + fp, dp));

    if (host->info)
        add_url(u);

    return 1;
}
/*
 * Wrap the url in a newly allocated wurl_t and append it to
 * queue_urls_connect.
 * Returns 1 on success, 0 if the allocation fails.
 */
int
queue_url(url_t *u)
{
    wurl_t *entry;

    entry = mmalloc(sizeof(*entry));
    if (!entry)
        return 0;

    entry->url = u;
    cq_append(queue_urls_connect, entry);
    return 1;
}
/* reset the attempt counter of the url, put it at the head of urllist
   and count it in num_urls */
static void
do_add_url(url_t *u)
{
    num_urls += 1;
    u->attempt = 0;

    /* push onto the front of the chain */
    u->next = urllist;
    urllist = u;

    checken_updurl(u);

    dbg(URL, ("added http://%s/%s\n", u->host->name, u->local_part));
}
/* enqueue a url for processing and add it to the url chain;
   if it cannot be enqueued, the url structure is freed instead */
void
add_url(url_t *u)
{
    checken("add_url (top)");

    if (!queue_url(u)) {
        free(u);
    } else {
        do_add_url(u);
    }

    checken("add_url (end)");
}
/* drop the url's reference to its parameter structure, freeing the
   structure when the reference count falls below 1, then free the url */
void
free_url(url_t *u)
{
    checken_gen(u->parm, "parm checksum failure in free_url()");

    u->parm->ref_count -= 1;
    if (u->parm->ref_count >= 1) {
        /* still referenced elsewhere */
        checken_updgen(u->parm);
    } else {
        free(u->parm);
    }

    free(u);
}
/*
 * Give the url a parameter structure of its own: if the current one has
 * a reference count above 1, it is replaced by a copy with reference
 * count 1 and the count of the original is decremented.
 * Returns 1 on success (including when no copy was needed), 0 if the
 * allocation fails; u is left untouched in that case.
 */
int
clone_parm(url_t *u)
{
    url_parm_t *copy;

    checken_gen(u->parm, "parm checksum failure in clone_parm()");

    /* not shared: nothing to do */
    if (u->parm->ref_count <= 1)
        return 1;

    copy = mmalloc(sizeof(*copy));
    if (!copy)
        return 0;

    memcpy(copy, u->parm, sizeof(*copy));
    copy->ref_count = 1;

    /* release our reference to the shared original */
    u->parm->ref_count--;
    checken_updgen(u->parm);

    u->parm = copy;
    checken_updgen(copy);

    return 1;
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */