/* $Id: url.c,v 1.7 2003/04/05 18:07:24 ossi Exp $ *
 *
 * puf 0.9  Copyright (C) 2000-2003 by Oswald Buddenhagen <puf@ossi.cjb.net>
 * based on puf 0.1.x (C) 1999,2000 by Anders Gavare <gavare@hotmail.com>
 *
 * You may modify and distribute this code under the terms of the GPL.
 * There is NO WARRANTY of any kind. See COPYING for details.
 *
 * url.c - parse and manage urls
 *
 */

#include "puf.h"


/*  when non-zero, parse_add_url() refuses non-top-level urls whose host 
    is not already in the host cache  */
int economize_dns;
/*  head of the singly linked chain (via ->next) of all known urls; 
    new entries are pushed at the front  */
url_t *urllist;
/*  head of the singly linked chain (via ->next) of all configured proxies; 
    new entries are pushed at the front  */
proxy_t *proxylist;

/*
 *  calculate hash code for given url. eliminate duplicates
 *
 *  path/len  local part of the url (need not be NUL-terminated)
 *  hinfo     resolved host info the url belongs to
 *  port      tcp port of the url
 *  hashp     receives the hash code, but only if no duplicate was found
 *
 *  returns 1 if an identical url is already in urllist, otherwise 0
 */
int 
find_url(char *path, int len, hinfo_t *hinfo, u_short port, int *hashp)
{
    url_t *u;
    int hash;

    /*  only the low bits of the pointer are mixed in; going through 
	size_t makes the narrowing explicit  */
    hash = calc_hash(path, len) ^ (int)(size_t)hinfo ^ port;
    for (u = urllist; u; u = u->next) {
	/*  compare lengths first: memcmp() over len bytes must never run 
	    beyond the end of a shorter stored path  */
	if (hash == u->url_hash && 
	    u->host->info == hinfo && 
	    u->port == port &&
	    strlen(u->local_part) == (size_t)len &&
	    !memcmp(u->local_part, path, len))
	{
	    dbg(URL, ("http://%s/%.*s already in chain, not adding\n",
		      hinfo->name, len, path));
	    return 1;
	}
    }
    *hashp = hash;
    return 0;
}


/*
 *  Return 1 if url and referer are in the same directory, otherwise 0
 *
 *  path/len  local part of the candidate url (need not be NUL-terminated)
 *  referer   url the candidate was found in
 *  is_req    non-zero: judge by the follow_src option, 
 *            zero: judge by the follow_href option
 *
 *  If the selected option is at least HOST_RECURSIVE, everything is 
 *  accepted. Otherwise the candidate must lie below the referer's 
 *  top-level disposition directory (if it has one) and must not exceed 
 *  max_depth directory levels (if max_depth is non-negative).
 */
int 
same_dir(char *path, int len, url_t *referer, int is_req)
{
    int d, p = referer->disp_pathoff;
    char *lp = referer->local_part;

    if ((is_req ? 
	 referer->parm->opt->follow_src : referer->parm->opt->follow_href)
	    >= HOST_RECURSIVE)
	return 1;

    /*  check, if in same top-level disposition directory as referer  */
    if (p != -1) {
	if (len < p)
	    goto notsub;
	/*  find the end of the directory name. the referer's path may end 
	    right after that name (no trailing '/'), so stop at the 
	    terminator as well instead of running off the string.
	    NOTE(review): assumes disp_pathoff itself does not point 
	    beyond the end of local_part - confirm  */
	while (lp[p] != '\0' && lp[p] != '/')
	    p++;
	if (len < p)
	    goto notsub;
	if (memcmp(path, lp, p))
	    goto notsub;
	/*  "dir" must not match "directory"  */
	if (len > p && path[p] != '/')
	    goto notsub;
    }

    /*  now check, if max directory nesting level reached  */
    /*  (p is -1 without disposition directory, so counting starts at 0)  */
    if (referer->parm->opt->max_depth >= 0)
	for (p++, d = 0; p < len; p++)
	    if (path[p] == '/')
		if (++d > referer->parm->opt->max_depth) {
		    dbg(URL, 
			("not added '/%.*s' (directories to deeply nested)\n",
			 len, path));
		    return 0;
		}

    return 1;

  notsub:
    dbg(URL, ("not added '/%.*s' (different top-dir)\n", len, path));
    return 0;
}


#define PR_NOURL -2
#define PR_BAD -1
#define PR_UNK 0
#define PR_HTTP 1

/*
 *  parse the "remote" part of a url. return protocol
 *
 *  url/len      string to parse; only the first len bytes are examined, 
 *               it need not be NUL-terminated
 *  guess_proto  non-zero: a string without "scheme://" is taken as 
 *               "host[:port][/path]" of an http url
 *  pp           receives the offset of the first '/' after the host part 
 *               (or len); for PR_UNK the offset of the ':' ending the scheme
 *  auth, auth_len  receive the "user:pass" part in front of '@' 
 *               (pointing into url), or 0/0 if there is none
 *  port         receives the port, 80 if none is given
 *  hostbuf      receives the lower-cased host name, cut off after 
 *               SHORTSTR - 1 characters; must hold SHORTSTR bytes
 *  hnlp         receives the length of the name stored in hostbuf
 *
 *  returns PR_HTTP on success, otherwise PR_NOURL, PR_UNK or PR_BAD; 
 *  the output parameters other than pp are not meaningful then
 */
static int 
parse_url(char *url, int len, int guess_proto, int *pp, 
	      char **auth, int *auth_len, 
	      u_short *port, char *hostbuf, int *hnlp)
{
    int p, po, ho, noho, hnl, i, pn;

    checken("parse_url (top)");
    /*  get protocol  */
    for (p = 0; ; p++) {
	if (p >= len) {
	  brken:		/*  no protocol. impossible in good url.  */
	    if (guess_proto) {
	      rhttp:
		p = 0;
		break;
	    }
	    return PR_NOURL;
	}
	if (url[p] == ':') {
	    /*  "://" needs three bytes starting at p; never look at 
		url[len]  */
	    noho = len < p + 3 || url[p + 1] != '/' || url[p + 2] != '/';
	    if (noho && guess_proto)
		goto rhttp;	/*  probably "host:port"  */
	    if (len < 7 || strncasecmp(url, "http:", 5)) {
		*pp = p;	/*  for message  */
		return PR_UNK;
	    }
	    if (noho)
		return PR_BAD;	/*  no host.  */
	    p += 3;
	    break;
	}
	/*  ctype functions are undefined for negative char values  */
	if (!isalpha((unsigned char)url[p]))
	    goto brken;
    }

    /*  get host & port  */
    *auth = 0;
    *auth_len = 0;
  reho:
    ho = p;
    po = -1;			/*  offset of last ':', -1 if none  */
    for (; p < len && url[p] != '/'; p++) {
	if (url[p] == ':')
	    po = p;
	else if (url[p] == '@') {
	    /*  everything so far was authentication info; 
		the host starts after the '@'  */
	    *auth = url + ho;
	    *auth_len = p - ho;
	    p++;
	    goto reho;
	}
    }
    if (p == ho || po == ho)
	return PR_BAD;		/*  empty host.  */

    if (po >= 0) {
	/*  read the leading digits of the port by hand: this stays within 
	    the host part (atoi() would read on past len in an 
	    unterminated buffer) and catches values not fitting u_short. 
	    as with atoi(), junk after the digits is ignored.  */
	pn = 0;
	for (i = po + 1; i < p && url[i] >= '0' && url[i] <= '9'; i++) {
	    pn = pn * 10 + (url[i] - '0');
	    if (pn > 65535)
		return PR_BAD;		/*  port out of range.  */
	}
	if (!pn)
	    return PR_BAD;		/*  invalid port.  */
	*port = (u_short)pn;
    } else {
	*port = 80;
	po = p;
    }

    /*  host name is url[ho..po)  */
    for (hnl = 0; ho < po && hnl < SHORTSTR - 1; ho++, hnl++)
	hostbuf[hnl] = tolower((unsigned char)url[ho]);
    hostbuf[hnl] = '\0';

    *hnlp = hnl;
    *pp = p;
    checken("parse_url (end)");
    return PR_HTTP;
}


/*
 *  parse proxy url, return proxy structure
 *
 *  proxy  NUL-terminated proxy specification; the scheme may be omitted
 *  ratio  stored unchanged in the new proxy entry
 *
 *  On success the new entry has been put at the front of proxylist and 
 *  is returned. 0 is returned if the specification cannot be used, the 
 *  host is cached without address info, or memory runs out.
 */
proxy_t *
parse_proxy(char *proxy, int ratio)
{
    char hostname[SHORTSTR];
    char *cred, *tail;
    proxy_t *px;
    host_t *h;
    int hostlen, credlen, end, pathoff, total, pathlen, proto;
    u_short portno;

    checken("parse_proxy (top)");
    dbg(URL, ("parse_proxy '%s', ratio %d\n", proxy, ratio));
    total = strlen(proxy);
    proto = parse_url(proxy, total, 1, &end, &cred, &credlen, &portno, 
		      hostname, &hostlen);
    if (proto == PR_BAD || proto == PR_NOURL)
	return 0;
    if (proto != PR_HTTP) {
	prx(ERR, "unsupported proxy protocol %.*s\n", end, proxy);
	return 0;
    }

    /*  whatever follows the '/' after the host is the cgi path  */
    pathoff = end < total ? end + 1 : total;
    pathlen = total - pathoff;

    /*  a cached host without info is of no use  */
    h = host_lookup_fast(hostname, hostlen);
    if (h && !h->info)
	return 0;

    /*  cgi path and encoded authentication are stored behind the struct  */
    px = mmalloc(sizeof(*px) + pathlen + 1 + 
		 (cred ? len_enc_auth(credlen) : 0));
    if (!px)
	return 0;

    if (h)
	px->ready = 1;
    else {
	/*  host not cached: start a full lookup on behalf of the proxy  */
	h = host_lookup_full(hostname, hostlen, 0, px);
	if (!h) {
	    free(px);
	    return 0;
	}
	waiting_proxies++;
	px->ready = 0;
    }

    memcpy(px->cgi_path, proxy + pathoff, pathlen);
    px->cgi_path[pathlen] = '\0';
    tail = px->cgi_path + pathlen + 1;
    dbg(URL, (" cgi_path '%s'\n", px->cgi_path));

    if (!cred)
	px->have_auth = 0;
    else {
	encode_auth(tail, cred, credlen);
	dbg(URL, (" has auth.\n"));
	px->have_auth = 1;
    }

    px->host = h;
    px->port = portno;
    px->score = 0;
    px->ratio = ratio;

    /*  push onto the proxy chain  */
    px->next = proxylist;
    proxylist = px;

    checken("parse_proxy (end)");
    return px;
}


/*
 *  parse the complete url string and, if it is acceptable, create a 
 *  url_t for it
 *
 *  url/len     the url; need not be NUL-terminated
 *  referer     url this one was found in; tested against 0 in places, 
 *              but dereferenced if both parm and istopdir are 0
 *  parm        parameters for the new url; 0 means "same as referer"
 *  istopdir    non-zero for top-level urls; a value above 1 additionally 
 *              allows urls without "http://"
 *  isreq       non-zero if the url is a requisite (the follow_src option 
 *              applies instead of follow_href)
 *  relocs, link_depth  stored unchanged in the new url
 *
 *  returns -1 if the string is no url at all, 
 *           0 if the url was rejected or could not be set up, 
 *           1 if a url_t was created for it
 *
 *  The new url_t takes one reference on parm. If no url_t is created, 
 *  parm's reference count is left as it was on entry.
 */
int 
parse_add_url(char *url, int len, url_t *referer, url_parm_t *parm,
	      int istopdir, int isreq, int relocs, int link_depth)
{
    char hostbuf[SHORTSTR], *auth;
    int hash = 0, hnl, auth_len, p, lp, dp, fp;
    int rc = 0;
    u_short port;
    url_t *u;
    host_t *host;

    checken("parse_add_url (top)");

    if (!parm)
	parm = referer->parm;
    /*  reference for the url_t to be created; 
	taken back at "unref" if no url_t comes to hold it  */
    parm->ref_count++;
    checken_updgen(parm);

    switch (parse_url(url, len, istopdir > 1, &p, &auth, &auth_len, &port,
		      hostbuf, &hnl)) {
	case PR_NOURL:
	    dbg(URL, ("'%.*s' is no URL\n", len, url));
	    rc = -1;
	    goto unref;
	case PR_UNK:
	    prx(WRN, "unsupported protocol %.*s\n", p, url);
	    /*  fallthrough  */
	case PR_BAD:
	    goto unref;
	default:
	    break;
    }

    if (!(host = host_lookup_fast(hostbuf, hnl))) {
	if (!istopdir && economize_dns) {
	    dbg(URL, ("not adding '%.*s' (non-cached hostname)\n", len, url));
	    goto unref;
	}
    } else {
	if (!host->info) {
	    /*  prx(ERR, "non-existent host in '%.*s'\n", len, url);  */
	    num_urls++;
	    num_urls_fail++;
	    goto unref;
	}
	if (referer &&
	    (host->info != referer->host->info || port != referer->port) &&
	    (isreq ? 
	     referer->parm->opt->follow_src : referer->parm->opt->follow_href)
		<= HOST_RECURSIVE)
	{
	    dbg(URL, ("not adding '%.*s' (different host)\n", len, url));
	    goto unref;
	}
    }

    /*  get path in local part  */
    /*  lp: start of local part, fp: start of last path component, 
	dp: offset (relative to lp) of the last directory component, 
	-1 if there is none  */
    dp = -1;
    if (p < len) {
	lp = p + 1;
      repath:
	fp = ++p;
	if (p + 2 <= len && url[p] == '.' && url[p + 1] == '.' &&
	    (p + 2 == len || url[p + 2] == '/')) {
	    prx(WRN, "'..' in URL? We hit an evil site ...\n");
	    goto unref;
	}
	for (; p < len; p++) {
	    if (url[p] == '/') {
		dp = fp - lp;
		goto repath;
	    }
	    if (url[p] == '?') {
		if ((istopdir == 2 ? 1 : istopdir) 
			<= parm->opt->inhibit_cgiget)
		{
		    dbg(URL, ("not adding ?-URL '%.*s'\n", len, url));
		    goto unref;
		}
		break;
	    }
	}
    } else
	lp = fp = len;

    /*  duplicates can only be detected if the host is already known. 
	otherwise hash keeps its initial 0 instead of garbage.
	NOTE(review): presumably the hash is computed once the host 
	lookup has finished - confirm  */
    if (host && find_url(url + lp, len - lp, host->info, port, &hash))
	goto unref;

    if (referer && 
	(port != referer->port || 
	 !same_dir(url + lp, len - lp, referer, isreq)))
	goto unref;

    if (!(u = mmalloc(sizeof(*u) + (len - lp + 1))))
	goto unref;

    if (!host && !(host = host_lookup_full(hostbuf, hnl, u, 0))) {
	/*  u does not refer to parm yet, so a plain free() is right  */
	free(u);
	goto unref;
    }

    u->local_part[len - lp] = '\0';
    memcpy(u->local_part, url + lp, len - lp);

#ifdef USE_MAGIC
    u->len = (int)(&u->local_part[len - lp + 1] - (char *)&(u->len));
#endif

    u->url_hash = hash;
    u->referer = referer;
    u->parm = parm;		/*  now owns the reference taken at the top  */
    u->host = host;
    u->port = port;
    u->path_len = fp - lp;
    u->is_top_dir = istopdir != 0;
    u->is_requisite = isreq;
    u->relocs = relocs;
    u->link_depth = link_depth;
    u->disp_pathoff = istopdir ? dp : referer->disp_pathoff;

    if (auth) {
	/*  the authentication info must go into a private copy of the 
	    parameters. if that copy cannot be made, give up - writing 
	    to the shared parameters would affect other urls as well.
	    NOTE(review): if host_lookup_full() above kept a pointer 
	    to u, it is left dangling by these error paths - confirm  */
	if (!clone_parm(u)) {
	    free_url(u);
	    return 0;
	}
	if (!(u->parm->http_auth = mmalloc(len_enc_auth(auth_len)))) {
	    free_url(u);
	    return 0;
	}
	encode_auth(u->parm->http_auth, auth, auth_len);
	checken_updgen(u->parm);
    }

    checken("parse_add_url (pre-end)");

    dbg(URL, ("'%.*s' => '%.*s' @ %s : %i / '%.*s' '%.*s' %i\n", len, url,
	 auth_len, auth ? auth : "", host->name, port,
	 fp - lp, url + lp, len - fp, url + fp, dp));

    /*  without host info the url is not added to the chain here  */
    if (host->info)
	add_url(u);

    return 1;

  unref:
    /*  no url_t holds parm: take back the reference grabbed at the top  */
    parm->ref_count--;
    checken_updgen(parm);
    return rc;
}

/*  wrap a url into a queue entry and append it to the connect queue. 
    returns 1 on success, 0 if the entry could not be allocated  */
int 
queue_url(url_t *u)
{
    wurl_t *entry = mmalloc(sizeof(*entry));

    if (!entry)
	return 0;

    entry->url = u;
    cq_append(queue_urls_connect, entry);
    return 1;
}

/*  put a url at the front of the url chain and count it  */
static void 
do_add_url(url_t *u)
{
    /*  push onto the chain  */
    u->next = urllist;
    urllist = u;

    u->attempt = 0;
    num_urls++;

    checken_updurl(u);

    dbg(URL, ("added http://%s/%s\n", u->host->name, u->local_part));
}

/*  add a url to the url chain and enqueue for processing. 
    if it cannot be enqueued, the url is disposed of  */
void 
add_url(url_t *u)
{
    checken("add_url (top)");
    if (queue_url(u))
	do_add_url(u);
    else
	/*  the url holds a reference to its parameters; free_url() 
	    releases that too, a plain free() would leak it  */
	free_url(u);
    checken("add_url (end)");
}

/*  release a url together with its reference to the parameters; 
    the parameters are freed when no reference is left  */
void 
free_url(url_t *u)
{
    url_parm_t *shared = u->parm;

    checken_gen(shared, "parm checksum failure in free_url()");
    shared->ref_count--;
    if (shared->ref_count >= 1)
	checken_updgen(shared);
    else
	free(shared);
    free(u);
}

/*  make sure the url has parameters of its own: if they are shared 
    (reference count above 1), replace them by a private copy. 
    returns 1 on success, 0 if the copy could not be allocated  */
int 
clone_parm(url_t *u)
{
    url_parm_t *copy;

    checken_gen(u->parm, "parm checksum failure in clone_parm()");

    /*  sole owner already, nothing to do  */
    if (u->parm->ref_count <= 1)
	return 1;

    copy = mmalloc(sizeof(*copy));
    if (!copy)
	return 0;

    memcpy(copy, u->parm, sizeof(*copy));
    copy->ref_count = 1;

    /*  drop our reference to the shared original  */
    u->parm->ref_count--;
    checken_updgen(u->parm);

    u->parm = copy;
    checken_updgen(copy);
    return 1;
}


/* syntax highlighted by Code2HTML, v. 0.9.1 */