/* $Id: recurse.c,v 1.6 2003/04/05 18:07:24 ossi Exp $ *
 *
 * puf 0.9  Copyright (C) 2000-2003 by Oswald Buddenhagen <puf@ossi.cjb.net>
 * based on puf 0.1.x (C) 1999,2000 by Anders Gavare <gavare@hotmail.com>
 *
 * You may modify and distribute this code under the terms of the GPL.
 * There is NO WARRANTY of any kind. See COPYING for details.
 *
 * recurse.c - scan files/buffers for references to other urls
 *
 */

#include "puf.h"


/*
 *  Resolve a reference found in a document and queue it for fetching.
 *
 *    ref, len  the reference text; it is NOT NUL-terminated, only
 *              ref[0] .. ref[len - 1] belong to it
 *    referer   url of the document the reference was found in
 *    is_req    copied into the new url's is_requisite field and passed
 *              on to parse_add_url() and same_dir()
 *
 *  Nothing happens when len is 0 or referer is null.
 *
 *  1.  The reference is first offered to parse_add_url(); any result
 *      other than -1 ends processing here.
 *  2.  Otherwise it is expanded into a path relative to the referer
 *      (or to the root if it starts with '/'), collapsing "." and ".."
 *      components.
 *  3.  The result is dropped if find_url() reports a hit, if same_dir()
 *      rejects it, or if allocation fails; else a new url_t inheriting
 *      host, port and parameters from the referer is handed to add_url().
 */
static void 
add_reference(char *ref, int len, url_t *referer, int is_req)
{
    char buf[SHORTSTR];
    url_t *u;
    int totlen, pathlen, hash, p, t, sl;

    if (!len || !referer)
	return;

    checken("add_reference (top)");

    dbg(URL, ("add_reference '%.*s' by 'http://%s/%s'\n", len, ref,
	referer->host->name, referer->local_part));

    if (parse_add_url(ref, len, referer, 0, 0, is_req, 0, 
		      referer->link_depth + 1) != -1)
	return;		/*  with protocol, maybe some error  */

    if (ref[0] == '/') {
	/*  absolute path: start from scratch and skip the leading slash  */
	pathlen = 0;
	p = 1;
    } else {
	/*  relative path: start with the directory part of the referer  */
	pathlen = referer->path_len;
	if (pathlen > SHORTSTR) {
	    /*  would not fit into buf; refuse instead of overrunning it  */
	    prx(ERR, "relative ref '%.*s' in http://%s/%s too long\n", 
		len, ref, referer->host->name, referer->local_part);
	    return;
	}
	memcpy(buf, referer->local_part, pathlen);
	p = 0;
    }

    /*
     *  totlen is the number of bytes assembled in buf so far, pathlen the
     *  length of its directory part (up to and including the last '/').
     *  sl is the component separator; it is set to -1 once a '?' has been
     *  seen so that the rest of the reference is taken as one component.
     */
    for (totlen = pathlen, sl = '/'; p < len; p++) {
	/*  Here we are positioned at the first character of a name:  */

	/*  Find the next slash or end of string. The cast keeps byte 0xff
	    from comparing equal to sl == -1 where char is signed.  */
	for (t = p; p < len && (unsigned char)ref[p] != sl; p++)
	    if (ref[p] == '?') {
		if (referer->parm->opt->inhibit_cgiget >= 0) {
		    dbg(URL, ("not adding relative ?-ref '%.*s'\n", len, ref));
		    return;
		} else
		    sl = -1;
	    }

	if (p - t > 0) {	/*  only add if there really was a name  */
	    if (p - t == 2 && ref[t] == '.' && ref[t + 1] == '.') {
		/*  ".." - drop the last directory assembled so far  */
		if (!totlen) {
		    prx(WRN, 
			"relative ref '%.*s' in http://%s/%s points below /\n", 
		        len, ref, referer->host->name, referer->local_part);
		    return;
		}
		/*  step back to just behind the previous slash (or to 0)  */
		for (; --totlen > 0 && buf[totlen - 1] != '/'; );
		pathlen = totlen;
	    } else if (p - t != 1 || ref[t] != '.') {
		/*  ordinary name ("." is silently skipped); the + 1
		    reserves room for a possible trailing slash  */
		if (totlen + (p - t) + 1 > SHORTSTR) {
		    prx(ERR, "relative ref '%.*s' in http://%s/%s too long\n", 
			len, ref, referer->host->name, referer->local_part);
		    return;
		}
		memcpy(buf + totlen, ref + t, p - t);
		totlen += p - t;
		/*  ref[len] is not part of the reference, so only look at
		    ref[p] while still inside it  */
		if (p < len && ref[p] == '/') {
		    buf[totlen++] = '/';
		    pathlen = totlen;
		}
	    }
	}
    }
    dbg(URL, ("'%.*s' => / '%.*s' '%.*s'\n", len, ref, pathlen, buf,
	totlen - pathlen, buf + pathlen));

    /*  find_url() also delivers the hash that is stored in the new url  */
    if (find_url(buf, totlen, referer->host->info, referer->port, &hash))
	return;

    if (!same_dir(buf, totlen, referer, is_req))
	return;

    /*  the local part is stored right behind the structure  */
    if (!(u = mmalloc(sizeof(*u) + totlen + 1)))
	return;

    u->local_part[totlen] = '\0';
    memcpy(u->local_part, buf, totlen);

#ifdef USE_MAGIC
    u->len = (u->local_part - (char *)&(u->len)) + totlen + 1;
#endif
    u->url_hash = hash;
    u->referer = referer;
    checken_gen(referer->parm, "parm checksum failure in add_reference");
    /*  the parameter block is shared with the referer  */
    u->parm = referer->parm;
    u->parm->ref_count++;
    checken_updgen(u->parm);
    u->host = referer->host;
    u->port = referer->port;
    u->path_len = pathlen;
    u->disp_pathoff = referer->disp_pathoff;
    u->is_top_dir = 0;
    u->is_requisite = is_req;
    u->relocs = 0;
    u->link_depth = referer->link_depth + 1;

    checken("add_reference (pre-end)");

    add_url(u);
}


/*
 * Find the first occurrence of rule in buff and return a pointer just
 * past the match, or 0 if there is none.
 *
 *   buff  text to search
 *   blen  number of leading positions of buff at which a match may START;
 *         a match (or a failed comparison) may run past buff[blen - 1], so
 *         the caller must keep at least strlen(rule) - 1 further readable
 *         bytes behind it (recurse_buff() reserves slack for this)
 *   rule  NUL-terminated string to look for; an empty rule never matches
 *
 * A blen of 0 or less yields 0.
 * This is case-sensitive!!!
 */
static char *
matchen(char *buff, int blen, char *rule)
{
    int start, k;

    if (!*rule)
	return 0;

    for (start = 0; start < blen; start++) {
	if (buff[start] != rule[0])
	    continue;
	/*  first character fits - compare the remainder of the rule  */
	for (k = 1; rule[k] && buff[start + k] == rule[k]; k++)
	    ;
	if (!rule[k])
	    return buff + start + k;
    }

    return 0;
}

/*
 *  Find every occurrence of the attribute prefix `what' and pass the
 *  reference following it to add_reference().
 *
 *    u       url of the scanned document; becomes the referer
 *    buf     the original text (len bytes); references are taken from it
 *    lbuf    the text that is searched for `what'; offsets in lbuf and
 *            buf correspond (recurse_buff() passes a lower-cased copy)
 *    len     number of valid bytes in buf
 *    max     matches must start before this offset
 *    what    attribute prefix to look for, e.g. "href="
 *    is_req  passed through to add_reference()
 *
 *  The value may be enclosed in \"...\", in "..." or be unquoted. It ends
 *  at the closing delimiter ('>' for unquoted values), at a '#', at any
 *  character <= ' ', or shortly before the end of the buffer. Empty
 *  values are ignored.
 */
static void 
recursen(url_t *u, char *buf, char *lbuf, int len, int max, 
	 char *what, int is_req)
{
    char *moff;
    int p, p2;

    for (p = 0; (moff = matchen(lbuf + p, max - p, what)); ) {
	/*  offset of the first character behind the attribute prefix  */
	p = moff - lbuf;

	/*
	 *  Find the end of the ref-string; p2 marks its start.
	 *  Characters are compared as unsigned so that bytes >= 0x80 are
	 *  part of the reference regardless of the signedness of char.
	 */
	if (buf[p] == '\\' && buf[p + 1] == '"') {
	    p += 2;
	    p2 = p;
	    while (p + 2 < len && buf[p] != '#' && 
		   (unsigned char)buf[p] > ' ' && 
		   (buf[p] != '\\' || buf[p + 1] != '"'))
		p++;
	} else if (buf[p] == '"') {
	    p++;
	    p2 = p;
	    while (p + 1 < len && buf[p] != '#' && 
		   (unsigned char)buf[p] > ' ' &&
		   buf[p] != '"')
		p++;
	} else {
	    p2 = p;
	    while (p + 1 < len && buf[p] != '#' && 
		   (unsigned char)buf[p] > ' ' &&
		   buf[p] != '>')
		p++;
	}

	if (p - p2 > 0)
	    add_reference(buf + p2, p - p2, u, is_req);

	/*  the search continues behind the reference just handled  */
    }
}

/*
 *  Scan a buffer for references.
 *
 *    u        url of the document the buffer belongs to
 *    buf      the text, len bytes
 *    notlast  nonzero if more data will follow this buffer
 *
 *  Attribute names are searched in a lower-cased copy of the buffer, so
 *  they match regardless of case; the references themselves are taken
 *  from the original text. "src=" and "background=" are followed (as
 *  requisites) if opt->follow_src > DONT_FETCH, "href=" if
 *  opt->follow_href > NOT_RECURSIVE.
 *
 *  Only matches starting in the first `max' bytes are considered, where
 *  max is len minus OVERLAPLEN (if notlast) or minus 15 (otherwise).
 *  NOTE(review): the 15 is presumably slack for the longest attribute
 *  name plus an opening quote - confirm.
 *  NOTE(review): lbuf holds MAXBUFSIZE + OVERLAPLEN bytes; len is assumed
 *  not to exceed that - verify against callers.
 *
 *  Returns max, or 0 without scanning anything if max is not positive.
 */
int 
recurse_buff(url_t *u, char *buf, int len, int notlast)
{
    char lbuf[MAXBUFSIZE + OVERLAPLEN];
    int p, max = len - (notlast ? OVERLAPLEN : 15);

    if (max <= 0)
	return 0;

    checken("recurse_buff (top)");

    /*  tolower() is only defined for EOF and values representable as
	unsigned char, so a possibly negative char must not be passed  */
    for (p = 0; p < len; p++)
	lbuf[p] = tolower((unsigned char)buf[p]);

    if (u->parm->opt->follow_src > DONT_FETCH) {
	recursen(u, buf, lbuf, len, max, "src=", 1);
	recursen(u, buf, lbuf, len, max, "background=", 1);
    }
    if (u->parm->opt->follow_href > NOT_RECURSIVE)
	recursen(u, buf, lbuf, len, max, "href=", 0);

    checken("recurse_buff (end)");

    return max;
}


/*
 *  Scan a partial file for references.
 *
 *    u     url of the document being scanned
 *    fi    descriptor to read from (starting at its current position)
 *    bupo  if non-null: address of a pointer in front of which the
 *          not yet scanned tail of the data is stored
 *    lepo  if bupo is non-null: address of a length that is increased
 *          by the size of that tail
 *
 *  The data is read in chunks of MAXBUFSIZE bytes; the last OVERLAPLEN
 *  bytes of every full chunk are carried over to the start of the next
 *  one. Reading stops as soon as read() does not fill the buffer.
 *  A failing read() is treated like end of data.
 *
 *  With bupo set, the final chunk is scanned with notlast set and the
 *  bytes recurse_buff() did not cover are copied to *bupo after moving
 *  *bupo back by their count.
 *  NOTE(review): this presumes the caller keeps enough room in front of
 *  *bupo - confirm.
 */

void 
recurse_pfile(url_t *u, int fi, char **bupo, int *lepo)
{
    char buf[MAXBUFSIZE];
    int len, off, got;

    /*  Scan file for href's and src's:  */
    off = 0;
    for (;;) {
	got = read(fi, buf + off, MAXBUFSIZE - off);
	if (got < 0)
	    got = 0;	/*  keep -1 out of the length arithmetic below  */
	len = off + got;
	if (len != MAXBUFSIZE)
	    break;
	recurse_buff(u, buf, len, 1);
	/*  keep the unscanned tail for the next round  */
	off = OVERLAPLEN;
	memcpy(buf, buf + MAXBUFSIZE - OVERLAPLEN, off);
    }
    off = recurse_buff(u, buf, len, bupo != 0);
    if (bupo) {
	/*  hand the bytes behind the scanned part back to the caller  */
	len -= off;
	*bupo -= len;
	*lepo += len;
	memcpy(*bupo, buf + off, len);
    }

    checken("recurse_pfile (end)");
}


/*
 *  Scan an entire file for references.
 *
 *    u     url of the document stored in the file
 *    name  name of the file to open
 *
 *  The file is opened read-only, handed to recurse_pfile() without a
 *  carry-over buffer and closed again. If it cannot be opened, an error
 *  is reported and nothing is scanned.
 */

void 
recurse_file(url_t *u, char *name)
{
    int fi;

    fi = mmfopen(name, O_RDONLY, &fi);
    if (fi < 0) {
	prx(ERR, "cannot scan %s for links: %s\n", name, strerror(errno));
	return;
    }

    recurse_pfile(u, fi, 0, 0);
    close(fi);
}



/* syntax highlighted by Code2HTML, v. 0.9.1 */