/* $Id: recurse.c,v 1.6 2003/04/05 18:07:24 ossi Exp $ *
*
* puf 0.9 Copyright (C) 2000-2003 by Oswald Buddenhagen <puf@ossi.cjb.net>
* based on puf 0.1.x (C) 1999,2000 by Anders Gavare <gavare@hotmail.com>
*
* You may modify and distribute this code under the terms of the GPL.
* There is NO WARRANTY of any kind. See COPYING for details.
*
* recurse.c - scan files/buffers for references to other urls
*
*/
#include "puf.h"
/*
 * Resolve a reference found in a document and queue it for retrieval.
 *
 * 1. Expand the reference into a fully qualified url (relative to
 *    referer if it is not absolute).
 * 2. Check whether it should be recursed into and, if so, add it.
 *
 * ref     - start of the reference text; it is addressed by length and
 *           need not be NUL-terminated
 * len     - number of characters in ref
 * referer - url of the document the reference was found in
 * is_req  - handed to parse_add_url() and same_dir() and stored in the
 *           new url's is_requisite field
 *
 * Nothing is returned: every failure path simply drops the reference,
 * some of them after printing a diagnostic.
 */
static void
add_reference(char *ref, int len, url_t *referer, int is_req)
{
    char buf[SHORTSTR];
    url_t *u;
    int totlen, pathlen, hash, p, t, sl;

    if (!len || !referer)
        return;

    checken("add_reference (top)");

    dbg(URL, ("add_reference '%.*s' by 'http://%s/%s'\n", len, ref,
              referer->host->name, referer->local_part));

    /* Only a result of -1 leaves the reference for the code below. */
    if (parse_add_url(ref, len, referer, 0, 0, is_req, 0,
                      referer->link_depth + 1) != -1)
        return; /* with protocol, maybe some error */

    if (ref[0] == '/') {
        /* Absolute path: start with an empty prefix, skip the slash. */
        pathlen = 0;
        p = 1;
    } else {
        /* Relative path: start with the referer's path prefix. */
        pathlen = referer->path_len;
        /* Defensive: nothing visible here bounds path_len by buf. */
        if (pathlen < 0 || pathlen > SHORTSTR) {
            prx(ERR, "relative ref '%.*s' in http://%s/%s too long\n",
                len, ref, referer->host->name, referer->local_part);
            return;
        }
        memcpy(buf, referer->local_part, pathlen);
        p = 0;
    }

    /*
     * Walk the reference one name at a time. sl is the separator that
     * ends a name: '/' normally, -1 (matches nothing) once a '?' has
     * been seen, so that the rest is taken over verbatim.
     */
    for (totlen = pathlen, sl = '/'; p < len; p++) {
        /* Here we are positioned at the first character of a name: */
        /* Find the next slash or end of string */
        /*
         * The cast keeps a 0xFF byte from comparing equal to the -1
         * sentinel on platforms where plain char is signed.
         */
        for (t = p; p < len && (unsigned char)ref[p] != sl; p++) {
            if (ref[p] == '?') {
                if (referer->parm->opt->inhibit_cgiget >= 0) {
                    dbg(URL, ("not adding relative ?-ref '%.*s'\n", len, ref));
                    return;
                } else {
                    sl = -1;
                }
            }
        }
        if (p - t > 0) { /* only add if there really was a name */
            if (p - t == 2 && ref[t] == '.' && ref[t + 1] == '.') {
                /* "..": drop the last directory collected so far */
                if (!totlen) {
                    prx(WRN,
                        "relative ref '%.*s' in http://%s/%s points below /\n",
                        len, ref, referer->host->name, referer->local_part);
                    return;
                }
                for (; --totlen > 0 && buf[totlen - 1] != '/'; );
                pathlen = totlen;
            } else if (p - t != 1 || ref[t] != '.') {
                /* ordinary name (a lone "." is skipped) */
                if (totlen + (p - t) + 1 > SHORTSTR) {
                    prx(ERR, "relative ref '%.*s' in http://%s/%s too long\n",
                        len, ref, referer->host->name, referer->local_part);
                    return;
                }
                memcpy(buf + totlen, ref + t, p - t);
                totlen += p - t;
                /*
                 * NOTE(review): for the last name this reads ref[len].
                 * The caller in this file (recursen) appears to always
                 * leave that byte inside its buffer - confirm for any
                 * other caller.
                 */
                if (ref[p] == '/') {
                    buf[totlen++] = '/';
                    pathlen = totlen;
                }
            }
        }
    }

    dbg(URL, ("'%.*s' => / '%.*s' '%.*s'\n", len, ref, pathlen, buf,
              totlen - pathlen, buf + pathlen));

    /*
     * A hit from find_url() ends processing (presumably the url is
     * already known); hash is filled in through the last argument and
     * stored in the new url below.
     */
    if (find_url(buf, totlen, referer->host->info, referer->port, &hash))
        return;
    if (!same_dir(buf, totlen, referer, is_req))
        return;

    /* The url structure and its local part share one allocation. */
    if (!(u = mmalloc(sizeof(*u) + totlen + 1)))
        return;
    u->local_part[totlen] = '\0';
    memcpy(u->local_part, buf, totlen);
#ifdef USE_MAGIC
    u->len = (u->local_part - (char *)&(u->len)) + totlen + 1;
#endif
    u->url_hash = hash;
    u->referer = referer;

    /* The new url shares the referer's parameter block. */
    checken_gen(referer->parm, "parm checksum failure in add_reference");
    u->parm = referer->parm;
    u->parm->ref_count++;
    checken_updgen(u->parm);

    u->host = referer->host;
    u->port = referer->port;
    u->path_len = pathlen;
    u->disp_pathoff = referer->disp_pathoff;
    u->is_top_dir = 0;
    u->is_requisite = is_req;
    u->relocs = 0;
    u->link_depth = referer->link_depth + 1;

    checken("add_reference (pre-end)");
    add_url(u);
}
/*
 * Find the first occurrence of rule in buff and return a pointer just
 * past the match, or 0 if there is none.
 * This is case-sensitive!!!
 *
 * buff - text to search
 * blen - number of leading characters of buff at which a match may
 *        START; the remainder of a match may lie beyond blen, so the
 *        caller has to keep that tail readable
 * rule - NUL-terminated pattern; an empty rule never matches
 */
static char *
matchen(char *buff, int blen, char *rule)
{
    int start, i;

    /* Without this, the comparison below would read past the rule. */
    if (!rule[0])
        return 0;

    /* Plain indices: no pointer is ever formed in front of buff. */
    for (start = 0; start < blen; start++) {
        if (buff[start] != rule[0])
            continue;
        /* First character fits, compare the rest of the rule. */
        for (i = 1; rule[i] && buff[start + i] == rule[i]; i++)
            ;
        if (!rule[i])
            return buff + start + i;
    }
    return 0;
}
/*
 * Look for every occurrence of the prefix `what` and pass the reference
 * that follows each one to add_reference().
 *
 * u      - url of the document being scanned; used as the referer
 * buf    - the text the references are taken from
 * lbuf   - the text that is searched for `what`; positions found in it
 *          are used as positions in buf
 * len    - number of characters in buf
 * max    - number of leading characters in which a match may start
 * what   - prefix to search for
 * is_req - passed through to add_reference()
 *
 * A reference ends at '#', at any character not greater than ' ', at
 * its closing delimiter, or shortly before the end of the buffer.
 */
static void
recursen(url_t *u, char *buf, char *lbuf, int len, int max,
         char *what, int is_req)
{
    char *hit;
    int pos = 0, start;

    while ((hit = matchen(lbuf + pos, max - pos, what)) != 0) {
        /* Continue right behind the matched prefix. */
        pos = hit - lbuf;

        /* Find the end of the ref-string */
        if (buf[pos] == '\\' && buf[pos + 1] == '"') {
            /* Opened by a backslash-escaped quote, closed by one too. */
            pos += 2;
            start = pos;
            for (; pos + 2 < len; pos++) {
                if (buf[pos] == '#' || buf[pos] <= ' ')
                    break;
                if (buf[pos] == '\\' && buf[pos + 1] == '"')
                    break;
            }
        } else if (buf[pos] == '"') {
            /* Opened by a plain quote, closed by the next one. */
            pos++;
            start = pos;
            for (; pos + 1 < len; pos++) {
                if (buf[pos] == '#' || buf[pos] <= ' ' || buf[pos] == '"')
                    break;
            }
        } else {
            /* Unquoted: the closing '>' ends it. */
            start = pos;
            for (; pos + 1 < len; pos++) {
                if (buf[pos] == '#' || buf[pos] <= ' ' || buf[pos] == '>')
                    break;
            }
        }

        /* Empty references are not worth a call. */
        if (pos > start)
            add_reference(buf + start, pos - start, u, is_req);
    }
}
/*
 * Scan a buffer for references.
 *
 * u       - url of the document the buffer belongs to
 * buf     - text to scan
 * len     - number of characters in buf
 *           NOTE(review): len is not checked against the size of the
 *           local copy (MAXBUFSIZE + OVERLAPLEN) - callers must honour it
 * notlast - non-zero reserves OVERLAPLEN characters at the end of the
 *           buffer in which no match may start, zero reserves 15
 *
 * Returns the number of leading characters in which matches were
 * looked for, or 0 if the buffer is too short to be scanned at all.
 */
int
recurse_buff(url_t *u, char *buf, int len, int notlast)
{
    char lbuf[MAXBUFSIZE + OVERLAPLEN];
    int p, max = len - (notlast ? OVERLAPLEN : 15);

    if (max <= 0)
        return 0;

    checken("recurse_buff (top)");

    /*
     * Searching is done in a lower-cased copy, as matchen() is
     * case-sensitive. tolower() is only defined for unsigned char
     * values (and EOF), hence the cast.
     */
    for (p = 0; p < len; p++)
        lbuf[p] = tolower((unsigned char)buf[p]);

    if (u->parm->opt->follow_src > DONT_FETCH) {
        recursen(u, buf, lbuf, len, max, "src=", 1);
        recursen(u, buf, lbuf, len, max, "background=", 1);
    }
    if (u->parm->opt->follow_href > NOT_RECURSIVE)
        recursen(u, buf, lbuf, len, max, "href=", 0);

    checken("recurse_buff (end)");
    return max;
}
/*
 * Scan a partial file for references.
 *
 * u    - url of the document being read
 * fi   - descriptor to read from; reading stops as soon as a read()
 *        does not fill the buffer completely
 * bupo - if non-null, the trailing bytes that recurse_buff() did not
 *        cover are stored in front of *bupo and *bupo is moved back
 *        to their start
 * lepo - used only together with bupo: *lepo grows by the number of
 *        bytes stored
 *
 * A failing read() is reported and then treated like end of data.
 */
void
recurse_pfile(url_t *u, int fi, char **bupo, int *lepo)
{
    char buf[MAXBUFSIZE];
    int len, off, got;

    /* Scan file for href's and src's: */
    off = 0;
    for (;;) {
        got = read(fi, buf + off, MAXBUFSIZE - off);
        if (got < 0) {
            if (errno == EINTR)
                continue;
            /* Never let -1 leak into the length arithmetic below. */
            prx(ERR, "read error while scanning for links: %s\n",
                strerror(errno));
            got = 0;
        }
        len = off + got;
        if (len != MAXBUFSIZE)
            break;

        /* Full buffer: scan it and carry its tail into the next round. */
        recurse_buff(u, buf, len, 1);
        off = OVERLAPLEN;
        memcpy(buf, buf + MAXBUFSIZE - OVERLAPLEN, off);
    }

    off = recurse_buff(u, buf, len, bupo != 0);
    if (bupo) {
        /* Hand the part that was not covered back to the caller. */
        len -= off;
        *bupo -= len;
        *lepo += len;
        memcpy(*bupo, buf + off, len);
    }
    checken("recurse_pfile (end)");
}
/*
 * Scan an entire file for references.
 *
 * u    - url the file's contents belong to
 * name - name of the file to open
 *
 * If the file cannot be opened, an error is printed and nothing is
 * scanned.
 */
void
recurse_file(url_t *u, char *name)
{
    int fd;

    fd = mmfopen(name, O_RDONLY, &fd);
    if (fd < 0) {
        prx(ERR, "cannot scan %s for links: %s\n", name, strerror(errno));
        return;
    }

    /* No leftover buffer is wanted, so the file is scanned to its end. */
    recurse_pfile(u, fd, 0, 0);
    close(fd);
}
/* syntax highlighted by Code2HTML, v. 0.9.1 */