/* $Id: recurse.c,v 1.6 2003/04/05 18:07:24 ossi Exp $ *
 *
 * puf 0.9  Copyright (C) 2000-2003 by Oswald Buddenhagen
 * based on puf 0.1.x (C) 1999,2000 by Anders Gavare
 *
 * You may modify and distribute this code under the terms of the GPL.
 * There is NO WARRANTY of any kind. See COPYING for details.
 *
 * recurse.c - scan files/buffers for references to other urls
 *
 */

#include "puf.h"

/*
 * Resolve a reference found in a document and queue it for fetching.
 *
 *  1. Expand filename into a fully qualified url (relative
 *     to referer if it's not absolute)
 *  2. Check to see if it should be recursed into, and if so
 *     then add it
 *
 * ref/len  - the reference text (not NUL-terminated) and its length
 * referer  - url of the document the reference was found in
 * is_req   - stored in the new url's is_requisite field and passed on
 *            to parse_add_url() and same_dir()
 *
 * The reference is first handed to parse_add_url(); only if that returns
 * -1 is it treated as a host-relative path and normalised here ("." and
 * ".." components are resolved against the referer's directory).
 */
static void
add_reference(char *ref, int len, url_t *referer, int is_req)
{
    char buf[SHORTSTR];
    url_t *u;
    int totlen, pathlen, hash, p, t, sl;

    if (!len || !referer)
        return;

    checken("add_reference (top)");

    dbg(URL, ("add_reference '%.*s' by 'http://%s/%s'\n", len, ref, referer->host->name, referer->local_part));

    if (parse_add_url(ref, len, referer, 0, 0, is_req, 0, referer->link_depth + 1) != -1)
        return;    /* with protocol, maybe some error */

    if (ref[0] == '/') {
        /* Absolute path on the same host: start from the root. */
        pathlen = 0;
        p = 1;
    } else {
        /* Relative path: start from the referer's directory part. */
        pathlen = referer->path_len;
        if (pathlen > SHORTSTR) {
            /* The directory prefix alone would overflow buf. */
            prx(ERR, "relative ref '%.*s' in http://%s/%s too long\n", len, ref, referer->host->name, referer->local_part);
            return;
        }
        memcpy(buf, referer->local_part, pathlen);
        p = 0;
    }

    /*
     * Walk the reference one path component at a time.  totlen is the
     * number of bytes assembled in buf so far, pathlen the length of its
     * directory part (up to and including the last '/').  sl is the
     * component separator; once a '?' has been accepted it is set to -1
     * so that the remainder is taken as a single component.
     */
    for (totlen = pathlen, sl = '/'; p < len; p++) {
        /* Here we are positioned at the first character of a name: */
        /* Find the next slash or end of string */
        for (t = p; p < len && ref[p] != sl; p++) {
            if (ref[p] == '?') {
                if (referer->parm->opt->inhibit_cgiget >= 0) {
                    dbg(URL, ("not adding relative ?-ref '%.*s'\n", len, ref));
                    return;
                } else
                    sl = -1;
            }
        }

        if (p - t <= 0)
            continue;    /* empty component, e.g. "//" */

        /* only add if there really was a name */
        if (p - t == 2 && ref[t] == '.' && ref[t + 1] == '.') {
            /* ".." - drop the last directory from buf */
            if (!totlen) {
                prx(WRN, "relative ref '%.*s' in http://%s/%s points below /\n", len, ref, referer->host->name, referer->local_part);
                return;
            }
            do {
                totlen--;
            } while (totlen > 0 && buf[totlen - 1] != '/');
            pathlen = totlen;
        } else if (p - t != 1 || ref[t] != '.') {
            /* ordinary name ("." alone is simply skipped) */
            if (totlen + (p - t) + 1 > SHORTSTR) {
                prx(ERR, "relative ref '%.*s' in http://%s/%s too long\n", len, ref, referer->host->name, referer->local_part);
                return;
            }
            memcpy(buf + totlen, ref + t, p - t);
            totlen += p - t;
            /*
             * Only look at ref[p] while it is still part of the
             * reference; at p == len it is whatever byte happens to
             * follow the reference in the caller's buffer.
             */
            if (p < len && ref[p] == '/') {
                buf[totlen++] = '/';
                pathlen = totlen;
            }
        }
    }

    dbg(URL, ("'%.*s' => / '%.*s' '%.*s'\n", len, ref, pathlen, buf, totlen - pathlen, buf + pathlen));

    /* Skip urls that are already known; find_url() also yields the hash. */
    if (find_url(buf, totlen, referer->host->info, referer->port, &hash))
        return;

    if (!same_dir(buf, totlen, referer, is_req))
        return;

    /* url_t is allocated with room for the path plus its terminator. */
    if (!(u = mmalloc(sizeof(*u) + totlen + 1)))
        return;

    u->local_part[totlen] = '\0';
    memcpy(u->local_part, buf, totlen);
#ifdef USE_MAGIC
    u->len = (u->local_part - (char *)&(u->len)) + totlen + 1;
#endif
    u->url_hash = hash;
    u->referer = referer;
    checken_gen(referer->parm, "parm checksum failure in add_reference");
    /* The new url shares the referer's parameter set. */
    u->parm = referer->parm;
    u->parm->ref_count++;
    checken_updgen(u->parm);
    u->host = referer->host;
    u->port = referer->port;
    u->path_len = pathlen;
    u->disp_pathoff = referer->disp_pathoff;
    u->is_top_dir = 0;
    u->is_requisite = is_req;
    u->relocs = 0;
    u->link_depth = referer->link_depth + 1;

    checken("add_reference (pre-end)");

    add_url(u);
}

/*
 * Find rule in buff and return a pointer just past the match, or 0 if
 * there is none.
 * This is case-sensitive!!!
 *
 * Only the first blen bytes of buff are considered as match *start*
 * positions.  A candidate that starts inside that window is compared
 * against the bytes following it without a further bounds check, so the
 * caller must keep at least strlen(rule) - 1 readable bytes after
 * buff + blen.  rule must not be empty.
 */
static char *
matchen(char *buff, int blen, char *rule)
{
    int start, k;

    for (start = 0; start < blen; start++) {
        for (k = 0; rule[k] && buff[start + k] == rule[k]; k++)
            ;
        if (!rule[k])
            return buff + start + k;
    }
    return 0;
}

/*
 * Search for every occurrence of the attribute prefix `what` and hand the
 * value that follows it to add_reference().
 *
 * u       - url of the document being scanned (becomes the referer)
 * buf     - the original document bytes
 * lbuf    - lower-cased copy of buf; matching is done here, the value is
 *           taken from buf at the same offset
 * len     - number of valid bytes in buf/lbuf
 * max     - only matches starting before this offset are considered
 * what    - lower-case prefix to look for, e.g. "href="
 * is_req  - passed through to add_reference()
 *
 * Three value forms are recognised: \"...\" (backslash-escaped quotes),
 * "..." and an unquoted value ending at '>'.  In every form the value
 * also ends at '#' or at any char that does not compare greater than ' '.
 */
static void
recursen(url_t *u, char *buf, char *lbuf, int len, int max, char *what, int is_req)
{
    char *moff;
    int p, p2;

    for (p = 0; (moff = matchen(lbuf + p, max - p, what)); ) {
        p = moff - lbuf;

        /* Find the end of the ref-string */
        if (buf[p] == '\\' && buf[p + 1] == '"') {
            p += 2;
            p2 = p;
            while (p + 2 < len && buf[p] != '#' && buf[p] > ' ' &&
                   (buf[p] != '\\' || buf[p + 1] != '"'))
                p++;
        } else if (buf[p] == '"') {
            p++;
            p2 = p;
            while (p + 1 < len && buf[p] != '#' && buf[p] > ' ' && buf[p] != '"')
                p++;
        } else {
            p2 = p;
            while (p + 1 < len && buf[p] != '#' && buf[p] > ' ' && buf[p] != '>')
                p++;
        }

        if (p - p2 > 0)
            add_reference(buf + p2, p - p2, u, is_req);
    }
}

/*
 * scan a buffer for references
 *
 * u        - url of the document the buffer belongs to
 * buf/len  - the data to scan
 * notlast  - non-zero if more data of the same document will follow; in
 *            that case the last OVERLAPLEN bytes are not used as match
 *            start positions, otherwise the last 15 bytes are not
 *
 * Returns the number of leading bytes that were searched for matches
 * (0 if the buffer is too short to be scanned at all).
 */
int
recurse_buff(url_t *u, char *buf, int len, int notlast)
{
    char lbuf[MAXBUFSIZE + OVERLAPLEN];
    int p, max;

    /* Never write past the lower-case copy; the return value then tells
       the caller how much was really looked at. */
    if (len > (int)sizeof(lbuf))
        len = (int)sizeof(lbuf);

    max = len - (notlast ? OVERLAPLEN : 15);
    if (max <= 0)
        return 0;

    checken("recurse_buff (top)");

    /* <ctype.h> functions need an unsigned char value; a plain char may
       be negative. */
    for (p = 0; p < len; p++)
        lbuf[p] = (char)tolower((unsigned char)buf[p]);

    if (u->parm->opt->follow_src > DONT_FETCH) {
        recursen(u, buf, lbuf, len, max, "src=", 1);
        recursen(u, buf, lbuf, len, max, "background=", 1);
    }

    if (u->parm->opt->follow_href > NOT_RECURSIVE)
        recursen(u, buf, lbuf, len, max, "href=", 0);

    checken("recurse_buff (end)");

    return max;
}

/*
 * Read from fd until `want` bytes are stored in dst, end of file is
 * reached or a read error other than EINTR occurs.
 * Returns the number of bytes stored, which is never negative; a hard
 * read error is treated like end of file.
 */
static int
fill_scan_buf(int fd, char *dst, int want)
{
    int got = 0, n;

    while (got < want) {
        n = (int)read(fd, dst + got, want - got);
        if (n > 0)
            got += n;
        else if (n == 0 || errno != EINTR)
            break;
    }
    return got;
}

/*
 * scan a partial file for references
 *
 * u     - url of the document being scanned
 * fi    - file descriptor to read from, scanned from its current
 *         position to end of file
 * bupo  - if non-null, the unscanned tail of the data is handed back:
 *         *bupo is moved back by the size of the tail and the tail is
 *         copied there (the caller presumably guarantees room in front
 *         of *bupo - verify against the callers)
 * lepo  - *lepo is increased by the size of that tail; only used when
 *         bupo is non-null
 */
void
recurse_pfile(url_t *u, int fi, char **bupo, int *lepo)
{
    char buf[MAXBUFSIZE];
    int len, off;

    /* Scan file for href's and src's: */
    off = 0;
    while ((len = off + fill_scan_buf(fi, buf + off, MAXBUFSIZE - off)) == MAXBUFSIZE) {
        recurse_buff(u, buf, len, 1);
        /* Carry the not yet searched tail over to the next round. */
        off = OVERLAPLEN;
        memcpy(buf, buf + MAXBUFSIZE - OVERLAPLEN, off);
    }
    off = recurse_buff(u, buf, len, bupo != 0);
    if (bupo) {
        len -= off;
        *bupo -= len;
        *lepo += len;
        memcpy(*bupo, buf + off, len);
    }

    checken("recurse_pfile (end)");
}

/*
 * scan an entire file for references
 *
 * u     - url the file was fetched from
 * name  - name of the local file to open and scan
 *
 * Prints an error if the file cannot be opened.
 */
void
recurse_file(url_t *u, char *name)
{
    int fi;

    if ((fi = mmfopen(name, O_RDONLY, &fi)) >= 0) {
        recurse_pfile(u, fi, 0, 0);
        close(fi);
    } else
        prx(ERR, "cannot scan %s for links: %s\n", name, strerror(errno));
}