/* $Id: http_rsp.c,v 1.12 2004/03/07 12:48:20 ossi Exp $ *
 *
 * puf 0.9  Copyright (C) 2000-2004 by Oswald Buddenhagen
 * based on puf 0.1.x (C) 1999,2000 by Anders Gavare
 *
 * You may modify and distribute this code under the terms of the GPL.
 * There is NO WARRANTY of any kind. See COPYING for details.
 *
 * http_rsp.c - receive and process http response message
 *
 */

#include "puf.h"

/* When non-zero, mmfopen() keeps at most one target file open at a time:
   the handle it returns is closed again on the next mmfopen() call. */
int economize_files;

/*
 * Create all directories in the path.
 *
 * Every '/' except one at index 0 is temporarily replaced by a NUL so that
 * the prefix up to it can be handed to mkdir(); the buffer is restored
 * afterwards. mkdir() results are ignored here - the caller retries its
 * open() and reports the failure if the directories could not be created.
 */
static void create_dir(char *buf)
{
    int p;

    for (p = 0; buf[p]; p++) {
        if (p && buf[p] == '/') {
            buf[p] = '\0';
            mkdir(buf, 0777);
            buf[p] = '/';
        }
    }
}

/*
 * Open a file. Create the directory it should live in, if it's not there.
 * Try to free up handles, if necessary.
 *
 * name  - path to open (temporarily modified by create_dir(), then restored)
 * flags - open(2) flags; new files are created with mode 0666
 * f     - receives the descriptor, unless economize_files is set
 *
 * Returns the descriptor on success, -2 if the process/system ran out of
 * file handles and none could be freed, -1 on any other open() failure.
 *
 * With economize_files set the descriptor is remembered in a static
 * variable instead of *f and is closed at the start of the next call.
 */
int mmfopen(char *name, int flags, int *f)
{
    int fi, try_free, try_mkdir;
    static int cf = -1;

    if (cf != -1) {
        close(cf);
        cf = -1;
    }

    try_free = try_mkdir = 0;
  retry:
    if ((fi = open(name, flags, 0666)) < 0) {
        if (errno == ENFILE || errno == EMFILE) {
            /* out of handles: steal one from another download, but only once */
            if (!try_free && !economize_files && free_fd()) {
                try_free++;
                goto retry;
            }
            return -2;
        }
        if (errno == ENOENT && !try_mkdir) {
            /* a path component is missing: create the directories, once */
            create_dir(name);
            try_mkdir++;
            goto retry;
        }
        return -1;
    }

    if (economize_files)
        return cf = fi;
    else
        return *f = fi;
}

/* Open the target file of `au` and store the handle in the aurl_t structure
   (subject to the economize_files behaviour of mmfopen()). */
static int mfopen(aurl_t *au, int flags)
{
    return mmfopen(au->disposition, flags, &(au->f));
}

/*
 * Try to "steal" a handle from an open target file.
 *
 * Walks queue_urls_reply and closes the first entry that has an open
 * file (f != -1). Returns 1 if a handle was released, 0 otherwise.
 */
int free_fd()
{
    lnq_iterate(queue_urls_reply, aurl_t, au, {
        if (au->f != -1) {
            close(au->f);
            au->f = -1;
            return 1;
        }
    });
    return 0;
}

/*
 * Save data to the overlap buffer of `au`.
 *
 * Note that we possibly are saving contents of the previous buffer, i.e.
 * `buf` may point into au->buffer itself. Hence the in-place case must use
 * memmove(), and the growing case copies before the old buffer is freed.
 *
 * The buffer is (re)allocated with at least OVERLAPLEN bytes when the data
 * does not fit. Returns 1 on success. On allocation failure the overlap
 * buffer is released, its bookkeeping is reset to "empty" and 0 is returned.
 */
static int save_buff(aurl_t *au, char *buf, int len)
{
    char *bp;
    int siz;

    siz = len < OVERLAPLEN ? OVERLAPLEN : len;
    if (siz > au->size) {
        if (!(bp = mmalloc(siz))) {
            /* keep size/offset consistent with the (now missing) buffer,
               so that a later call cannot copy into a NULL pointer */
            free(au->buffer);
            au->buffer = 0;
            au->size = au->offset = 0;
            return 0;
        }
        memcpy(bp, buf, len);
        free(au->buffer);
        au->buffer = bp;
        au->size = siz;
        au->offset = len;
    } else {
        memmove(au->buffer, buf, len);
        au->offset = len;
    }
    return 1;
}

/*
 * Tell whether the document fetched by `au` has to be scanned for further
 * references: the recursion depth limit (if any) is not reached yet, some
 * kind of link following is enabled, and the content is HTML (or is forced
 * to be treated as such).
 */
static int needs_recurse(aurl_t *au)
{
    return (!au->url->parm->opt->max_recurse ||
            au->url->link_depth < au->url->parm->opt->max_recurse) &&
           (au->url->parm->opt->follow_href > NOT_RECURSIVE ||
            au->url->parm->opt->follow_src > DONT_FETCH) &&
           (au->content_is_html || au->url->parm->opt->force_html);
}

/*
 * Handle http reply message.
 *
 * Reads one chunk from au->socket, parses the response header (possibly
 * spread over several calls, using the overlap buffer to carry incomplete
 * lines), then writes the body to the target file and optionally scans it
 * for references.
 *
 * Returns RT_OK when more data is expected, RT_DONE when the file is
 * complete, RT_RETRY when the overlap buffer could not be allocated,
 * RT_AGAIN / RT_REFUSED / RT_TIMEOUT / RT_SKIP for the respective http
 * conditions, or whatever errm() returns for reported errors.
 */
int handle_reply(aurl_t *au)
{
    char *bufp, *nbuf;
    ptrarr_t *sh;
    int fi, a, e, l, o, p, len, alen, orglen, nsiz;
    unsigned u;
    char databuf[OVERLAPLEN + MAXBUFSIZE], buf[SHORTSTR];

    /* Receive some data: */
    bufp = databuf + OVERLAPLEN;
    if ((orglen = read(au->socket, bufp, MAXBUFSIZE)) < 0)
        return errm(au->url, "data read for $u failed");
    len = orglen + au->offset;

    if (orglen) {
        /* Copy overlap buffer from last read */
        if (au->offset) {
            if (au->offset > OVERLAPLEN) {
                /* too much for the reserved space in front of the read
                   area: append the fresh data to the heap buffer instead */
                if (len > MAXHEADERLEN ||
                    !(nbuf = mrealloc(au->buffer, len)))
                {
                    return errm(au->url,
                                "reply header for $u has insane length");
                } else {
                    memcpy(nbuf + au->offset, bufp, orglen);
                    au->buffer = bufp = nbuf;
                    au->size = len;
                }
            } else {
                bufp -= au->offset;
                memcpy(bufp, au->buffer, au->offset);
            }
        }
    } else {
        /* note: close with size_fetched < size_total is accepted.
           this is basically incorrect, but common practice. */
        /* handle remaining data in overlap buffer */
        bufp = au->buffer;
    }

    /* first the http message header */
    if (!au->http_done_header) {
        for (p = 0;;) {
            /* find the end of the line starting at a */
            for (a = p;;) {
                /* reached end-of-buffer before header end? */
                if (p >= len) {
                    if (!orglen)
                        return errm(au->url, "broken reply header for $u");
                    /* keep the incomplete line for the next call */
                    return save_buff(au, bufp + a, len - a) ?
                           RT_OK : RT_RETRY;
                }
                if (bufp[p++] == '\n')
                    break;
            }
            /* strip trailing whitespace / line terminator */
            for (e = p - 1; e > a && bufp[e - 1] <= ' '; e--);
            l = e - a;
            bufp[e] = '\0';
            dbg(HDR, ("read header: %s\n", bufp + a));

            if (!au->http_result_code) {
                /* empty lines before reply header are incorrect,
                   but should be handled for robustness */
                if (l) {
                    /* get result code; the line must be long enough to
                       actually contain a digit at offset 9 ("HTTP/x.y ") */
                    if (l < 10 || strncasecmp(bufp + a, "http/", 5))
                        return errm(au->url, "broken reply header for $u");
                    au->http_result_code = atoi(bufp + a + 9);
                    /* 1xx responses are skipped when their header ends */
                    if (bufp[a + 9] != '1')
                        switch (au->http_result_code) {
                        case 200: /* ok */
                            au->file_off = 0;
                            /* fallthrough */
                        case 206: /* partial content */
                        case 304: /* not modified */
                            break;
                        case 300: /* multiple choices */
                        case 301: /* moved permanently */
                        case 302: /* moved temporarily */
                        case 307: /* temporary redirect (new 302) */
                            au->reloc = 1;
                            break;
                        case 400: /* bad request */
                        case 505: /* http version not supported */
                            if (!au->url->host->info->is_http11)
                                return errm(au->url, "!sever failed to parse request for $u");
                            prx(NFO, "falling back to HTTP/1.0 for host '%s'\n",
                                au->url->host->name);
                            au->url->host->info->is_http11 = 0;
                            return RT_AGAIN;
                        case 401:
                            return errm(au->url, "!need authorization for $u");
                        /* the following two are theoretically fatal errors,
                           but on some servers they indicate temporary
                           failure ... strange ... */
                        case 403: /* access denied */
                          {
                            static const char msg[] = "!access to $u denied";
                            /* http_err_trans drops the leading '!' */
                            return errm(au->url,
                                        au->url->parm->opt->http_err_trans ?
                                        msg + 1 : msg);
                          }
                        case 404: /* not found */
                          {
                            static const char msg[] = "!$u not found";
                            return errm(au->url,
                                        au->url->parm->opt->http_err_trans ?
                                        msg + 1 : msg);
                          }
                        case 407: /* proxy auth required */
                            au->proxy->host = 0; /* mark dead */
                            return RT_AGAIN;
                        case 503: /* service unavailable - connection refused, etc. */
                            return RT_REFUSED;
                        case 504: /* gateway timeout - server not responding */
                            return RT_TIMEOUT;
                        default:
                            return errm(au->url, "unrecognised HTTP status '%s' for $u",
                                        bufp + a);
                        }
                }
            } else { /* have_result */
                /* end of headers? */
                if (!l) {
                    if (au->http_result_code >= 200)
                        break;
                    else {
                        au->http_result_code = 0;
                        dbg(HDR, ("awaiting next header after 1xx response.\n"));
                        continue;
                    }
                }

                /* continued header? */
                if (bufp[a] <= ' ')
                    continue;

                /* save requested */
                sh = &au->url->parm->opt->save_headers;
                for (u = 0; u < sh->nents; u++)
                    if (!strncasecmp(bufp + a, ((char **)sh->ents)[u],
                                     strlen(((char **)sh->ents)[u])))
                    {
                        /* need room for the line plus a newline */
                        if (au->hdrssiz < au->hdrslen + e - a + 1) {
                            nsiz = au->hdrslen * 2 + e - a + 1;
                            if (!(nbuf = mrealloc(au->headers, nsiz)))
                                break;
                            au->headers = nbuf;
                            au->hdrssiz = nsiz;
                        }
                        memcpy(au->headers + au->hdrslen, bufp + a, e - a);
                        au->hdrslen += e - a;
                        au->headers[au->hdrslen++] = '\n';
                        break;
                    }

                /* split header name and content */
                for (o = a; o < e && bufp[o] > ' '; o++);
                bufp[o++] = '\0';
                /* a header without any content leaves o one past the
                   terminator; pull it back so the value is an empty string */
                if (o > e)
                    o = e;
                for (; o < e && bufp[o] <= ' '; o++);

                /* handle header */
                if (au->reloc) {
                    if (!strcasecmp(bufp + a, "Location:")) {
                        prx(NFO, "relocation from http://%s/%s to %s\n",
                            au->url->host->name, au->url->local_part,
                            bufp + o);
                        if (au->url->relocs < 5) {
                            parse_add_url(bufp + o, e - o,
                                          au->url->referer,
                                          au->url->parm,
                                          au->url->is_top_dir,
                                          au->url->is_requisite,
                                          au->url->relocs + 1,
                                          au->url->link_depth);
                        } else
                            prx(ERR, "%s exceeds maximal redirection count!\n",
                                bufp + o);
                        return RT_SKIP;
                    }
                    /* needn't check other headers when redirect encountered */
                } else if (!strcasecmp(bufp + a, "Last-Modified:")) {
                    if ((au->file_time = parseHTTPdate(bufp + o)) == BAD_DATE)
                        prx(WRN, "unrecognised date format '%s'", bufp + o);
                } else if (!strcasecmp(bufp + a, "Content-Length:"))
                    sscanf(bufp + o, SOFFT, &(au->size_total));
                else if (!strcasecmp(bufp + a, "Content-Type:")) {
                    /* ctype functions need an unsigned char value */
                    if (!strncasecmp(bufp + o, "text/html", 9) &&
                        !isalpha((unsigned char)bufp[o + 9]))
                        au->content_is_html = 1;
                } else if (!strcasecmp(bufp + a, "Content-Range:")) {
                    /* The Content-Range string should look somewhat like
                       this: "bytes 250260-664041471/664041472".
                       The values are only validated, not used. */
                    off_t rs, re, rt;
                    if (sscanf(bufp + o, "bytes " SOFFT "-" SOFFT "/" SOFFT,
                               &rs, &re, &rt) != 3)
                    {
                        return errm(au->url, "unrecognized Content-Range for $u");
                    }
                }
            } /* have_result */
        } /* main header loop */

        if (au->reloc) /* no relocation url found */
            return errm(au->url, "missing new location while redirecting $u");

        if (au->http_result_code == 304) { /* Not Modified */
            if (needs_recurse(au))
                recurse_file(au->url, au->disposition);
            return RT_DONE; /* would HR_SKIP be more appropriate? */
        }

        if (au->size_total) {
            if (au->url->parm->opt->max_bytes &&
                au->size_total > au->url->parm->opt->max_bytes)
                au->size_total = au->url->parm->opt->max_bytes;
            total_bytes += au->size_total; /* update statistics */
        }

        au->http_done_header = 1;

        bufp += p; /* let the header vanish */
        len -= p;
        alen = len;
    } else /* done_header */
        alen = orglen;

    /* http message body */
    if (au->file_created) {
        if (au->f != -1)
            /* If the file is already open, let's just write to it ... */
            fi = au->f;
        else
            /* file is switched */
            if ((fi = mfopen(au, O_WRONLY | _O_BINARY)) < 0)
                return errm(au->url, "!$u: cannot open %s for appending: %s",
                            au->disposition, strerror(errno));
    } else {
        /* no attempt to open the file till now */
        au->file_created = 1;
        if (au->file_off && needs_recurse(au)) {
            if ((fi = mfopen(au, O_RDWR)) < 0)
                return errm(au->url,
                            "!$u: cannot open %s for reading and appending: %s",
                            au->disposition, strerror(errno));
            recurse_pfile(au->url, fi, &bufp, &len);
        } else {
            if ((fi = mfopen(au, au->file_off ?
                                 O_WRONLY | _O_BINARY :
                                 O_WRONLY | O_CREAT | O_TRUNC | _O_BINARY)) < 0)
                return errm(au->url, "!$u: cannot create %s: %s",
                            au->disposition, strerror(errno));
        }
    }

    /* scan the buffer for references */
    if (needs_recurse(au)) {
        int done = recurse_buff(au->url, bufp, len, orglen);
        /* keep the not yet consumed tail for the next call */
        if (orglen && !save_buff(au, bufp + done, len - done))
            return RT_RETRY;
    }

    if (orglen) {
        int retaf;

        /* point at "fresh" data */
        bufp += len - alen;

        /* hard file size limitation */
        if (au->url->parm->opt->max_bytes &&
            au->file_off + alen >= au->url->parm->opt->max_bytes)
        {
            alen = au->url->parm->opt->max_bytes - au->file_off;
            retaf = 0;
        } else
            retaf = 1;

        /* write the buffer to disk */
        lseek(fi, au->file_off, SEEK_SET);
        if (write(fi, bufp, alen) != alen)
            die(1, "Write error! Disk full?\n");

        /* Update the counters and statistics: */
        au->file_off += alen;
        au->size_fetched += alen;
        if (!au->size_total)
            total_bytes += alen;
        fetched_bytes += alen;
        if (max_bytes && fetched_bytes > max_bytes)
            byebye("byte quota exceeded!");

        if (retaf)
            return RT_OK;
    }

    /* file is complete -> rename it */
    touch(au);
    /* NOTE(review): buf is SHORTSTR bytes; this assumes displen plus the
       suffixes appended below fit - confirm against where displen is set */
    memcpy(buf, au->disposition, au->displen);
    if (au->url->parm->opt->enumerate_files)
        sprintf(buf + au->displen, "%d.puf",
                ++au->url->parm->opt->disp_path->file_num);
    else
        buf[au->displen] = '\0';
    rename(au->disposition, buf);

    /* save headers */
    if (au->headers) {
        strcat(buf, ".hdr");
        /* on handle exhaustion retry once after stealing a handle */
        if ((fi = open(buf, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0 &&
            ((errno != ENFILE && errno != EMFILE) ||
             economize_files || !free_fd() ||
             (fi = open(buf, O_WRONLY | O_CREAT | O_TRUNC, 0666)) < 0))
            return errm(au->url, "!$u: cannot create %s: %s",
                        buf, strerror(errno));
        write(fi, au->headers, au->hdrslen);
        close(fi);
    }

    return RT_DONE;
}