/* Web Polygraph       http://www.web-polygraph.org/
 * (C) 2003-2006 The Measurement Factory
 * Licensed under the Apache License, Version 2.0 */

#include "base/polygraph.h"

#include "xstd/String.h"
#include "xstd/Clock.h"
#include "xstd/StrIdentifier.h"
#include "xstd/gadgets.h"
#include "base/OLog.h"
#include "base/AddrParsers.h"
#include "runtime/XactAbortCoord.h"
#include "runtime/HttpCookies.h"
#include "runtime/HttpDate.h"
#include "runtime/httpText.h"
#include "runtime/httpHdrs.h"


/* internal type to store static parsing info */
// Pairs a string identifier table (ids) with an array of parser pointers
// (parsers); both are indexed by the id that ids->add() hands out.
// NOTE(review): Array appears without template arguments here; the way
// parsers is used in this file (put(new Parser(...)), *item(id)) suggests
// an element type of MsgHdr::Parser* -- confirm against the declaration.
class MsgHdrParsTab {
    public:
        MsgHdrParsTab();
        ~MsgHdrParsTab();

        StrIdentifier *ids;
        Array *parsers;
};

// per-class parsing tables; set by Configure() and cleared by Clean()
MsgHdrParsTab *ReqHdr::TheParsTab = 0;
MsgHdrParsTab *RepHdr::TheParsTab = 0;


/* MsgHdr */

// Binds the header to a parsing table and puts all members
// into their initial state.
MsgHdr::MsgHdr(const MsgHdrParsTab &aTab):
    theParsTab(aTab) {
    reset();
}

// Returns every per-message member to its initial value;
// the parsing table reference is left alone.
void MsgHdr::reset() {
    // message properties
    theHdrSize = 0;
    theHttpVersion.reset();
    theContSize = -1;
    theChecksum.reset();
    theDate = Time();
    theGroupId.clear();
    theXactId.clear();
    theTarget = NetAddr();
    theRemWorld.reset();
    theAbortCoord.reset();
    thePhaseSyncPos = -1;
    theConnectionKeepAlive = kaDefault;
    theContType = ctUnknown;
    theTransferEncoding = tcNone;
    isCachable = true;

    // incremental header search state
    theSrchPtr = 0;
    theBufEnd = 0;
    theBufBeg = 0;
    theFields.reset();
    theSrchState = ssFirst;

    // do not reset parsing tables
}

// true if the parsed Content-Type was classified as markup
bool MsgHdr::markupContent() const {
    return theContType == ctMarkup;
}

// true if a content type has been recorded for this message
bool MsgHdr::knownContentType() const {
    return theContType != ctUnknown;
}

// true if the parsed Transfer-Encoding was "chunked"
bool MsgHdr::chunkedEncoding() const {
    return theTransferEncoding == tcChunked;
}

// Decides whether the connection should be kept open after this message.
bool MsgHdr::persistentConnection() const {
    const bool oldProtocol = theHttpVersion <= HttpVersion(1,0);
    return oldProtocol ?
        theConnectionKeepAlive == kaYes : // 1.0: keep if explicitly told so
        theConnectionKeepAlive != kaNo;   // 1.1: keep unless told otherwise
}

// note: buf does not have to be zero-terminated!
bool MsgHdr::parse(const char *buf, Size sz) { if (!theBufBeg) { // have not started the search yet theBufBeg = theSrchPtr = buf; Assert(theSrchState == ssFirst); } else { // continue search Assert(theBufBeg == buf); Assert(theSrchState != ssFound); } theBufEnd = buf + sz; // to be refined later while (theSrchPtr < theBufEnd && theSrchState != ssFound) { // search for LF if (theSrchState == ssFirst) { do { if (*theSrchPtr++ == '\n') { theSrchState = ssSkip; break; } } while (theSrchPtr < theBufEnd); } // LF after skipping optional CRs means end-of-headers while (theSrchState == ssSkip && theSrchPtr < theBufEnd) { if (*theSrchPtr == '\n') { theSrchState = ssFound; } else if (*theSrchPtr != '\r') { theFields.append(theSrchPtr); // start of a header! theSrchState = ssFirst; } ++theSrchPtr; } } if (theSrchState != ssFound) return false; // found end-of-headers! theBufEnd = theSrchPtr; theHdrSize = theBufEnd - theBufBeg; // now parse known fields // luckily, we already know field starts! parseFields(); return true; } void MsgHdr::parseFields() { const char *eoh = theBufEnd; // skip end-of-headers CRLF while (theBufBeg < eoh && eoh[-1] == '\n') --eoh; while (theBufBeg < eoh && eoh[-1] == '\r') --eoh; parseRLine(theBufBeg, theFields.count() ? theFields[0] : eoh); for (int i = theFields.count()-1; i >= 0; --i) { const char *hdr = theFields[i]; const int len = eoh-hdr; // approximate (includes crlfs) const int id = theParsTab.ids->lookup(hdr, len); if (id > 0) { const char *val = hdr + theParsTab.ids->string(id).len(); while (isspace(*val)) ++val; Parser p = *theParsTab.parsers->item(id); (this->*p)(val, eoh); } eoh = hdr; } } bool MsgHdr::parseHttpVersion(const char *&beg, const char *end, HttpVersion &v) { const char *p = 0; int major = -1, minor = -1; if (isInt(beg, major, &p) && p+1 < end && *p == '.' 
&& isInt(p+1, minor, &p)) { v = HttpVersion(major, minor); beg = p; return true; } return false; } bool MsgHdr::parseUniqId(const char *&buf, const char *, UniqId &id) { int a = 0, b = 0, c = 0; const char *p = 0; if (isInt(buf, a, &p, 16) && *p == '.' && isInt(p+1, b, &p, 16) && *p == ':' && isInt(p+1, c, &buf, 16)) return (id = UniqId(a, b, c)) != 0; return false; } bool MsgHdr::parseWorld(const char *buf, const char *eoh, ObjWorld &world) { UniqId id; if (!parseUniqId(buf, eoh, id) || *buf != ' ') return false; int size = -1, wss = -1; int hotPos = -1; const char *p = buf; if (isInt(p+1, wss, &p) && *p == '/' && isInt(p+1, size, &p) && *p == ' ' && isInt(p+1, hotPos)) { world.id(id); world.size(size); world.wss(wss); world.hotPos(hotPos); return true; } return false; } bool MsgHdr::ParseHostInUri(const char *&start, const char *eorl, NetAddr &host) { if (const char *newStart = SkipHostInUri(start, eorl, host)) { start = newStart; return true; } return false; } bool MsgHdr::parseUri(const char *&buf, const char *end, HttpUri &uri) { const char *uriStart = buf; // see if there is a protocol://host prefix if (*buf != '/') ParseHostInUri(buf, end, uri.host); uri.pathBuf = buf; // includes leading '/' // user-specified url_prefix may contain world-id tag; // search for the first tag that is followed by a valid wid while (const char *wid = StrBoundChr(buf, 'w', end)) { if (parseUniqId(buf = wid+1, end, uri.oid.world())) break; } if (const char *tid = StrBoundChr(buf, 't', end)) isInt(tid + 1, uri.oid.type(), &buf, 16); if (const char *oid = StrBoundChr(buf, '_', end)) isInt(oid + 1, uri.oid.name(), &buf, 16); // find the end of the Uri while (buf < end && !isspace(*buf)) ++buf; uri.pathLen = buf - uri.pathBuf; if (!uri.oid.world() || uri.oid.type() < 0 || uri.oid.name() < 0) uri.oid.foreignUrl(String(uriStart, buf - uriStart)); return true; } bool MsgHdr::parseContLen(const char *buf, const char *) { theContSize = xatoi(buf, -1); return theContSize >= 0; } bool 
MsgHdr::parseContMd5(const char *buf, const char *eoh) { if (DecodeBase64(buf, eoh - buf, theChecksum.buf(), theChecksum.size()) == theChecksum.size()) { theChecksum.set(true); return true; } return false; } bool MsgHdr::parseContType(const char *buf, const char *eoh) { theContType = ctOther; // default if (strncasecmp(buf, "text/", 5) == 0) { buf += 5; if (buf+4 <= eoh && strncasecmp(buf+2, "ml", 2) == 0) theContType = ctMarkup; else if (buf+4 <= eoh && strncasecmp(buf+1, "ml", 2) == 0) theContType = ctMarkup; else if (buf+3 <= eoh && strncasecmp(buf, "css", 3) == 0) theContType = ctMarkup; } return true; } bool MsgHdr::parseDate(const char *buf, const char *eoh) { theDate = HttpDateParse(buf, eoh - buf); return theDate >= 0; } bool MsgHdr::parsePragma(const char *buf, const char *) { if (!strncasecmp("no-cache", buf, 8)) isCachable = false; else return false; return true; } bool MsgHdr::parseCControl(const char *buf, const char *) { if (!strncasecmp("no-cache", buf, 8)) { isCachable = false; return true; } return false; } bool MsgHdr::parseXXact(const char *buf, const char *eoh) { return parseUniqId(buf, eoh, theGroupId) && *buf == ' ' && parseUniqId(++buf, eoh, theXactId); } bool MsgHdr::parseXRemWorld(const char *buf, const char *eoh) { return parseWorld(buf, eoh, theRemWorld); } bool MsgHdr::parseXAbort(const char *buf, const char *) { int whether = 0; int where = 0; const char *p = 0; if (isInt(buf, whether, &p) && *p == ' ' && isInt(p+1, where) && whether && where) { theAbortCoord.configure(whether, where); return true; } return false; } bool MsgHdr::parseXPhaseSyncPos(const char *buf, const char *) { thePhaseSyncPos = xatoi(buf, 0); return true; } bool MsgHdr::parseXTarget(const char *buf, const char *eoh) { return ParseNetAddr(buf, eoh, theTarget); } /* XXX: Connection and other headers may have a _list_ of options */ bool MsgHdr::parseConnection(const char *buf, const char *) { if (!strncasecmp("close", buf, 5)) theConnectionKeepAlive = kaNo; else if 
(!strncasecmp("keep", buf, 4)) theConnectionKeepAlive = kaYes; else return false; return true; } bool MsgHdr::parseTransferEncoding(const char *buf, const char *) { if (!strncasecmp("chunked", buf, 7)) theTransferEncoding = tcChunked; else if (!strncasecmp("identity", buf, 8)) theTransferEncoding = tcIdentity; else theTransferEncoding = tcOther; return true; } // adds definitions common to replies and requests void MsgHdr::Configure(MsgHdrParsTab &tab) { AddParser(hfpDate, &MsgHdr::parseDate, tab); AddParser(hfpContLength, &MsgHdr::parseContLen, tab); AddParser(hfpContMd5, &MsgHdr::parseContMd5, tab); AddParser(hfpContType, &MsgHdr::parseContType, tab); AddParser(hfpCacheControl, &MsgHdr::parseCControl, tab); AddParser(hfpConnection, &MsgHdr::parseConnection, tab); AddParser(hfpPragma, &MsgHdr::parsePragma, tab); AddParser(hfpProxyConnection, &MsgHdr::parseConnection, tab); AddParser(hfpTransferEncoding, &MsgHdr::parseTransferEncoding, tab); AddParser(hfpXXact, &MsgHdr::parseXXact, tab); AddParser(hfpXRemWorld, &MsgHdr::parseXRemWorld, tab); AddParser(hfpXAbort, &MsgHdr::parseXAbort, tab); AddParser(hfpXPhaseSyncPos, &MsgHdr::parseXPhaseSyncPos, tab); AddParser(hfpXTarget, &MsgHdr::parseXTarget, tab); } int MsgHdr::AddParser(const String &field, Parser parser, MsgHdrParsTab &where) { Assert(field); const String trimmedField = isspace(field.last()) ? 
field(0, field.len()-1) : field; Assert(trimmedField); const int id = where.ids->add(trimmedField); where.parsers->put(new Parser(parser), id); return id; } void MsgHdr::store(OLog &log) const { log << theHdrSize << (int)theDate.sec() << theContSize // << theChecksum << theGroupId << theXactId << theTarget << theHttpVersion.vMinor() // XXX: log major too << (int)theConnectionKeepAlive << isCachable // XXX: not stored or loaded: theTransferEncoding, theContType ; } // these should never be called inline bool dontCallMe() { Assert(0); return false; } bool MsgHdr::parseGetReqLine(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parseHeadReqLine(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parsePostReqLine(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parsePutReqLine(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parseHost(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parseServer(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parseLocation(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parseLMT(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parseExpires(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parseIms(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parseAcceptEncoding(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parseXLocWorld(const char *, const char *) { return dontCallMe(); } bool MsgHdr::parseCookie(const char *, const char *) { return dontCallMe(); } /* ReqHdr */ ReqHdr::ReqHdr(): MsgHdr(*TheParsTab), isHealthCheck(false), isAcceptingGzip(false) { } void ReqHdr::reset() { MsgHdr::reset(); theUri = HttpUri(); theIms = Time(); theLocWorld.reset(); isHealthCheck = false; isAcceptingGzip = false; } bool ReqHdr::parseRLine(const char *buf, const char *eorl) { const int id = theParsTab.ids->lookup(buf, eorl - buf); if (id > 0) { buf += 
theParsTab.ids->string(id).len(); while (isspace(*buf)) ++buf; Parser p = *theParsTab.parsers->item(id); return (this->*p)(buf, eorl); } return false; } bool ReqHdr::parseAnyReqLine(const char *buf, const char *eorl) { // a "well-known" health check uri static const String health = "/health"; isHealthCheck = health.casePrefixOf(buf, eorl-buf); parseUri(buf, eorl, theUri); if (const char *proto = StrBoundChr(buf, ' ', eorl)) { // optimization: not checking for "HTTP/" match proto += 6; if (proto < eorl) parseHttpVersion(proto, eorl, theHttpVersion); } return true; } bool ReqHdr::parseGetReqLine(const char *buf, const char *eorl) { if (parseAnyReqLine(buf, eorl)) { theUri.oid.get(true); return true; } return false; } bool ReqHdr::parseHeadReqLine(const char *buf, const char *eorl) { if (parseAnyReqLine(buf, eorl)) { theUri.oid.head(true); return true; } return false; } bool ReqHdr::parsePostReqLine(const char *buf, const char *eorl) { if (parseAnyReqLine(buf, eorl)) { theUri.oid.post(true); return true; } return false; } bool ReqHdr::parsePutReqLine(const char *buf, const char *eorl) { if (parseAnyReqLine(buf, eorl)) { theUri.oid.put(true); return true; } return false; } bool ReqHdr::parseHost(const char *buf, const char *eoh) { return ParseNetAddr(buf, eoh, theUri.host); } bool ReqHdr::parseIms(const char *buf, const char *eoh) { theIms = HttpDateParse(buf, eoh - buf); return theIms >= 0; } bool ReqHdr::parseAcceptEncoding(const char *buf, const char *eoh) { // XXX: these checks ignore "q=0" preferences isAcceptingGzip = StrBoundChr(buf, '*', eoh) || StrBoundStr(buf, "gzip", eoh); // XXX: codings are case-insensitive return true; } bool ReqHdr::parseXLocWorld(const char *buf, const char *eoh) { return parseWorld(buf, eoh, theLocWorld); } void ReqHdr::store(OLog &log) const { MsgHdr::store(log); log << theUri.host << theUri.oid << (int)theIms.sec(); } bool ReqHdr::expectBody() const { return theUri.oid.post() || theUri.oid.put(); } bool ReqHdr::acceptedEncoding(int 
coding) const { return coding == codingIdentity || // always acceptable for now (coding == codingGzip && isAcceptingGzip); } void ReqHdr::Configure() { TheParsTab = new MsgHdrParsTab(); MsgHdr::Configure(*TheParsTab); AddParser(hfpHost, &MsgHdr::parseHost, *TheParsTab); AddParser(hfpIMS, &MsgHdr::parseIms, *TheParsTab); AddParser(hfpXLocWorld, &MsgHdr::parseXLocWorld, *TheParsTab); AddParser(hfpAcceptEncoding, &MsgHdr::parseAcceptEncoding, *TheParsTab); // request method parsers use the same index/interface as field parsers AddParser(rlpGet, &MsgHdr::parseGetReqLine, *TheParsTab); AddParser(rlpHead, &MsgHdr::parseHeadReqLine, *TheParsTab); AddParser(rlpPost, &MsgHdr::parsePostReqLine, *TheParsTab); AddParser(rlpPut, &MsgHdr::parsePutReqLine, *TheParsTab); } void ReqHdr::Clean() { delete TheParsTab; TheParsTab = 0; } /* RepHdr */ bool RepHdr::PositiveStatusCode(int code) { /* 1xx: Informational - Request received, continuing process * 2xx: Success - The action was successfully received, understood, and accepted * 3xx: Redirection - Further action must be taken in order to complete the request * 4xx: Client Error - The request contains bad syntax or cannot be fulfilled * 5xx: Server Error - The server failed to fulfill an apparently valid request */ return 100 <= code && code < 400; } RepHdr::RepHdr(): MsgHdr(*TheParsTab), theStatus(scUnknown), theCookies(0) { } void RepHdr::reset() { MsgHdr::reset(); theServer = String(); theLocn = HttpUri(); theLMT = theExpires = Time(); theStatus = scUnknown; theCookies = 0; } void RepHdr::collectCookies(HttpCookies *cookies) { theCookies = cookies; } bool RepHdr::expectPolyHeaders() const { return expectBody() && theStatus != sc407_ProxyAuthRequired && theStatus != sc403_Forbidden && !redirect(); } // RFC 2616: All responses to the HEAD request method MUST NOT include // a message-body. All 1xx , 204, and 304 responses MUST NOT include a // message-body. All other responses do include a message-body. 
bool RepHdr::expectBody() const { // note: we cannot handle the HEAD case here; the caller should if ((100 <= theStatus && theStatus < 200) || theStatus == sc204_NoContent || theStatus == sc304_NotModified) return false; return true; } bool RepHdr::redirect() const { return theStatus == sc300_Choices || theStatus == sc302_Found || theStatus == sc303_Other || theStatus == sc307_TmpRedir; } Time RepHdr::calcLmt() const { if (theLMT >= 0) return theLMT; if (theDate >= 0) return theDate; return TheClock; } bool RepHdr::parseRLine(const char *buf, const char *eorl) { if (strncasecmp("HTTP/", buf, 5) != 0) return false; buf += 5; if (buf >= eorl || !parseHttpVersion(buf, eorl, theHttpVersion)) return false; buf += 1; return isInt(buf, theStatus); } bool RepHdr::parseServer(const char *buf, const char *eoh) { theServer = String(buf, eoh-buf); return theServer.len() > 0; } bool RepHdr::parseLocation(const char *buf, const char *eoh) { return parseUri(buf, eoh, theLocn); } bool RepHdr::parseLMT(const char *buf, const char *eoh) { theLMT = HttpDateParse(buf, eoh - buf); return theLMT >= 0; } bool RepHdr::parseExpires(const char *buf, const char *eoh) { theExpires = HttpDateParse(buf, eoh - buf); return theExpires >= 0; } bool RepHdr::parseCookie(const char *buf, const char *eoh) { if (theCookies) { HttpCookie *cookie = HttpCookieParse(buf, eoh - buf); if (!Should(cookie)) return false; theCookies->add(cookie); } return true; } void RepHdr::store(OLog &log) const { MsgHdr::store(log); log << theStatus << (int)theLMT.sec() << (int)theExpires.sec() ; } void RepHdr::Configure() { TheParsTab = new MsgHdrParsTab(); MsgHdr::Configure(*TheParsTab); AddParser(hfpLocation, &MsgHdr::parseLocation, *TheParsTab); AddParser(hfpServer, &MsgHdr::parseServer, *TheParsTab); AddParser(hfpLmt, &MsgHdr::parseLMT, *TheParsTab); AddParser(hfpExpires, &MsgHdr::parseExpires, *TheParsTab); AddParser(hfpSetCookie, &MsgHdr::parseCookie, *TheParsTab); } void RepHdr::Clean() { delete TheParsTab; 
TheParsTab = 0; } /* MsgHdrParsTab */ MsgHdrParsTab::MsgHdrParsTab() { ids = new StrIdentifier; parsers = new Array; } MsgHdrParsTab::~MsgHdrParsTab() { delete ids; ids = 0; while (parsers->count()) delete parsers->pop(); delete parsers; parsers = 0; }