/* Web Polygraph       http://www.web-polygraph.org/
 * (C) 2003-2006 The Measurement Factory
 * Licensed under the Apache License, Version 2.0 */

#include "base/polygraph.h"

// NOTE(review): the targets of the two system includes below were lost in
// this copy of the file; <limits.h> and <fstream> are inferred from the use
// of INT_MAX and ifstream in this file -- confirm against the upstream source.
#include <limits.h>
#include "xstd/h/iostream.h"
#include "xstd/h/sstream.h"
#include "xstd/h/iomanip.h"
#include "xstd/ZLib.h"

#include <fstream>
#include "xstd/Rnd.h"
#include "xstd/gadgets.h"
#include "base/RndPermut.h"
#include "base/BStream.h"
#include "base/rndDistrStat.h"
#include "base/ContTypeStat.h"
#include "base/polyLogCats.h"
#include "runtime/IOBuf.h"
#include "runtime/LogComment.h"
#include "runtime/ErrorMgr.h"
#include "runtime/polyErrors.h"
#include "runtime/httpHdrs.h"
#include "runtime/httpText.h"
#include "pgl/MimeSym.h"
#include "pgl/ContentSym.h"
#include "csm/EmbedContMdl.h"
#include "csm/RndBodyIter.h"
#include "csm/CdbBodyIter.h"
#include "csm/ContainerBodyIter.h"
#include "csm/InjectIter.h"
#include "csm/GzipEncoder.h"
#include "csm/cdbEntries.h"
#include "csm/ContentDbase.h"
#include "csm/TextDbase.h"
#include "csm/ObjLifeCycle.h"
#include "csm/ContentCfg.h"

// kind name used by configure() when the PGL content symbol has none
static const String DefaultContentCfgKind = "some-content";


// Starts with ratios/probabilities at -1 and all pointers at 0, then
// allocates the per-coding array (codingEnd ints, left uninitialized until
// configureEncodings() runs) and the object life cycle.
ContentCfg::ContentCfg(int anId):
	theObjLifeCycle(0), theChbRatio(-1), theChecksumRatio(-1),
	theUniqueRatio(-1), theSize(0), theEmbedCont(0),
	theCdb(0), theTdb(0), theInjGap(0), theInfProb(-1),
	theExtSel(0), thePfxSel(0), theNewPerOid(0), theId(anId),
	theEncodings(new int[codingEnd]) {
	theObjLifeCycle = new ObjLifeCycle;
}

// Releases what the constructor allocated.
// NOTE(review): theEmbedCont, theCdb, and theTdb are allocated in
// configure() but are not deleted here -- confirm that is intentional.
ContentCfg::~ContentCfg() {
	delete theObjLifeCycle;
	delete[] theEncodings;
}

// Loads all settings from the PGL content symbol: kind, MIME info, size
// distribution, life cycle, cachability/checksum/uniqueness ratios, embedded
// content model, content and text databases, injection parameters, and
// content encodings. Reports configuration errors via Comment ... << xexit.
void ContentCfg::configure(const ContentSym &cfg) {
	theKind = cfg.kind();
	if (!theKind)
		theKind = DefaultContentCfgKind;

	if (MimeSym *mime = cfg.mime()) {
		theMimeType = mime->mimeType();
		mime->extensions(theExtensions, theExtSel);
		mime->prefixes(thePrefixes, thePfxSel);
	}

	theSize = cfg.size();
	theObjLifeCycle->configure(cfg.objLifeCycle());
	cfg.cachable(theChbRatio);
	cfg.checksum(theChecksumRatio);

	if (cfg.unique(theUniqueRatio) && theUniqueRatio >= 0) {
		// form unique prefix to be used to make some content unique
		const int sharedId = GlbPermut(rndSharedContent);
		char buf[64];
		ofixedstream os(buf, sizeof(buf));
		os << 'u' << hex << setfill('0') << setw(8) << sharedId << '.' <<
			setw(2) << theId << '/' << ends;
		os.flush();
		theCommonPrefix = buf;

		const double commonRatio = 1 - theUniqueRatio;
		Comment << "fyi: " << (100*commonRatio) << "% of '" << theKind <<
			"' content (id " << theId << ") will be identical; common prefix: " <<
			theCommonPrefix << endc;
	}

	if (cfg.hasEmbed()) {
		theEmbedCont = new EmbedContMdl;
		theEmbedCont->configure(&cfg);
	}

	if (const String &cdbName = cfg.cdb()) {
		ifstream f(cdbName.cstr());
		IBStream is;
		is.configure(&f, cdbName);
		theCdb = new ContentDbase;
		theCdb->load(is);
		if (!is.good())
			Comment << "error: cannot load content database from '" << cdbName << "'; " << Error::Last() << endc << xexit;
		if (!theCdb->count())
			Comment << "error: no entries in '" << cdbName << "' content database" << endc << xexit;
	}

	//if (theEmbedCont && theCdb)
	//	Comment << "error: content cfg `" << theKind << "': "
	//		<< "cannot support containers together with content_db yet; "
	//		<< "do not use may_contain with content_db"
	//		<< endc << xexit;

	if (!theSize && !theCdb)
		Comment << "error: content cfg `" << theKind << "': "
			<< "has neither size distribution nor content_db; "
			<< "either one or both must be specified for "
			<< "Polygraph to know what object sizes this content type "
			<< "should generate"
			<< endc << xexit;

	if (const String &tdbName = cfg.injectDb()) {
		ifstream f(tdbName.cstr());
		theTdb = new TextDbase;
		theTdb->load(f);
		// a failed open sets failbit, not badbit, so check is_open() too;
		// otherwise a missing file would be misreported as an empty database
		if (!f.is_open() || f.bad())
			Comment << "error: cannot load text database from `" << tdbName << "'; " << Error::Last() << endc << xexit;
		if (!theTdb->count())
			Comment << "error: text database `" << tdbName << "' appears to be empty" << endc << xexit;
	}

	theInjGap = cfg.injectGap();

	// XXX: we should put all inject* fields into one PGL object
	// the three injection settings must be either all present or all absent
	if (cfg.infectProb(theInfProb) != (theTdb != 0) ||
		(theTdb != 0) != (theInjGap != 0)) {
		Comment << "error: content cfg `" << theKind << "': "
			<< "`inject_prob' requires `inject_src' requires `inject_gap'"
			<< " and vice versa" << endc << xexit;
	}

	configureEncodings(cfg);

	// make sure kind zero is registered before the first configured kind
	if (!ContTypeStat::Kinds().count())
		ContTypeStat::RecordKind(0, "foreign");
	ContTypeStat::RecordKind(id(), kind());
}

// Fills theEncodings for the identity and gzip codings: -1 leaves a coding
// disabled (users test for >= 0), identity is enabled with 0, and gzip is
// enabled with 6 (the value wrapBodyIter() passes on as the gzip level).
// Without an explicit encodings list only identity is enabled.
void ContentCfg::configureEncodings(const ContentSym &cfg) {
	theEncodings[codingIdentity] = theEncodings[codingGzip] = -1;

	Strings encodings;
	if (cfg.encodings(encodings)) {
		for (int i = 0; i < encodings.count(); ++i) {
			const String &encoding = *encodings[i];
			if (encoding == "identity")
				theEncodings[codingIdentity] = 0;
			else
			if (encoding == "gzip") {
				if (Deflator::Supported) {
					theEncodings[codingGzip] = 6;
				} else {
					Comment << "error: support for 'gzip' content encoding "
						<< "(found in content cfg '" << theKind << "') has been "
						<< "disabled" << endc << xexit;
				}
			} else
				Comment << "error: unknown content encoding '" << encoding
					<< "' in content cfg '" << theKind << "'; known codings are "
					<< "'identity' and 'gzip'" << endc << xexit;
		}
	} else {
		theEncodings[codingIdentity] = 0;
	}
}

// picks one of the configured URL extensions using the given seed
const String &ContentCfg::url_ext(int seed) const {
	return pickStr(theExtensions, theExtSel, seed);
}

// picks one of the configured URL prefixes using the given seed
const String &ContentCfg::url_pfx(int seed) const {
	return pickStr(thePrefixes, thePfxSel, seed);
}

// Mean response size: from the size distribution when there is one,
// otherwise from the content database entries.
double ContentCfg::repSizeMean() const {
	Assert(theSize || theCdb);
	if (theSize)
		return RndDistrStat(theSize).mean();
	else
		return theCdb->entrySizeMean();
}

// true when both the identity and the gzip codings are enabled
bool ContentCfg::multipleContentCodings() const {
	return theEncodings[codingIdentity] >= 0 &&
		theEncodings[codingGzip] >= 0;
}

// delegates to the object life cycle, seeded from the oid hash
void ContentCfg::calcTimes(const ObjId &oid, ObjTimes &times) const {
	const int seed = GlbPermut(oid.hash(), rndRepOlc);
	theObjLifeCycle->calcTimes(seed, times);
}

// Marks oid as gzip-encoded when gzip is enabled here and accepted by the
// request; otherwise marks it as identity under the same two conditions.
// Returns false (leaving oid untouched) when neither coding is usable.
bool ContentCfg::calcContentCoding(ObjId &oid, const ReqHdr &req) const {
	if (theEncodings[codingGzip] >= 0 && req.acceptedEncoding(codingGzip))
		oid.gzipContent(true);
	else
	if (theEncodings[codingIdentity] >= 0 && req.acceptedEncoding(codingIdentity))
		oid.gzipContent(false);
	else
		return false;
	return true;
}

// Size of the unencoded response body for oid: a trial of the size
// distribution seeded from the content hash, or, without a distribution,
// the size of the content database entry selected for oid.
Size ContentCfg::calcRawRepSize(const ObjId &oid) const {
	Assert(theSize || theCdb);
	if (theSize) {
		const int seed = GlbPermut(contentHash(oid), rndRepSize);
		theSize->rndGen()->seed(seed);
		const double dh = theSize->trial();

		// prevent int overflows and leave room for headers
		// make sure uniquePrefix fits
		const Size contentPrefixSize = calcContentPrefixSize(oid);
		Size sz = (int)MiniMax((double)contentPrefixSize, ceil(dh), (double)INT_MAX - 100*1024);

		// paranoid sanity checks
		if (!Should(sz >= contentPrefixSize))
			sz = contentPrefixSize;
		if (!Should(sz >= 0))
			sz = 0;
		return sz;
	} else {
		const int start = selectCdbStart(oid);
		CdbEntryPrnOpt opt; // assume that buf, injector, and rng are not needed
		opt.embed.model = theEmbedCont;
		opt.embed.container = oid;
		opt.sizeMax = Size(INT_MAX); // Size::Max();
		opt.entryOff = 0;
		return theCdb->entry(start)->size(opt);
	}
}

// per-oid cachability decision: an rng.event(theChbRatio) trial seeded from the oid hash
bool ContentCfg::calcCachability(const ObjId &oid) const {
	const int seed = GlbPermut(oid.hash(), rndRepCach);
	RndGen rng(seed);
	return rng.event(theChbRatio);
}

// per-oid checksum decision: an rng.event(theChecksumRatio) trial seeded from the oid hash
bool ContentCfg::calcChecksumNeed(const ObjId &oid) const {
	const int seed = GlbPermut(oid.hash(), rndRepCheckNeed);
	RndGen rng(seed);
	return rng.event(theChecksumRatio);
}

// Size of the prefix that pourContentPrefix() would write for oid; the
// unique prefix is measured by pouring it into a scratch buffer.
Size ContentCfg::calcContentPrefixSize(const ObjId &oid) const {
	switch (contentUniqueness(oid)) {
		case cuUnique: {
			IOBuf buf;
			return pourUniqueContentPrefix(oid, buf);
		}
		case cuCommon:
			return theCommonPrefix.len();
		case cuChance:
		default:
			return 0;
	}
}

// Appends the content prefix for oid (unique, common, or none) to buf and
// returns the number of bytes added.
Size ContentCfg::pourContentPrefix(const ObjId &oid, IOBuf &buf) const {
	switch (contentUniqueness(oid)) {
		case cuUnique:
			return pourUniqueContentPrefix(oid, buf);
		case cuCommon: {
			buf.append(theCommonPrefix.data(), theCommonPrefix.len());
			return theCommonPrefix.len();
		}
		case cuChance:
		default:
			return 0;
	}
}

// Classifies oid as cuUnique or cuCommon with an rng.event(theUniqueRatio)
// trial seeded from the oid hash; returns cuChance when no unique ratio was
// configured or the oid is foreign.
int ContentCfg::contentUniqueness(const ObjId &oid) const {
	if (theUniqueRatio < 0)
		return cuChance; // default: leave it to chance or other factors

	// no sense in generating content [prefix] for foreign oids
	if (!Should(!oid.foreignUrl() && !oid.foreignSrc()))
		return cuChance;

	const int seed = GlbPermut(oid.hash(), rndUniqueContent);
	RndGen rng(seed);
	return rng.event(theUniqueRatio) ? cuUnique : cuCommon;
}

// internal method, should be called only if uniqueContent()
// Writes the prefix directly into the free space of buf and returns its size.
Size ContentCfg::pourUniqueContentPrefix(const ObjId &oid, IOBuf &buf) const {
	// mimic Oid2Url() but do not use TheViservs and such, just indeces
	// NOTE(review): setw(8) below applies to the trailing space character;
	// confirm that no object-name field was meant to precede it.
	ofixedstream os(buf.space(), buf.spaceSize());
	os << 'u' << hex << setfill('0')
		<< 'v' << setw(3) << oid.viserv()
		<< '/' << 'w' << oid.world()
		<< '/' << 't' << setw(2) << oid.type()
		<< '/' << '_' << setw(8)
		<< ' ';
	os.flush();
	const Size size = (std::streamoff)os.tellp();
	Should(size < buf.spaceSize()); // otherwise may be too big
	buf.appended(size);
	return size;
}

// Hash used to seed content generation: theId for common content (so all
// such objects of this type share one value), the oid hash otherwise.
int ContentCfg::contentHash(const ObjId &oid) const {
	if (contentUniqueness(oid) == cuCommon)
		return theId;
	else
		return oid.hash();
}

// picks the content database entry for oid, seeded from the content hash
int ContentCfg::selectCdbStart(const ObjId &oid) const {
	Assert(theCdb);
	RndGen rng(GlbPermut(contentHash(oid), rndCdbStart));
	return rng(0, theCdb->count());
}

// Returns the string at the index drawn from sel after seeding it with seed;
// returns a shared static String (initialized from 0) when strings is empty.
const String &ContentCfg::pickStr(const Strings &strings, RndDistr *sel, int seed) const {
	static String noStr = 0;
	if (const int count = strings.count()) {
		sel->rndGen()->seed(seed);
		const int idx = (int)sel->trial();
		Assert(0 <= idx && idx < count);
		return *strings[idx];
	}
	return noStr;
}

// 1.0 for this very content type, the embedded-content model's answer when
// there is a model, 0.0 otherwise
double ContentCfg::compContPerCall(const ContentCfg *cc) const {
	if (cc->id() == id())
		return 1.0;
	if (theEmbedCont)
		return theEmbedCont->compContPerCall(cc);
	return 0.0;
}

// forwards the notification to the embedded-content model, if any, unless
// cc is this very content type
void ContentCfg::noteNewContProb(ContentCfg *cc, double newProb) {
	if (cc->id() == id())
		return;
	if (theEmbedCont)
		theEmbedCont->noteNewContProb(cc, newProb);
}

// stores a (strictly positive) new-per-oid value
void ContentCfg::newPerOid(double aNewPerOid) {
	Assert(aNewPerOid > 0);
	theNewPerOid = aNewPerOid;
}

// XXX: iterators should be farmed, but it is hard because they
// come in different types (perhaps somebody else should farm them?)
BodyIter *ContentCfg::getBodyIter(const ObjId &oid) { BodyIter *res = 0; if (theCdb) { CdbBodyIter *i = new CdbBodyIter; i->cdb(theCdb); if (theEmbedCont) i->embedContModel(theEmbedCont); i->startPos(selectCdbStart(oid)); if (theTdb) { RndGen rng(GlbPermut(contentHash(oid), rndInjProb)); // should we inject this object? if (rng.event(theInfProb)) { InjectIter *inj = new InjectIter; // XXX: Farm these! inj->creator(this); inj->textDbase(theTdb); inj->gap(theInjGap); i->injector(inj); } } res = i; } else if (theEmbedCont) { ContainerBodyIter *i = new ContainerBodyIter; i->embedContModel(theEmbedCont); res = i; } else { RndBodyIter *i = new RndBodyIter; res = i; } if (Should(res)) { // keep in sync with GzipEncoder ctor res->contentCfg(this); res->oidCfg(oid, contentHash(oid)); res->contentSize(calcRawRepSize(oid)); if (oid.gzipContent()) return wrapBodyIter(res); } return res; } BodyIter *ContentCfg::wrapBodyIter(BodyIter *prime) { // wrap configured body iterator in gzip encoder const int gzipLevel = theEncodings[codingGzip]; // we have to actually encode to give the caller the exact encoded size GzipEncoder encoder(gzipLevel, prime); WrBuf buf; encoder.start(&buf); while (encoder) { encoder.pour(); // XXX: what if it fails? buf.reset(); } const Size encodedSize = encoder.builtSize(); encoder.abandonProducer(); BodyIter *res = new GzipEncoder(gzipLevel, prime); res->contentSize(encodedSize); return res; } void ContentCfg::putBodyIter(BodyIter *i) const { i->stop(); delete i; } void ContentCfg::putInjector(InjectIter *i) const { delete i; // XXX: Farm these? }