/* Web Polygraph http://www.web-polygraph.org/
* (C) 2003-2006 The Measurement Factory
* Licensed under the Apache License, Version 2.0 */
#include "base/polygraph.h"
#include <limits.h>
#include "xstd/h/iostream.h"
#include "xstd/h/sstream.h"
#include "xstd/h/iomanip.h"
#include "xstd/ZLib.h"
#include <fstream>
#include "xstd/Rnd.h"
#include "xstd/gadgets.h"
#include "base/RndPermut.h"
#include "base/BStream.h"
#include "base/rndDistrStat.h"
#include "base/ContTypeStat.h"
#include "base/polyLogCats.h"
#include "runtime/IOBuf.h"
#include "runtime/LogComment.h"
#include "runtime/ErrorMgr.h"
#include "runtime/polyErrors.h"
#include "runtime/httpHdrs.h"
#include "runtime/httpText.h"
#include "pgl/MimeSym.h"
#include "pgl/ContentSym.h"
#include "csm/EmbedContMdl.h"
#include "csm/RndBodyIter.h"
#include "csm/CdbBodyIter.h"
#include "csm/ContainerBodyIter.h"
#include "csm/InjectIter.h"
#include "csm/GzipEncoder.h"
#include "csm/cdbEntries.h"
#include "csm/ContentDbase.h"
#include "csm/TextDbase.h"
#include "csm/ObjLifeCycle.h"
#include "csm/ContentCfg.h"
// kind label that ContentCfg::configure() falls back to when the PGL
// content symbol does not supply a kind of its own
static const String DefaultContentCfgKind = "some-content";
// Creates an unconfigured content configuration identified by anId.
// Ratios and probabilities start at -1, pointers to optional helpers
// start at 0; configure() is expected to fill them in later.
ContentCfg::ContentCfg(int anId):
	theObjLifeCycle(new ObjLifeCycle), theChbRatio(-1), theChecksumRatio(-1),
	theUniqueRatio(-1), theSize(0),
	theEmbedCont(0), theCdb(0),
	theTdb(0), theInjGap(0), theInfProb(-1),
	theExtSel(0), thePfxSel(0),
	theNewPerOid(0), theId(anId),
	theEncodings(new int[codingEnd]) {
	// new int[] leaves the slots indeterminate; mark every coding as
	// disabled (negative) so that multipleContentCodings() and
	// calcContentCoding() are well-defined even before configure() runs
	// and for codings that configureEncodings() never touches
	for (int i = 0; i < codingEnd; ++i)
		theEncodings[i] = -1;
}
// Releases everything this object allocated with new: the life cycle and
// encodings table from the constructor, plus the embedding model and the
// content/text databases that configure() may have created (these are 0,
// and thus safe to delete, when they were not configured).
// NOTE(review): theSize, theExtSel, and thePfxSel are obtained from PGL
// symbols; their ownership is not visible here, so they are left alone.
ContentCfg::~ContentCfg() {
	delete theTdb;
	delete theCdb;
	delete theEmbedCont;
	delete theObjLifeCycle;
	delete[] theEncodings;
}
void ContentCfg::configure(const ContentSym &cfg) {
theKind = cfg.kind();
if (!theKind)
theKind = DefaultContentCfgKind;
if (MimeSym *mime = cfg.mime()) {
theMimeType = mime->mimeType();
mime->extensions(theExtensions, theExtSel);
mime->prefixes(thePrefixes, thePfxSel);
}
theSize = cfg.size();
theObjLifeCycle->configure(cfg.objLifeCycle());
cfg.cachable(theChbRatio);
cfg.checksum(theChecksumRatio);
if (cfg.unique(theUniqueRatio) && theUniqueRatio >= 0) {
// form unique prefix to be used to make some content unique
const int sharedId = GlbPermut(rndSharedContent);
char buf[64];
ofixedstream os(buf, sizeof(buf));
os << 'u' << hex << setfill('0') <<
setw(8) << sharedId << '.' <<
setw(2) << theId << '/' << ends;
os.flush();
theCommonPrefix = buf;
const double commonRatio = 1 - theUniqueRatio;
Comment << "fyi: " << (100*commonRatio) << "% of '" <<
theKind << "' content (id " << theId <<
") will be identical; common prefix: " << theCommonPrefix <<
endc;
}
if (cfg.hasEmbed()) {
theEmbedCont = new EmbedContMdl;
theEmbedCont->configure(&cfg);
}
if (const String &cdbName = cfg.cdb()) {
ifstream f(cdbName.cstr());
IBStream is;
is.configure(&f, cdbName);
theCdb = new ContentDbase;
theCdb->load(is);
if (!is.good())
Comment << "error: cannot load content database from '"
<< cdbName << "'; " << Error::Last() << endc << xexit;
if (!theCdb->count())
Comment << "error: no entries in '" << cdbName
<< "' content database" << endc << xexit;
}
//if (theEmbedCont && theCdb)
// Comment << "error: content cfg `" << theKind << "': "
// << "cannot support containers together with content_db yet; "
// << "do not use may_contain with content_db"
// << endc << xexit;
if (!theSize && !theCdb)
Comment << "error: content cfg `" << theKind << "': "
<< "has neither size distribution nor content_db; "
<< "either one or both must be specified for "
<< "Polygaph to know what object sizes this content type "
<< "should generate"
<< endc << xexit;
if (const String &tdbName = cfg.injectDb()) {
ifstream f(tdbName.cstr());
theTdb = new TextDbase;
theTdb->load(f);
if (f.bad())
Comment << "error: cannot load text database from `"
<< tdbName << "; " << Error::Last() << endc << xexit;
if (!theTdb->count())
Comment << "error: text database `"
<< tdbName << " appears to be empty" << endc << xexit;
}
theInjGap = cfg.injectGap();
// XXX: we should put all inject* fields into one PGL object
if (cfg.infectProb(theInfProb) != (theTdb != 0) ||
(theTdb != 0) != (theInjGap != 0)) {
Comment << "error: content cfg `" << theKind << "': "
<< "`inject_prob' requires `inject_src' requires `inject_gap'"
<< " and vice versa"
<< endc << xexit;
}
configureEncodings(cfg);
if (!ContTypeStat::Kinds().count())
ContTypeStat::RecordKind(0, "foreign");
ContTypeStat::RecordKind(id(), kind());
}
// Fills theEncodings from the content encodings listed in the PGL symbol.
// A negative slot means the coding is not offered; identity is stored as 0
// and gzip as 6 (the value wrapBodyIter() later passes to GzipEncoder as
// the gzip level). When the symbol lists no encodings, only identity is
// enabled. Unknown codings, or gzip without Deflator support, are reported
// via Comment followed by xexit.
void ContentCfg::configureEncodings(const ContentSym &cfg) {
	theEncodings[codingIdentity] = theEncodings[codingGzip] = -1;

	Strings encodings;
	if (!cfg.encodings(encodings)) {
		// nothing configured: offer identity only
		theEncodings[codingIdentity] = 0;
		return;
	}

	for (int i = 0; i < encodings.count(); ++i) {
		const String &encoding = *encodings[i];
		if (encoding == "identity") {
			theEncodings[codingIdentity] = 0;
		} else if (encoding == "gzip") {
			if (Deflator::Supported) {
				theEncodings[codingGzip] = 6;
			} else {
				Comment << "error: support for 'gzip' content encoding " <<
					"(found in content cfg '" << theKind << "') has been " <<
					"disabled" << endc << xexit;
			}
		} else {
			Comment << "error: unknown content encoding '" << encoding <<
				"' in content cfg '" << theKind << "'; known codings are " <<
				"'identity' and 'gzip'" << endc << xexit;
		}
	}
}
// Selects one of the configured URL extensions for the given seed;
// the choice is delegated to pickStr() with the extension selector.
const String &ContentCfg::url_ext(int seed) const {
	const String &extension = pickStr(theExtensions, theExtSel, seed);
	return extension;
}
// Selects one of the configured URL prefixes for the given seed;
// the choice is delegated to pickStr() with the prefix selector.
const String &ContentCfg::url_pfx(int seed) const {
	const String &prefix = pickStr(thePrefixes, thePfxSel, seed);
	return prefix;
}
// Returns the mean response size for this content type: the mean of the
// configured size distribution when there is one, otherwise the mean
// entry size of the content database. At least one must be configured.
double ContentCfg::repSizeMean() const {
	Assert(theSize || theCdb);

	if (!theSize)
		return theCdb->entrySizeMean();

	return RndDistrStat(theSize).mean();
}
// Tells whether both identity and gzip codings are enabled
// (an enabled coding has a non-negative entry in theEncodings).
bool ContentCfg::multipleContentCodings() const {
	const bool identityEnabled = theEncodings[codingIdentity] >= 0;
	const bool gzipEnabled = theEncodings[codingGzip] >= 0;
	return identityEnabled && gzipEnabled;
}
void ContentCfg::calcTimes(const ObjId &oid, ObjTimes ×) const {
const int seed = GlbPermut(oid.hash(), rndRepOlc);
theObjLifeCycle->calcTimes(seed, times);
}
// Chooses the content coding for oid based on what this content type
// offers and what the request accepts. Gzip is tried first, identity
// second. Returns false, leaving oid untouched, when neither fits.
bool ContentCfg::calcContentCoding(ObjId &oid, const ReqHdr &req) const {
	if (theEncodings[codingGzip] >= 0 && req.acceptedEncoding(codingGzip)) {
		oid.gzipContent(true);
		return true;
	}

	if (theEncodings[codingIdentity] >= 0 && req.acceptedEncoding(codingIdentity)) {
		oid.gzipContent(false);
		return true;
	}

	return false;
}
// Computes the raw (not content-encoded) response size for oid.
// Without a size distribution, the size comes from the content database
// entry selected for oid. With one, a trial seeded by the content hash is
// rounded up and clamped between the content prefix size and a cap
// slightly below INT_MAX.
Size ContentCfg::calcRawRepSize(const ObjId &oid) const {
	Assert(theSize || theCdb);

	if (!theSize) {
		const int entryIdx = selectCdbStart(oid);
		CdbEntryPrnOpt opt;
		// assume that buf, injector, and rng are not needed
		opt.embed.model = theEmbedCont;
		opt.embed.container = oid;
		opt.sizeMax = Size(INT_MAX); // Size::Max();
		opt.entryOff = 0;
		return theCdb->entry(entryIdx)->size(opt);
	}

	// reseed so that the same content always gets the same size
	const int sizeSeed = GlbPermut(contentHash(oid), rndRepSize);
	theSize->rndGen()->seed(sizeSeed);
	const double trialSize = theSize->trial();

	// lower bound: the content prefix must fit;
	// upper bound: prevent int overflows and leave room for headers
	const Size contentPrefixSize = calcContentPrefixSize(oid);
	const double lowerBound = static_cast<double>(contentPrefixSize);
	const double upperBound = static_cast<double>(INT_MAX) - 100*1024;
	Size sz = static_cast<int>(MiniMax(lowerBound, ceil(trialSize), upperBound));

	// paranoid sanity checks
	if (!Should(sz >= contentPrefixSize))
		sz = contentPrefixSize;
	if (!Should(sz >= 0))
		sz = 0;

	return sz;
}
bool ContentCfg::calcCachability(const ObjId &oid) const {
const int seed = GlbPermut(oid.hash(), rndRepCach);
RndGen rng(seed);
return rng.event(theChbRatio);
}
bool ContentCfg::calcChecksumNeed(const ObjId &oid) const {
const int seed = GlbPermut(oid.hash(), rndRepCheckNeed);
RndGen rng(seed);
return rng.event(theChecksumRatio);
}
// Returns the size of the content prefix that pourContentPrefix() would
// produce for oid, without keeping the prefix itself.
Size ContentCfg::calcContentPrefixSize(const ObjId &oid) const {
	const int uniqueness = contentUniqueness(oid);

	if (uniqueness == cuUnique) {
		// the unique prefix has no precomputed length: pour it into a
		// scratch buffer and report how much was written
		IOBuf scratch;
		return pourUniqueContentPrefix(oid, scratch);
	}

	if (uniqueness == cuCommon)
		return theCommonPrefix.len();

	// cuChance and anything unexpected: no prefix
	return 0;
}
// Appends the content prefix for oid (if any) to buf and returns the
// number of bytes added: a per-object prefix for unique content, the
// shared common prefix for common content, nothing otherwise.
Size ContentCfg::pourContentPrefix(const ObjId &oid, IOBuf &buf) const {
	const int uniqueness = contentUniqueness(oid);

	if (uniqueness == cuUnique)
		return pourUniqueContentPrefix(oid, buf);

	if (uniqueness == cuCommon) {
		buf.append(theCommonPrefix.data(), theCommonPrefix.len());
		return theCommonPrefix.len();
	}

	// cuChance and anything unexpected: no prefix
	return 0;
}
int ContentCfg::contentUniqueness(const ObjId &oid) const {
if (theUniqueRatio < 0)
return cuChance; // default: leave it to chance or other factors
// no sense in generating content [prefix] for foreign oids
if (!Should(!oid.foreignUrl() && !oid.foreignSrc()))
return cuChance;
const int seed = GlbPermut(oid.hash(), rndUniqueContent);
RndGen rng(seed);
return rng.event(theUniqueRatio) ? cuUnique : cuCommon;
}
// Internal helper for content classified as unique: formats a prefix from
// the oid's viserv, world, and type directly into the free space of buf,
// marks that many bytes as appended, and returns the prefix size.
Size ContentCfg::pourUniqueContentPrefix(const ObjId &oid, IOBuf &buf) const {
	// mimic Oid2Url() but do not use TheViservs and such, just indices
	ofixedstream os(buf.space(), buf.spaceSize());

	// hex and the '0' fill stay in effect for all fields below;
	// each setw() applies to the next inserted item only
	os << 'u' << hex << setfill('0');
	os << 'v' << setw(3) << oid.viserv() << '/';
	os << 'w' << oid.world() << '/';
	os << 't' << setw(2) << oid.type() << '/';
	os << '_' << setw(8) << ' ';
	os.flush();

	const Size size = static_cast<std::streamoff>(os.tellp());
	Should(size < buf.spaceSize()); // otherwise may be too big
	buf.appended(size);
	return size;
}
// Returns the hash that content-related random choices are seeded with:
// the content type id for common content (so that all such objects share
// the same choices), the object's own hash otherwise.
int ContentCfg::contentHash(const ObjId &oid) const {
	if (contentUniqueness(oid) != cuCommon)
		return oid.hash();

	return theId;
}
int ContentCfg::selectCdbStart(const ObjId &oid) const {
Assert(theCdb);
RndGen rng(GlbPermut(contentHash(oid), rndCdbStart));
return rng(0, theCdb->count());
}
// Picks one string from `strings` using the selector distribution `sel`
// reseeded with `seed`. Returns a shared null string when the list is
// empty; `sel` is not used in that case.
const String &ContentCfg::pickStr(const Strings &strings, RndDistr *sel, int seed) const {
	static String noStr = 0;

	const int count = strings.count();
	if (!count)
		return noStr;

	sel->rndGen()->seed(seed);
	const int idx = static_cast<int>(sel->trial());
	Assert(0 <= idx && idx < count);
	return *strings[idx];
}
// Returns 1.0 when cc is this very content type (same id); otherwise
// forwards the question to the embedding model, or returns 0.0 when
// there is no embedding model.
double ContentCfg::compContPerCall(const ContentCfg *cc) const {
	const bool sameCfg = cc->id() == id();
	if (sameCfg)
		return 1.0;

	if (!theEmbedCont)
		return 0.0;

	return theEmbedCont->compContPerCall(cc);
}
// Forwards the notification about cc's new probability to the embedding
// model, unless cc is this very content type or there is no embedding
// model.
void ContentCfg::noteNewContProb(ContentCfg *cc, double newProb) {
	if (cc->id() != id() && theEmbedCont)
		theEmbedCont->noteNewContProb(cc, newProb);
}
// Stores the given value in theNewPerOid.
void ContentCfg::newPerOid(double aNewPerOid) {
// only strictly positive values are accepted
Assert(aNewPerOid > 0);
theNewPerOid = aNewPerOid;
}
// Creates and configures a body iterator for oid. The iterator type
// depends on what is configured: a content database iterator (optionally
// with embedding and text injection), a container iterator when only an
// embedding model exists, or a random body iterator otherwise. When the
// oid calls for gzip content, the iterator is wrapped in a gzip encoder.
// The result is allocated with new; see putBodyIter() for disposal.
// XXX: iterators should be farmed, but it is hard because they
// come in different types (perhaps somebody else should farm them?)
BodyIter *ContentCfg::getBodyIter(const ObjId &oid) {
	BodyIter *res = 0;

	if (theCdb) {
		CdbBodyIter *cdbIter = new CdbBodyIter;
		cdbIter->cdb(theCdb);
		if (theEmbedCont)
			cdbIter->embedContModel(theEmbedCont);
		cdbIter->startPos(selectCdbStart(oid));

		if (theTdb) {
			// should we inject this object?
			RndGen injCoin(GlbPermut(contentHash(oid), rndInjProb));
			if (injCoin.event(theInfProb)) {
				InjectIter *injector = new InjectIter; // XXX: Farm these!
				injector->creator(this);
				injector->textDbase(theTdb);
				injector->gap(theInjGap);
				cdbIter->injector(injector);
			}
		}

		res = cdbIter;
	} else if (theEmbedCont) {
		ContainerBodyIter *contIter = new ContainerBodyIter;
		contIter->embedContModel(theEmbedCont);
		res = contIter;
	} else {
		res = new RndBodyIter;
	}

	if (!Should(res))
		return res;

	// keep in sync with GzipEncoder ctor
	res->contentCfg(this);
	res->oidCfg(oid, contentHash(oid));
	res->contentSize(calcRawRepSize(oid));

	return oid.gzipContent() ? wrapBodyIter(res) : res;
}
// Wraps a configured body iterator in a gzip encoder whose content size
// is set to the exact encoded size. That size is obtained by running a
// throw-away encoder over the whole body first.
BodyIter *ContentCfg::wrapBodyIter(BodyIter *prime) {
	const int level = theEncodings[codingGzip];

	// we have to actually encode to give the caller the exact encoded size
	GzipEncoder sizer(level, prime);
	WrBuf scratch;
	sizer.start(&scratch);
	// the encoded bytes are not needed, so the buffer is recycled each round
	for (; sizer; scratch.reset())
		sizer.pour(); // XXX: what if it fails?
	const Size encodedSize = sizer.builtSize();

	// NOTE(review): presumably detaches prime so that the temporary
	// encoder does not dispose of it -- confirm in GzipEncoder
	sizer.abandonProducer();

	BodyIter *wrapped = new GzipEncoder(level, prime);
	wrapped->contentSize(encodedSize);
	return wrapped;
}
// Disposes of a body iterator: stops it and then deletes it.
// getBodyIter() in this file allocates such iterators with new.
void ContentCfg::putBodyIter(BodyIter *i) const {
i->stop();
delete i;
}
// Disposes of an injector by deleting it.
// getBodyIter() in this file allocates injectors with new and registers
// this object as their creator.
void ContentCfg::putInjector(InjectIter *i) const {
delete i; // XXX: Farm these?
}
// syntax highlighted by Code2HTML, v. 0.9.1