/* Web Polygraph http://www.web-polygraph.org/
* (C) 2003-2006 The Measurement Factory
* Licensed under the Apache License, Version 2.0 */
#include "base/polygraph.h"
#include <ctype.h>
#include "xstd/Rnd.h"
#include "xstd/gadgets.h"
#include "runtime/LogComment.h"
#include "runtime/ErrorMgr.h"
#include "runtime/polyErrors.h"
#include "client/CltXact.h"
#include "client/CltOpts.h"
#include "client/ParseBuffer.h"
#include "client/UriScriptBodyParser.h"
// commonly used RE patterns
static const String ptnConst = "'([^']*)'";
static const String ptnName = "([[:alnum:]_]+)";
static const String ptnSpace = "[[:space:]]*";
static const String ptnEq = ptnSpace + "=" + ptnSpace;
RegEx UriScriptBodyParser::TheWre;
RegEx UriScriptBodyParser::TheXre;
RegEx UriScriptBodyParser::TheYre;
RegEx UriScriptBodyParser::TheZre;
BodyParserFarmT<UriScriptBodyParser> UriScriptBodyParser::TheParsers;
BodyParser *UriScriptBodyParser::GetOne(CltXact *anOwner, const CltCfg *aCfg) {
if (!TheParsers.capacity())
TheParsers.limit(1024);
UriScriptBodyParser *parser = TheParsers.getTyped();
parser->configure(anOwner, aCfg);
return parser;
}
UriScriptBodyParser::UriScriptBodyParser() {
// configure static REs if needed
if (!TheXre.configured()) {
ConfigureScriptVarRe(TheWre, "w");
ConfigureScriptVarRe(TheXre, "x");
ConfigureScriptVarRe(TheYre, "y");
ConfigureScriptVarRe(TheZre, "z");
}
resetSelf();
}
void UriScriptBodyParser::reset() {
resetSelf();
BodyParser::reset();
}
void UriScriptBodyParser::resetSelf() {
theCfg = 0;
theScriptBeg = theScriptEnd = 0;
theState = stNone;
}
void UriScriptBodyParser::configure(CltXact *anOwner, const CltCfg *aCfg) {
BodyParser::configure(anOwner);
Check(!theCfg && aCfg);
theCfg = aCfg;
}
BodyParserFarm &UriScriptBodyParser::farm() const {
return TheParsers;
}
Size UriScriptBodyParser::parse(const ParseBuffer &buf) {
if (theState == stNone)
theState = stOpen;
if (theState == stOpen)
parseOpen(buf);
if (theState == stClose)
parseClose(buf);
if (theState == stBody)
parseBody(buf);
if (theState == stDone)
return parseAny(buf);
return 0; // need more data
}
void UriScriptBodyParser::parseOpen(const ParseBuffer &buf) {
static const String pfx = "<script";
if (buf.size() < pfx.len())
return; // need more data
if (strncmp(buf.data(), pfx.cstr(), pfx.len()) == 0)
theState = stClose;
else
theState = stDone;
}
void UriScriptBodyParser::parseClose(const ParseBuffer &buf) {
static const String sfx = "</script";
if (buf.size() < sfx.len())
return; // need more data
const char *bufEnd = buf.data() + buf.size();
theScriptBeg = buf.data();
theScriptEnd = StrBoundStr(theScriptBeg, sfx.cstr(), bufEnd);
if (theScriptEnd)
theState = stBody;
}
void UriScriptBodyParser::parseBody(const ParseBuffer &buf) {
if (Should(theScriptBeg) && Should(theScriptEnd))
parseVars();
theState = stDone;
}
void UriScriptBodyParser::parseVars() {
String w, x;
if (parseVar("w", TheWre, w) && parseVar("x", TheXre, x)) {
// XXX: kludge to do 95%/5% split
static RndGen rng;
const bool formerFrameless = rng.event(0.95);
forwardUri(x + (formerFrameless ? "/1-": "/2-") + w);
}
String y;
if (parseVar("y", TheYre, y))
forwardUri(y);
String z;
if (parseVar(0, TheZre, z))
forwardUri(z);
}
bool UriScriptBodyParser::parseVar(const char *name, const RegEx &re, String &value) {
static RegEx::Matches matches(1+1);
static const int expectedCount = matches.capacity();
matches.reset();
matches.append(RegEx::StartEnd(0, theScriptEnd - theScriptBeg));
if (re.match(theScriptBeg, matches, RegEx::reStartEnd) &&
Should(matches.count() == expectedCount)) {
const RegEx::Match &m = matches[1];
if (m.rm_so >= 0 && m.rm_eo > m.rm_so) {
value = String(theScriptBeg + m.rm_so, m.rm_eo - m.rm_so);
return true;
}
}
if (name) { // client asked to report errors
const Error &err = errForeignTag;
if (!TheCltOpts.ignoreBadContTags && ReportError(err)) {
dumpContext(Comment << "undefined or malformed variable '" <<
name << "' in the following emdedded script",
theScriptBeg, theScriptEnd - theScriptBeg) << endc;
}
}
return false;
}
void UriScriptBodyParser::forwardUri(const String &uri) {
Error err;
const char *parsep = uri.cstr();
ReqHdr hdr;
if (hdr.parseUri(parsep, parsep+uri.len(), hdr.theUri))
err = theOwner->noteEmbedded(hdr);
else
err = errForeignTag;
if (err) {
if (!TheCltOpts.ignoreBadContTags && ReportError(err)) {
dumpContext(Comment << "unparseable recreated URL: ",
uri.data(), uri.len()) << endc;
}
} else {
static int reportCount = 0;
if (!reportCount++) {
dumpContext(Comment << "fyi: first URL extracted " <<
"from a script: ", uri.data(), uri.len()) << endc;
}
}
}
Size UriScriptBodyParser::parseAny(const ParseBuffer &buf) {
const Size parsedSize = buf.size();
theOwner->noteContent(buf);
return parsedSize;
}
void UriScriptBodyParser::noteLeftovers(const ParseBuffer &leftovers) {
Should(theState == stOpen || theState == stClose);
if (theState == stClose && ReportError(errContentLeftovers)) {
const char *what = theState == stOpen ? "beginning" : "end";
dumpContext(Comment << "unable to locate the " << what <<
" of a URI-setting embedded <script> in the remaining " <<
leftovers.size() << " of content leftovers near " << endl,
leftovers.data(), leftovers.size()) << endc;
}
theOwner->noteContent(leftovers);
}
void UriScriptBodyParser::noteOverflow(const ParseBuffer &buf) {
if (ReportError(errHugeContentToken)) {
dumpContext(Comment << "huge URI-setting <script> context near ",
buf.data(), buf.size()) << endc;
}
theOwner->noteContent(buf);
}
void UriScriptBodyParser::ConfigureScriptVarRe(RegEx &re, const String &name) {
String pattern;
pattern += name;
pattern += ptnEq;
pattern += ptnSpace;
pattern += ptnConst;
re.configure(name + " RE", pattern);
Should(re);
}
syntax highlighted by Code2HTML, v. 0.9.1