/* Web Polygraph http://www.web-polygraph.org/
* (C) 2003-2006 The Measurement Factory
* Licensed under the Apache License, Version 2.0 */
#include "base/polygraph.h"
#include <ctype.h>
#include "xstd/StrIdentifier.h"
#include "xstd/gadgets.h"
#include "base/AnyToString.h"
#include "csm/ContentDbase.h"
#include "csm/cdbEntries.h"
#include "csm/XmlParser.h"
#include "csm/XmlTagParser.h"
#include "csm/cdbBuilders.h"
/* CdbBuilder */
int CdbBuilder::TheLinkCount = 0;
CdbBuilder::CdbBuilder(): theDb(0), theBufB(0), theBufE(0) {
}
CdbBuilder::~CdbBuilder() {
}
void CdbBuilder::db(ContentDbase *aDb) {
Assert(!theDb && aDb);
theDb = aDb;
}
void CdbBuilder::configure(const String &aFname, const char *aBufB, const char *aBufE) {
theFname = aFname;
theBufB = aBufB;
theBufE = aBufE;
}
/* MarkupParser */
StrIdentifier *MarkupParser::TheReplIdentifier = 0;
Array<String*> MarkupParser::TheReplacements;
bool MarkupParser::parse() {
static XmlParser parser;
(void)parser.parse(theBufB, theBufE);
for (int i = 0; i < parser.nodeCount(); ++i) {
const XmlParser::Node &node = parser.node(i);
const String image = String(node.imageBeg, node.imageLen);
switch (node.type) {
case XmlParser::Node::tpText: {
CdbeText *e = new CdbeText;
e->image(image);
addEntry(e);
break;
}
case XmlParser::Node::tpTag: {
parseTag(image); // will call addEntry
break;
}
case XmlParser::Node::tpComment: {
CdbeComment *e = new CdbeComment;
e->image(image);
addEntry(e);
break;
}
default:
Assert(false);
}
}
if (!parser.nodeCount())
cerr << theFname << ": warning: no valid markup found" << endl;
if (parser.tail().imageLen) {
cerr << theFname << ": warning: ignoring markup leftovers starting at ";
cerr.write(parser.tail().imageBeg, Min(parser.tail().imageLen, 40));
cerr << endl;
}
return true;
}
void MarkupParser::addEntry(CdbEntry *e) {
theDb->add(e);
}
void MarkupParser::parseBlob(const String &blobImage) {
if (blobImage.len() > 0) {
CdbeBlob *e = new CdbeBlob;
e->image(blobImage);
addEntry(e);
}
}
void MarkupParser::parseTag(const String &tagImage) {
const char *tagB = tagImage.cstr();
const char *tagE = tagImage.cstr() + tagImage.len();
static XmlTagParser parser;
if (!parser.parse(tagB+1, tagE-1)) {
parseBlob(tagImage);
return;
}
if (!isalpha(*parser.tagname().nameBeg)) { // closing tag, comment, etc.
parseBlob(tagImage);
return;
}
String keyPfx;
keyPfx.append(parser.tagname().nameBeg, parser.tagname().nameLen);
keyPfx += '.';
// replace src="url" with src="/pg/embed/..."
String newImage;
const char *lastCopy = tagB;
for (int i = 0; i < parser.attrCount(); ++i) {
XmlTagParser::Token attr = parser.attr(i);
if (attr.valBeg && *attr.valBeg != '#') { // skip URLs pointing to self
const String keySfx = String(attr.nameBeg, attr.nameLen);
if (const String *replacement = AttrValReplacement(keyPfx, keySfx)) {
parseBlob(tagImage(lastCopy - tagB, attr.valBeg - tagB));
lastCopy = attr.valBeg + attr.valLen;
CdbeLink *link = new CdbeLink;
link->contentCategory = *replacement;
link->origImage = tagImage(attr.valBeg - tagB, lastCopy - tagB);
addEntry(link);
TheLinkCount++;
}
}
}
parseBlob(tagImage(lastCopy - tagB, tagE - tagB));
}
int MarkupParser::RegReplacement(const String &key, const String &ctype) {
Assert(TheReplIdentifier);
const int id = TheReplIdentifier->add(key);
TheReplacements.put(new String(ctype), id);
return id;
}
const String *MarkupParser::AttrValReplacement(const String &keyPfx, const String &keySfx) {
if (!TheReplIdentifier) {
TheReplIdentifier = new StrIdentifier;
const String embedHtml = "/pg/embed/html";
const String embedImage = "/pg/embed/image";
const String embedData = "/pg/embed/data";
// specific rules
RegReplacement("applet.archive", embedData);
RegReplacement("frame.src", embedHtml);
RegReplacement("iframe.src", embedHtml);
RegReplacement("img.src", embedImage);
RegReplacement("img.lowsrc", embedImage);
RegReplacement("img.usemap", embedHtml);
RegReplacement("input.src", embedImage);
RegReplacement("input.usemap", embedHtml);
RegReplacement("layer.src", embedHtml);
RegReplacement("object.data", embedData);
RegReplacement("script.src", embedHtml);
RegReplacement("link.href", embedData);
// more general catch-all rules for attributes
RegReplacement("background", embedImage);
RegReplacement("href", embedHtml);
RegReplacement("src", embedData);
RegReplacement("data", embedData);
TheReplIdentifier->optimize();
}
const String key = keyPfx + keySfx;
int idx = TheReplIdentifier->lookup(key);
if (idx <= 0)
idx = TheReplIdentifier->lookup(keySfx);
if (idx > 0) {
Assert(idx < TheReplacements.count());
return TheReplacements[idx];
}
return 0;
}
/* LinkOnlyParser */
LinkOnlyParser::LinkOnlyParser(): thePage(0) {
}
bool LinkOnlyParser::parse() {
thePage = new CdbePage;
theImage = String();
if (MarkupParser::parse()) {
flush();
theDb->add(thePage);
return true;
}
return false;
}
void LinkOnlyParser::addEntry(CdbEntry *e) {
if (e->type() == cdbeLink) {
flush();
thePage->add(e);
} else {
theImage += PrintToString(*e);
delete e;
}
}
void LinkOnlyParser::flush() {
if (theImage.len() > 0) {
CdbeBlob *e = new CdbeBlob;
e->image(theImage);
thePage->add(e);
theImage = String();
}
}
/* VerbatimParser */
bool VerbatimParser::parse() {
if (theBufB < theBufE) {
CdbeBlob *e = new CdbeBlob;
e->image(String(theBufB, theBufE-theBufB));
theDb->add(e);
return true;
}
cerr << theFname << ": warning: empty verbatim file" << endl;
return true;
}
syntax highlighted by Code2HTML, v. 0.9.1