/* Web Polygraph http://www.web-polygraph.org/
* (C) 2003-2006 The Measurement Factory
* Licensed under the Apache License, Version 2.0 */
#include "base/polygraph.h"
#include <ctype.h>
#include "xstd/gadgets.h"
#include "csm/XmlParser.h"
// Creates a parser with no input attached and an empty tail.
// parse() later points theStr/theEos at the caller's buffer and uses theTail
// to describe whatever trailing bytes did not form a complete node.
XmlParser::XmlParser(): theStr(0), theEos(0) {
    theTail.type = Node::tpNone;
    // Node's own construction is not visible from this file, so clear the
    // image pointer explicitly: together with imageLen == 0 this keeps the
    // tail from carrying an indeterminate pointer before the first parse().
    theTail.imageBeg = 0;
    theTail.imageLen = 0;
}
bool XmlParser::parse(const char *s, const char *eos) {
theNodes.reset();
theStr = s;
theEos = eos;
theTail.imageBeg = eos;
while (theStr < theEos) {
Node node;
node.imageBeg = theStr; // remember starting position
node.type = skipNode(); // advance and identify type, if possible
node.imageLen = theStr - node.imageBeg;
if (node.type == Node::tpNone) {
Assert(theStr == theEos);
theTail.imageBeg = node.imageBeg;
} else {
theNodes.append(node);
theTail.imageBeg = theStr;
}
}
theTail.imageLen = eos - theTail.imageBeg;
return theTail.imageLen == 0;
}
// Identifies the node starting at theStr and advances theStr past it.
// Anything not starting with '<' is text; "<!--" opens a comment; every
// other '<' opens a tag. Returns Node::tpNone when the buffer ends before
// the node kind can be decided (or before the node is complete).
XmlParser::Node::Type XmlParser::skipNode() {
    if (theStr >= theEos)
        return Node::tpNone;

    if (*theStr != '<')
        return skipText();
    ++theStr; // skip '<'

    // Match the rest of the comment opener one character at a time so that
    // a buffer ending in "<", "<!", or "<!-" is reported as incomplete
    // instead of being misclassified.
    const char *const opener = "!--";
    for (const char *expected = opener; *expected; ++expected) {
        if (theStr >= theEos)
            return Node::tpNone; // need more lookahead space
        if (*theStr != *expected)
            return skipTag(); // not a comment after all
        ++theStr; // skip the matched opener character
    }

    return skipComment();
}
// Advances theStr to the next '<' or to theEos, whichever comes first.
// Always reports Node::tpText: text running up to the end of the buffer
// is still treated as a complete node.
XmlParser::Node::Type XmlParser::skipText() {
    for (; theStr < theEos; ++theStr) {
        if (*theStr == '<')
            break; // start of the next markup node
    }
    return Node::tpText; // never fails
}
// Advances theStr just past the first '>' and returns Node::tpTag.
// If the buffer ends before a '>' is seen, theStr stops at theEos and
// Node::tpNone is returned to signal an incomplete tag.
XmlParser::Node::Type XmlParser::skipTag() {
    for (; theStr < theEos; ++theStr) {
        if (*theStr == '>') {
            ++theStr; // consume the closing '>'
            return Node::tpTag;
        }
    }
    return Node::tpNone;
}
// Scans for the end of a comment; skipNode() calls this after consuming
// the opening "<!--". The terminator is "--", optional whitespace, then ">".
// On success, theStr is left just past the '>' and Node::tpComment is
// returned. If the buffer ends first, the scan stops once theStr reaches
// theEos and Node::tpNone is returned.
XmlParser::Node::Type XmlParser::skipComment() {
    while (theStr < theEos) {
        // make progress; if we see "--", investigate further
        if (*theStr++ == '-' && theStr < theEos && *theStr == '-') {
            const char *p = theStr + 1;
            // SGML allows spaces between "--" and ">"; skip them, if any.
            // The cast matters: passing a negative char (any byte above
            // 0x7F where char is signed) to isspace() is undefined behavior.
            while (p < theEos && isspace(static_cast<unsigned char>(*p)))
                ++p;
            // end-of-comment found if we see ">"
            if (p < theEos && *p++ == '>') {
                theStr = p;
                return Node::tpComment;
            }
        }
    }
    return Node::tpNone;
}
// syntax highlighted by Code2HTML, v. 0.9.1