/* Web Polygraph http://www.web-polygraph.org/ * (C) 2003-2006 The Measurement Factory * Licensed under the Apache License, Version 2.0 */ #include "base/polygraph.h" #include #include "xstd/gadgets.h" #include "csm/XmlParser.h" XmlParser::XmlParser(): theStr(0), theEos(0) { theTail.type = Node::tpNone; theTail.imageLen = 0; } bool XmlParser::parse(const char *s, const char *eos) { theNodes.reset(); theStr = s; theEos = eos; theTail.imageBeg = eos; while (theStr < theEos) { Node node; node.imageBeg = theStr; // remember starting position node.type = skipNode(); // advance and identify type, if possible node.imageLen = theStr - node.imageBeg; if (node.type == Node::tpNone) { Assert(theStr == theEos); theTail.imageBeg = node.imageBeg; } else { theNodes.append(node); theTail.imageBeg = theStr; } } theTail.imageLen = eos - theTail.imageBeg; return theTail.imageLen == 0; } XmlParser::Node::Type XmlParser::skipNode() { // make sure an incomplete "<" or "= theEos) return Node::tpNone; if (*theStr != '<') return skipText(); ++theStr; // skip '<' if (theStr >= theEos) return Node::tpNone; // need more lookahead space if (*theStr != '!') return skipTag(); ++theStr; // skip '!' if (theStr >= theEos) return Node::tpNone; // need more lookahead space if (*theStr != '-') return skipTag(); ++theStr; // skip first '-' if (theStr >= theEos) return Node::tpNone; // need more lookahead space if (*theStr != '-') return skipTag(); ++theStr; // skip second '-' return skipComment(); } XmlParser::Node::Type XmlParser::skipText() { while (theStr < theEos && *theStr != '<') ++theStr; return Node::tpText; // never fails } XmlParser::Node::Type XmlParser::skipTag() { while (theStr < theEos && *theStr != '>') ++theStr; if (theStr < theEos) { ++theStr; return Node::tpTag; } return Node::tpNone; } XmlParser::Node::Type XmlParser::skipComment() { while (theStr < theEos) { // make progress; if we see "--", investigate further if (*theStr++ == '-' && theStr < theEos && *theStr == '-') { const char *p = theStr + 1; // SGML allows spaces between "--" and ">"; skip them, if any while (p < theEos && isspace(*p)) ++p; // end-of-comment found if we see ">" if (p < theEos && *p++ == '>') { theStr = p; return Node::tpComment; } } } return Node::tpNone; }