/* Web Polygraph http://www.web-polygraph.org/ * (C) 2003-2006 The Measurement Factory * Licensed under the Apache License, Version 2.0 */ #include "xstd/xstd.h" #include "xstd/h/iomanip.h" #include "xstd/String.h" #include "xstd/StrIdentifier.h" /* StrIdentifier */ StrIdentifier::StrIdentifier(): theCount(0), theLastId(0), isLocked(false) { } StrIdentifier::~StrIdentifier() { while (theStrings.count()) delete theStrings.pop(); } int StrIdentifier::add(const String &str) { return doAdd(str, ++theLastId); } int StrIdentifier::add(const String &str, int id) { Assert(theLastId != id); // weak check for uniqueness return doAdd(str, theLastId = id); } int StrIdentifier::doAdd(const String &str, int id) { Assert(!isLocked); // no additions if optimize()-d String *copy = new String(str); theHead.add(copy, id, -1); theStrings.put(copy, id); theCount++; return id; } StrIdentifierIter StrIdentifier::iterator() const { return StrIdentifierIter(*this); } int StrIdentifier::lookup(const String &str) const { return lookup(str.data(), str.len()); } int StrIdentifier::lookup(const char *buf, int len) const { return theHead.lookup(buf, len); } // collapses single-branch paths void StrIdentifier::optimize() { // new entries cannot be added from now on // note: we can support run-time additions, but Node::add // must not assume sequntial lookup positions isLocked = true; theHead.optimize(); } void StrIdentifier::report(ostream &os) const { theHead.report(os, 0); } /* StrIdentifierIter */ StrIdentifierIter::StrIdentifierIter(const StrIdentifier &anIdfr): theIdfr(anIdfr), thePos(0) { sync(); } StrIdentifierIter &StrIdentifierIter::operator ++() { if (!atEnd()) { ++thePos; sync(); } return *this; } void StrIdentifierIter::sync() { while (thePos < theIdfr.theStrings.count() && !theIdfr.theStrings[thePos]) { ++thePos; } } bool StrIdentifierIter::atEnd() const { return thePos >= theIdfr.theStrings.count(); } const String &StrIdentifierIter::str() const { Assert(!atEnd()); return *theIdfr.theStrings[thePos]; } int StrIdentifierIter::id() const { Assert(!atEnd()); return thePos; // same as id in this StrIdentifier implementation } /* StrIdNode */ StrIdNode::StrIdNode(): theId(0) { u.theTab = 0; u.theStr = 0; } void StrIdNode::add(String *str, int id, int myPos) { if (!theId) { // virgin node Assert(id > 0 && str); theId = id; u.theStr = str; } else if (theId > 0) { // conflict, create next level table String *s = u.theStr; u.theTab = new StrIdTable(myPos+1); u.theTab->add(s, theId); u.theTab->add(str, id); theId = -1; } else { // intermediate node, move further u.theTab->add(str, id); } } int StrIdNode::lookup(const char *buf, int len) const { if (!theId) return 0; else if (theId > 0) return u.theStr->casePrefixOf(buf, len) ? theId : 0; else return u.theTab->lookup(buf, len); } void StrIdNode::optimize() { // shrink outgoing branch link by link until hit a fan-out StrIdNode n; while (theId < 0 && u.theTab->single(n)) { delete u.theTab; *this = n; } // recurse if needed if (theId < 0) u.theTab->optimize(); } void StrIdNode::report(ostream &os, int myLevel) const { if (!theId) os << "(0) " << endl; else if (theId > 0) os << '(' << theId << ") " << *u.theStr << endl; else if (theId < 0) u.theTab->report(os, myLevel+1); } /* StrIdTable */ StrIdTable::StrIdTable(int aPos): theLookupPos(aPos) { Assert(0 <= theLookupPos && theLookupPos <= 255); } void StrIdTable::add(String *str, int id) { const int idx = lookupIdx(*str); if (idx >= theNodes.capacity()) { // out of bounds theNodes.stretch(idx+1); theNodes.count(theNodes.capacity()); } theNodes[idx].add(str, id, theLookupPos); } int StrIdTable::lookup(const char *buf, int len) const { const int idx = lookupIdx(buf, len); if (idx < 0) // buffer is too short already return 0; else if (idx >= theNodes.count()) // lookup char is out of bounds return 0; else return theNodes[idx].lookup(buf, len); } void StrIdTable::optimize() { for (int i = 0; i < theNodes.count(); ++i) { if (!theNodes[i].dead()) theNodes[i].optimize(); } } bool StrIdTable::single(StrIdNode &n) { // to optimize we could've keep track on the number of live nodes // but it is combersome because of node conflicts/propogation int idx = -1; for (int i = 0; i < theNodes.count(); ++i) { if (!theNodes[i].dead()) { if (idx >= 0) return false; // more than one found idx = i; } } if (idx < 0) // none found return false; n = theNodes[idx]; return true; } void StrIdTable::report(ostream &os, int myLevel) const { os << endl; for (int i = 0; i < theNodes.count(); ++i) { if (!theNodes[i].dead()) { os << setw(2*myLevel) << " "; if (i > 32) os << (char)i; else os << '<' << i << '>'; os << ": "; theNodes[i].report(os, myLevel); } } }