/* Web Polygraph http://www.web-polygraph.org/ * (C) 2003-2006 The Measurement Factory * Licensed under the Apache License, Version 2.0 */ #ifndef POLYGRAPH__XSTD_STRIDENTIFIER_H #define POLYGRAPH__XSTD_STRIDENTIFIER_H #include #include "xstd/String.h" #include "xstd/Array.h" #include "xstd/gadgets.h" // StrIdentifier is a String->int map. // // StrIdentifier look-up method checks if a given buffer starts // with one of the "known" strings. If it does, an "id" of the // corresponding string is returned. // No "known" string can be prefix of the other -- we do not // want to search for the longest prefix and such... // A buffer is not expected to be zero-terminated. // The search algorithm is very efficient. There is probably // no optimal static (no caching, re-grouping) algorithm. // note: the search tree does not de-allocate its parts on destruction class StrIdentifier; // the user interface class, declared last class StrIdTable; class String; // a node on the search tree (leaf or not) class StrIdNode { public: StrIdNode(); bool dead() const { return theId == 0; } bool leaf() const { return theId > 0; } int id() const { return theId; } int lookup(const char *buf, int len) const; void add(String *str, int id, int myLevel); void optimize(); void report(ostream &os, int myLevel) const; protected: union { StrIdTable *theTab; // lookup table (intermediate nodes) String *theStr; // known-string (leaf nodes) } u; int theId; // id > 0 => leaf, id < 0 => intermed }; // collection of nodes indexed by a char at a given search pos class StrIdTable { public: StrIdTable(int aPos); int lookup(const char *buf, int len) const; void add(String *str, int id); void optimize(); bool single(StrIdNode &n); void report(ostream &os, int myLevel) const; protected: inline int lookupIdx(const String &str) const; // returns -1 if len <= theLookupPos inline int lookupIdx(const char *buf, int len) const; protected: Array theNodes; int theLookupPos; }; // iterator for StrIdentifier class StrIdentifierIter { public: StrIdentifierIter(const StrIdentifier &anIdfr); operator void*() const { return atEnd() ? (void*)0 : (void*)-1; } bool atEnd() const; StrIdentifierIter &operator ++(); const String &str() const; int id() const; protected: void sync(); protected: const StrIdentifier &theIdfr; int thePos; }; // user interface: build(add everything), optimize, look-up. class StrIdentifier { public: friend class StrIdentifierIter; typedef StrIdentifierIter Iter; public: StrIdentifier(); ~StrIdentifier(); int count() const { return theCount; } int lookup(const char *buf, int len) const; int lookup(const String &str) const; const String &string(int id) const { return *theStrings[id]; } Iter iterator() const; // to be safe, use one or the other of a given class: int add(const String &str); // next available id int add(const String &str, int id); // small user-specified id // call this after all additions void optimize(); // prints a search tree void report(ostream &os) const; protected: int doAdd(const String &str, int id); protected: Array theStrings; // reverse index (id -> str) StrIdNode theHead; int theCount; int theLastId; bool isLocked; // additions are fatal, set in optimize() }; /* inlined methods */ inline int StrIdTable::lookupIdx(const String &str) const { return str.len() > theLookupPos ? xord(toupper(str[theLookupPos])) : theLookupPos; } inline int StrIdTable::lookupIdx(const char *buf, int len) const { return len > theLookupPos ? xord(toupper(buf[theLookupPos])) : -1; } #endif