/* Web Polygraph http://www.web-polygraph.org/
* (C) 2003-2006 The Measurement Factory
* Licensed under the Apache License, Version 2.0 */
#ifndef POLYGRAPH__XSTD_STRIDENTIFIER_H
#define POLYGRAPH__XSTD_STRIDENTIFIER_H
#include <ctype.h>
#include "xstd/String.h"
#include "xstd/Array.h"
#include "xstd/gadgets.h"
// StrIdentifier is a String->int map.
//
// StrIdentifier look-up method checks if a given buffer starts
// with one of the "known" strings. If it does, an "id" of the
// corresponding string is returned.
// No "known" string can be prefix of the other -- we do not
// want to search for the longest prefix and such...
// A buffer is not expected to be zero-terminated.
// The search algorithm is very efficient. There is probably
// no optimal static (no caching, re-grouping) algorithm.
// note: the search tree does not de-allocate its parts on destruction
class StrIdentifier; // the user interface class, declared last
class StrIdTable;
class String;
// a node on the search tree (leaf or not)
class StrIdNode {
public:
StrIdNode();
bool dead() const { return theId == 0; }
bool leaf() const { return theId > 0; }
int id() const { return theId; }
int lookup(const char *buf, int len) const;
void add(String *str, int id, int myLevel);
void optimize();
void report(ostream &os, int myLevel) const;
protected:
union {
StrIdTable *theTab; // lookup table (intermediate nodes)
String *theStr; // known-string (leaf nodes)
} u;
int theId; // id > 0 => leaf, id < 0 => intermed
};
// collection of nodes indexed by a char at a given search pos
class StrIdTable {
public:
StrIdTable(int aPos);
int lookup(const char *buf, int len) const;
void add(String *str, int id);
void optimize();
bool single(StrIdNode &n);
void report(ostream &os, int myLevel) const;
protected:
inline int lookupIdx(const String &str) const;
// returns -1 if len <= theLookupPos
inline int lookupIdx(const char *buf, int len) const;
protected:
Array<StrIdNode> theNodes;
int theLookupPos;
};
// iterator for StrIdentifier
class StrIdentifierIter {
public:
StrIdentifierIter(const StrIdentifier &anIdfr);
operator void*() const { return atEnd() ? (void*)0 : (void*)-1; }
bool atEnd() const;
StrIdentifierIter &operator ++();
const String &str() const;
int id() const;
protected:
void sync();
protected:
const StrIdentifier &theIdfr;
int thePos;
};
// user interface: build(add everything), optimize, look-up.
class StrIdentifier {
public:
friend class StrIdentifierIter;
typedef StrIdentifierIter Iter;
public:
StrIdentifier();
~StrIdentifier();
int count() const { return theCount; }
int lookup(const char *buf, int len) const;
int lookup(const String &str) const;
const String &string(int id) const { return *theStrings[id]; }
Iter iterator() const;
// to be safe, use one or the other of a given class:
int add(const String &str); // next available id
int add(const String &str, int id); // small user-specified id
// call this after all additions
void optimize();
// prints a search tree
void report(ostream &os) const;
protected:
int doAdd(const String &str, int id);
protected:
Array<String *> theStrings; // reverse index (id -> str)
StrIdNode theHead;
int theCount;
int theLastId;
bool isLocked; // additions are fatal, set in optimize()
};
/* inlined methods */
inline
int StrIdTable::lookupIdx(const String &str) const {
return str.len() > theLookupPos ?
xord(toupper(str[theLookupPos])) : theLookupPos;
}
inline
int StrIdTable::lookupIdx(const char *buf, int len) const {
return len > theLookupPos ?
xord(toupper(buf[theLookupPos])) : -1;
}
#endif
syntax highlighted by Code2HTML, v. 0.9.1