/* Web Polygraph       http://www.web-polygraph.org/
 * (C) 2003-2006 The Measurement Factory
 * Licensed under the Apache License, Version 2.0 */

#ifndef POLYGRAPH__XSTD_STRIDENTIFIER_H
#define POLYGRAPH__XSTD_STRIDENTIFIER_H

#include <ctype.h>
#include "xstd/String.h"
#include "xstd/Array.h"
#include "xstd/gadgets.h"


// StrIdentifier is a String->int map.
//
// StrIdentifier look-up method checks if a given buffer starts
// with one of the "known" strings. If it does, an "id" of the
// corresponding string is returned.
// No "known" string can be prefix of the other -- we do not
// want to search for the longest prefix and such...
// A buffer is not expected to be zero-terminated.

// The search algorithm is very efficient. There is probably
// no optimal static (no caching, re-grouping) algorithm.

// note: the search tree does not de-allocate its parts on destruction

class StrIdentifier; // the user interface class, declared last

class StrIdTable;
class String;

// a node on the search tree (leaf or not)
class StrIdNode {
	public:
		StrIdNode();

		bool dead() const { return theId == 0; }
		bool leaf() const { return theId > 0; }
		int id() const { return theId; }

		int lookup(const char *buf, int len) const;
		void add(String *str, int id, int myLevel);

		void optimize();

		void report(ostream &os, int myLevel) const;

	protected:
		union {
			StrIdTable *theTab; // lookup table (intermediate nodes)
			String *theStr;     // known-string (leaf nodes)
		} u;
		int theId;              // id > 0 => leaf, id < 0 => intermed
};


// collection of nodes indexed by a char at a given search pos
class StrIdTable {
	public:
		StrIdTable(int aPos);

		int lookup(const char *buf, int len) const;
		void add(String *str, int id);

		void optimize();
		bool single(StrIdNode &n);

		void report(ostream &os, int myLevel) const;

	protected:
		inline int lookupIdx(const String &str) const;
		// returns -1 if len <= theLookupPos
		inline int lookupIdx(const char *buf, int len) const;
		
	protected:
		Array<StrIdNode> theNodes;
		int theLookupPos;
};


// iterator for StrIdentifier
class StrIdentifierIter {
	public:
		StrIdentifierIter(const StrIdentifier &anIdfr);

		operator void*() const { return atEnd() ? (void*)0 : (void*)-1; }
		bool atEnd() const;
		StrIdentifierIter &operator ++();

		const String &str() const;
		int id() const;

	protected:
		void sync();

	protected:
		const StrIdentifier &theIdfr;
		int thePos;
};

// user interface: build(add everything), optimize, look-up.
class StrIdentifier {
	public:
		friend class StrIdentifierIter;
		typedef StrIdentifierIter Iter;

	public:
		StrIdentifier();
		~StrIdentifier();

		int count() const { return theCount; }
		int lookup(const char *buf, int len) const;
		int lookup(const String &str) const;
		const String &string(int id) const { return *theStrings[id]; }

		Iter iterator() const;

		// to be safe, use one or the other of a given class:
		int add(const String &str);          // next available id
		int add(const String &str, int id);  // small user-specified id

		// call this after all additions
		void optimize();

		// prints a search tree
		void report(ostream &os) const;

	protected:
		int doAdd(const String &str, int id);

	protected:
		Array<String *> theStrings; // reverse index (id -> str)
		StrIdNode theHead;
		int theCount;
		int theLastId;

		bool isLocked; // additions are fatal, set in optimize()
};


/* inlined methods */

inline 
int StrIdTable::lookupIdx(const String &str) const {
	return str.len() > theLookupPos ?
		xord(toupper(str[theLookupPos])) : theLookupPos;
}

inline
int StrIdTable::lookupIdx(const char *buf, int len) const {
	return len > theLookupPos ?
		xord(toupper(buf[theLookupPos])) : -1;
}

#endif


syntax highlighted by Code2HTML, v. 0.9.1