#ifndef _PCRE_RML_H #define _PCRE_RML_H #include #include "../common/utilit.h" #include extern void RmlPcreMakeTables(vector& character_table, MorphLanguageEnum Langua); // We convert user-passed pointers into special Arg objects typedef pcrecpp::Arg Arg; extern Arg no_arg; // Interface for regular expression matching. Also corresponds to a // pre-compiled regular expression. An "RE" object is safe for // concurrent use by multiple threads. class RML_RE { public: // We provide implicit conversions from strings so that users can // pass in a string or a "const char*" wherever an "RE" is expected. RML_RE(const char* pat) { Init(pat, NULL); } RML_RE(const char *pat, const pcrecpp::RE_Options& option) { Init(pat, &option); } RML_RE(const string& pat) { Init(pat.c_str(), NULL); } RML_RE(const string& pat, const pcrecpp::RE_Options& option) { Init(pat.c_str(), &option); } // actually used by DDC RML_RE(const string& pat, const vector& RegExpTables) { Init(pat.c_str(), NULL, &RegExpTables[0]); } ~RML_RE(); // The string specification for this RE. E.g. // RE re("ab*c?d+"); // re.pattern(); // "ab*c?d+" const string& pattern() const { return pattern_; } // If RE could not be created properly, returns an error string. // Else returns the empty string. const string& error() const { return *error_; } /***** The useful part: the matching interface *****/ // This is provided so one can do pattern.ReplaceAll() just as // easily as ReplaceAll(pattern-text, ....) typedef pcrecpp::StringPiece StringPiece; typedef pcrecpp::RE_Options RE_Options; bool FullMatch(const StringPiece& text, const Arg& ptr1 = no_arg, const Arg& ptr2 = no_arg, const Arg& ptr3 = no_arg, const Arg& ptr4 = no_arg, const Arg& ptr5 = no_arg, const Arg& ptr6 = no_arg, const Arg& ptr7 = no_arg, const Arg& ptr8 = no_arg, const Arg& ptr9 = no_arg, const Arg& ptr10 = no_arg, const Arg& ptr11 = no_arg, const Arg& ptr12 = no_arg, const Arg& ptr13 = no_arg, const Arg& ptr14 = no_arg, const Arg& ptr15 = no_arg, const Arg& ptr16 = no_arg) const; bool PartialMatch(const StringPiece& text, const Arg& ptr1 = no_arg, const Arg& ptr2 = no_arg, const Arg& ptr3 = no_arg, const Arg& ptr4 = no_arg, const Arg& ptr5 = no_arg, const Arg& ptr6 = no_arg, const Arg& ptr7 = no_arg, const Arg& ptr8 = no_arg, const Arg& ptr9 = no_arg, const Arg& ptr10 = no_arg, const Arg& ptr11 = no_arg, const Arg& ptr12 = no_arg, const Arg& ptr13 = no_arg, const Arg& ptr14 = no_arg, const Arg& ptr15 = no_arg, const Arg& ptr16 = no_arg) const; bool Consume(StringPiece* input, const Arg& ptr1 = no_arg, const Arg& ptr2 = no_arg, const Arg& ptr3 = no_arg, const Arg& ptr4 = no_arg, const Arg& ptr5 = no_arg, const Arg& ptr6 = no_arg, const Arg& ptr7 = no_arg, const Arg& ptr8 = no_arg, const Arg& ptr9 = no_arg, const Arg& ptr10 = no_arg, const Arg& ptr11 = no_arg, const Arg& ptr12 = no_arg, const Arg& ptr13 = no_arg, const Arg& ptr14 = no_arg, const Arg& ptr15 = no_arg, const Arg& ptr16 = no_arg) const; bool FindAndConsume(StringPiece* input, const Arg& ptr1 = no_arg, const Arg& ptr2 = no_arg, const Arg& ptr3 = no_arg, const Arg& ptr4 = no_arg, const Arg& ptr5 = no_arg, const Arg& ptr6 = no_arg, const Arg& ptr7 = no_arg, const Arg& ptr8 = no_arg, const Arg& ptr9 = no_arg, const Arg& ptr10 = no_arg, const Arg& ptr11 = no_arg, const Arg& ptr12 = no_arg, const Arg& ptr13 = no_arg, const Arg& ptr14 = no_arg, const Arg& ptr15 = no_arg, const Arg& ptr16 = no_arg) const; bool Replace(const StringPiece& rewrite, string *str) const; int GlobalReplace(const StringPiece& rewrite, string *str) const; bool Extract(const StringPiece &rewrite, const StringPiece &text, string *out) const; /***** Generic matching interface *****/ // Type of match (TODO: Should be restructured as part of RE_Options) enum Anchor { UNANCHORED, // No anchoring ANCHOR_START, // Anchor at start only ANCHOR_BOTH // Anchor at start and end }; // General matching routine. Stores the length of the match in // "*consumed" if successful. bool DoMatch(const StringPiece& text, Anchor anchor, int* consumed, const Arg* const* args, int n) const; // Return the number of capturing subpatterns, or -1 if the // regexp wasn't valid on construction. int NumberOfCapturingGroups(); protected: void Init(const char* pattern, const RE_Options* options); void Init(const char* pattern, const RE_Options* options, const unsigned char *tableptr); // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with // pairs of integers for the beginning and end positions of matched // text. The first pair corresponds to the entire matched text; // subsequent pairs correspond, in order, to parentheses-captured // matches. Returns the number of pairs (one more than the number of // the last subpattern with a match) if matching was successful // and zero if the match failed. // I.e. for RE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching // against "foo", "bar", and "baz" respectively. // When matching RE("(foo)|hello") against "hello", it will return 1. // But the values for all subpattern are filled in into "vec". int TryMatch(const StringPiece& text, int startpos, Anchor anchor, int *vec, int vecsize) const; // Append the "rewrite" string, with backslash subsitutions from "text" // and "vec", to string "out". bool Rewrite(string *out, const StringPiece& rewrite, const StringPiece& text, int *vec, int veclen) const; // internal implementation for DoMatch bool DoMatchImpl(const StringPiece& text, Anchor anchor, int* consumed, const Arg* const args[], int n, int* vec, int vecsize) const; // Compile the regexp for the specified anchoring mode pcre* Compile(Anchor anchor, const unsigned char *tableptr); string pattern_; RE_Options options_; pcre* re_full_; // For full matches pcre* re_partial_; // For partial matches const string* error_; // Error indicator (or points to empty string) int match_limit_; // limit on execution resources // Don't allow the default copy or assignment constructors -- // they're expensive and too easy to do by accident. RML_RE(const RML_RE&); void operator=(const RML_RE&); }; #endif