// // C++ Implementation: regexpstatebuilder.cpp // // Description: Builds the regexp automaton // // // Author: Lorenzo Bettini, 2007, http://www.lorenzobettini.it // // Copyright: See COPYING file that comes with this distribution // // #include "regexpstatebuilder.H" #include #include "langelems.h" #include "statelangelem.h" #include "stringlistlangelem.h" #include "delimitedlangelem.h" #include "namedsubexpslangelem.h" #include "regexpstate.h" #include "stringdef.h" #include "tostringcollection.h" #include "regexpreprocessor.h" #include "keys.h" #include "messages.h" using namespace std; static const string buildex(const string &s); void add_exp(RegExpStatePtr state, const string &exp, ParserInfo *parserInfo, RegExpFormatterPtr f) { unsigned int numOfSubexpressions = RegexPreProcessor::num_of_subexpressions(exp); if (numOfSubexpressions) { // for marked subexpressions we must not use buildex, otherwise we might change // the subexpressions indexes (e.g., for backreferences) state->add_alternative(exp, parserInfo, f); } else { state->add_exp(buildex(exp), parserInfo, f); // record that we (manually) added an explicit marked subexpression state->setHasMarkedAlternatives(); } } /** * Definitely associate the regular expression to this state * @param state * @return */ bool freeze_state(RegExpStatePtr state) { bool ok = true; try { state->freeze(); } catch (boost::bad_expression &e) { ok = false; } return ok; } RegExpStateBuilder::RegExpStateBuilder() { } RegExpStateBuilder::~RegExpStateBuilder() { } void setFormatterExitLevel(StateStartLangElem *elem, RegExpFormatterPtr formatter) { bool exit_all = elem->exitAll(); bool exit = elem->doExit(); /* only act on the exit state (if any exist statement is defined) */ if (exit_all) { formatter->exit_state_level = 1; formatter->exit_all = true; } if (exit) formatter->exit_state_level = 1; } RegExpStatePtr RegExpStateBuilder::build(LangElems *elems) { if (! elems) return RegExpStatePtr(); RegExpStatePtr state(new RegExpState); state->set_default_formatter(RegExpFormatterPtr(new RegExpFormatter(NORMAL))); build(elems, state); return state; } void RegExpStateBuilder::build(LangElems *elems, RegExpStatePointer state) { if (elems) for (LangElems::const_iterator it = elems->begin(); it != elems->end(); ++it) build_DB(*it, state); bool problems = !freeze_state(state); if (!problems) return; if (elems) { // try to find out where the problem is... RegExpStatePtr temp_state(new RegExpState()); for (LangElems::const_iterator it = elems->begin(); it != elems->end(); ++it) { build_DB(*it, temp_state); try { temp_state->freeze(); } catch (boost::bad_expression &e) { exitError("problem in this expression: " + (*it)->toStringOriginal(), *it); } } } else { foundBug("bug in expression parsing", __FILE__, __LINE__); } } void RegExpStateBuilder::build(LangElem *elem, RegExpStatePointer state) { } /** * Build a non-marking group (i.e., (? ... ) starting from s * @param s * @return */ const string non_marking_group(const string &s) { return "(?:" + s + ")"; } /** * Build a subexpression starting from s * @param s * @return */ const string buildex(const string &s) { return "(" + s + ")"; } /** * Build a regular expression that must be matched * with a null string before and after (operator \< and \>) * @param s * @return */ const string buildex_isolated(const string &s) { return "\\<" + s + "\\>"; } /** * An expression is isolated basically if it is an alphanumerical * string * TODO check whether this is actually correct in principle * @param s * @return */ bool is_to_isolate(const string &s) { if (s.size()) { if ((isalnum(s[0]) || s[0] == '_') && (isalnum(s[s.size()-1]) || s[s.size()-1] == '_')) return true; } return false; } int escaped_string_size(const string &s) { if (s.size() && s[0] == '\\') return s.size() - 1; return s.size(); } /** * Case of a list of alternatives * @param elem * @param state */ void RegExpStateBuilder::build(StringListLangElem *elem, RegExpStatePointer state) { const string &name = elem->getName(); StringDefs *alternatives = elem->getAlternatives(); string stringdef = toStringCollection(alternatives, '|'); //printMessage("building " + name + " " + stringdef); // check this at this point, since, stringdef might be modified // e.g., if it must be case insensitive bool isToIsolate = is_to_isolate(stringdef); if (!elem->isCaseSensitive()) stringdef = RegexPreProcessor::make_nonsensitive(stringdef); string exp_string = non_marking_group(stringdef); if (isToIsolate) exp_string = buildex_isolated(exp_string); RegExpFormatterPtr formatter(new RegExpFormatter(name)); add_exp(state, exp_string, elem, formatter); build(static_cast(elem), state); } /** * Case of a list of language elements, each representing a * marked subexpression * @param elem * @param state */ void RegExpStateBuilder::build(NamedSubExpsLangElem *elem, RegExpStatePointer state) { const ElementNames *elems = elem->getElementNames(); const StringDef *regexp = elem->getRegexpDef(); format_vector formatters; const string ®exp_string = regexp->toString(); // first check that the number of marked subexpressions is the same of // the specified element names subexpressions_info sexps = RegexPreProcessor::num_of_marked_subexpressions(regexp_string); if (sexps.errors.size()) { exitError(sexps.errors, elem); } if (sexps.marked != elems->size()) { exitError("number of marked subexpressions does not match number of elements", elem); } // for each named group build a formatter, that corresponds to that element for (ElementNames::const_iterator it = elems->begin(); it != elems->end(); ++it) { RegExpFormatterPtr formatter = RegExpFormatterPtr(new RegExpFormatter(*it)); // each formatter will share the same exit level, since it represents the // same matched regexp setFormatterExitLevel(elem, formatter); formatters.push_back(formatter); } // now add all the formatters for this element state->add_exp(regexp_string, elem, formatters); // record that all the subexpressions can match state->setAllAlternativesCanMatch(); } /** * Case of a delimited element * @param elem * @param state */ void RegExpStateBuilder::build(DelimitedLangElem *elem, RegExpStatePointer state) { RegExpStatePtr inner; const string &name = elem->getName(); StringDef *start = elem->getStart(); StringDef *end = elem->getEnd(); StringDef *escape = elem->getEscape(); string start_string; if (start) start_string = start->toString(); string exp_string = start_string; string end_string; if (end) end_string = end->toString(); string escape_string; if (escape) escape_string = escape->toString(); //printMessage("building " + name + " " + start_string + " " + end_string); if (! elem->getStateLangElem() && ! elem->isMultiline() && escaped_string_size(start_string) == 1 && escaped_string_size(end_string) == 1) { /* in case the expression is not the start element of a State/Environment and it must not spawn multiple lines, and the delimiters are only one character, build a regular expression of the shape (everything but delimiters) For instance if delimiters are "<" and ">" the built regular expression is "<(?:[^<>])*>" */ if (!escape) { exp_string = start_string + non_marking_group("[^" + start_string + (end_string != start_string ? end_string : "") + "]") + "*" + end_string; } else { /* in case of a specified escape character it will use it for the (everything but delimiters) part. For instace, if in the example above the escape character is the backslash, the generated expression is <(?:[^\\<\\>]|\\.)*> */ exp_string = start_string + non_marking_group("[^" + escape_string + start_string + (end_string != start_string ? escape_string + end_string : "") + "]|"+ escape_string + ".") + "*" + end_string; } } else { /* Otherwise we cannot simply build a regular expression as above but we must build more states of the automaton: if we match the start delimiter we enter a new state, called here "inner" */ inner = RegExpStatePtr(new RegExpState); // for internal elements nested_states.push_back(inner); /* Since this is a delimited element, everything inside this element, that does not match anything else, must be formatted in the same way of this element. */ inner->set_default_formatter(RegExpFormatterPtr(new RegExpFormatter(name))); /* We exit from this state when we match the end delimiter (or the end of buffer if no end delimiter was specified). For instance, consider this definition comment delim "[*" "*]" The inner state will contain the regular expression \*\] and when matched it will exit from the inner state. Notice that if this element has been specified with "exit" we must increment the synthetized "exit" RegExpFormatter, since it must exit the inner state and the state this element belongs to. */ RegExpFormatterPtr exit (new RegExpFormatter(name, RegExpStatePtr(), 1 + (elem->doExit() ? 1 : 0), elem->exitAll())); if (end) add_exp(inner, end_string, elem, exit); else inner->add_exp(buildex("\\z"), elem, exit); /* If an escape character was specified we must match also everything that is prefixed with the escape character. For instance, comment delim "[*" "*]" escape "\\" will generate the inner state (\*\])|(\\.) */ if (escape) { add_exp(inner, escape_string + ".", elem, RegExpFormatterPtr(new RegExpFormatter(name))); } /* If the delimited element can be nested (e.g., C comments) we must deal with counted occurrences of start delimiter and end delimiter. We thus create a "nested" formatter, that has as the next state the inner we saw above. We then add to the "inner" state above the start delimiter expression, corresponding to the "nested" formatter. This will implement the stack of occurrences. For instance, consider comment delim "[*" "*]" nested The inner state will have these expressions (\*\])|(\[\*) in case it matches the first one it will exit, in case it matches the second it will enter the same inner state */ if (elem->isNested()) { RegExpFormatterPtr nested(new RegExpFormatter(name, inner)); nested_formatters.push_back(nested); add_exp(inner, start_string, elem, nested); } } if (inner) { if (!freeze_state(inner)) foundBug("bug in expression parsing", __FILE__, __LINE__);; } RegExpFormatterPtr formatter(new RegExpFormatter(name, inner)); add_exp(state, exp_string, elem, formatter); build(static_cast(elem), state); } /** * (General) case for a lang element that can start a new * State/Environment. * @param elem * @param state */ void RegExpStateBuilder::build(StateStartLangElem *elem, RegExpStatePointer state) { RegExpFormatterPtr formatter = state->getLastFormatter(); setFormatterExitLevel(elem, formatter); } /** * Case of an element that makes a new State/Environment * @param elem * @param state */ void RegExpStateBuilder::build(StateLangElem *elem, RegExpStatePointer state) { StateStartLangElem *statestart = elem->getStateStart(); /* First act on the element that defines this new State */ build_DB(statestart, state); /* The last formatter corresponds to the formatter added for the element that defines this state, i.e., statestart */ RegExpFormatterPtr last = state->getLastFormatter(); /* We must make sure that this formatter has a next state, since we will use it to populate it with the elements of this state. For instance, a StringListLangElem does not have a next state, so we must create one. */ if (!last->getNextState()) { last->setNextState(RegExpStatePtr(new RegExpState)); } /* Use the last formatter next state to populate it with the elements of this State/Environment element */ RegExpStatePtr inner(last->getNextState()); /* If it's a State then the default formatter corresponds to NORMAL, otherwise (Environment) the default formatter is the same of the element itself. */ if (elem->isState()) inner->set_default_formatter(RegExpFormatterPtr(new RegExpFormatter(NORMAL))); else inner->set_default_formatter (RegExpFormatterPtr(new RegExpFormatter(last->elem))); LangElems *elems = elem->getElems(); build(elems, inner); }