#include "Lexesis/re.h"
#include "RegexLexer.h"

// NOTE(review): the header names below and every template argument in this
// file were reconstructed from usage (the text between angle brackets had been
// lost). Confirm them against Lexesis/re.h, in particular the base type `RE`
// and the constructor signatures of the *RE classes.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <memory>
#include <set>
#include <sstream>
#include <stack>
#include <string>
#include <vector>

using namespace std;

namespace lxs {

    /**
     * Textual form of the empty language
     */
    string EmptyRE::toRe() {
        return "∅";
    }

    /**
     * Add the empty language to `enfa`, starting in state `attach`.
     *
     * `attach + 1` becomes the only accepting state; no transition leads to it.
     *
     * @return the accepting state, `attach + 1`
     */
    State EmptyRE::toENFA(ENFA& enfa, State attach) {
        // Never shrink the state count, same as every other node type does
        enfa.numStates = std::max(attach + 1, enfa.numStates);
        enfa.accepting.clear();
        enfa.accepting.insert(attach + 1);
        return attach + 1;
    }

    /**
     * Textual form of the empty string
     */
    string EpsilonRE::toRe() {
        return "ε";
    }

    /**
     * Add an epsilon transition `attach -> attach + 1` and make `attach + 1`
     * the only accepting state.
     *
     * @return the accepting state, `attach + 1`
     */
    State EpsilonRE::toENFA(ENFA& enfa, State attach) {
        enfa.numStates = std::max(attach + 1, enfa.numStates);
        enfa.accepting.clear();
        enfa.accepting.insert(attach + 1);
        enfa.epsilonTransitions[attach].insert(attach + 1);
        return attach + 1;
    }

    /**
     * Textual form of a single character: the character itself
     */
    string SingleRE::toRe() {
        return string(1, c);
    }

    /**
     * Add a transition on `c` from `attach` to `attach + 1` and make
     * `attach + 1` the only accepting state.
     *
     * @return the accepting state, `attach + 1`
     */
    State SingleRE::toENFA(ENFA& enfa, State attach) {
        enfa.numStates = std::max(attach + 1, enfa.numStates);
        enfa.accepting.clear();
        enfa.accepting.insert(attach + 1);
        enfa.delta[attach][c].insert(attach + 1);
        return attach + 1;
    }

    /**
     * Textual form of a character set: all members between square brackets
     */
    string MultiRE::toRe() {
        return "[" + string(chars.begin(), chars.end()) + "]";
    }

    /**
     * Add one transition `attach -> attach + 1` per character in `chars` and
     * make `attach + 1` the only accepting state.
     *
     * @return the accepting state, `attach + 1`
     */
    State MultiRE::toENFA(ENFA& enfa, State attach) {
        enfa.numStates = std::max(attach + 1, enfa.numStates);
        enfa.accepting.clear();
        enfa.accepting.insert(attach + 1);
        for (char c : chars) {
            enfa.delta[attach][c].insert(attach + 1);
        }
        return attach + 1;
    }

    /**
     * Textual form of a concatenation: both operands back to back
     */
    string ConcatRE::toRe() {
        return e->toRe() + f->toRe();
    }

    /**
     * Build `e` starting at `attach`, link its end state `a` to `a + 1` with
     * an epsilon transition and build `f` starting at `a + 1`.
     *
     * @return the end state reported by `f`
     */
    State ConcatRE::toENFA(ENFA& enfa, State attach) {
        State a = e->toENFA(enfa, attach);
        enfa.epsilonTransitions[a].insert(a + 1);
        return f->toENFA(enfa, a + 1);
    }

    /**
     * Textual form of a kleene star: the parenthesized operand followed by `*`
     */
    string StarRE::toRe() {
        return "(" + e->toRe() + ")*";
    }

    /**
     * Build `e` between `attach + 1` and its end state `a`, then wrap it with
     * epsilon transitions so that it can be skipped or repeated.
     *
     * @return the new accepting state, `a + 1`
     */
    State StarRE::toENFA(ENFA& enfa, State attach) {
        State a = e->toENFA(enfa, attach + 1);
        enfa.numStates = std::max(a + 1, enfa.numStates);
        enfa.accepting.clear();
        enfa.accepting.insert(a + 1);
        enfa.epsilonTransitions[attach].insert(attach + 1); // enter the operand
        enfa.epsilonTransitions[attach].insert(a + 1);      // skip the operand
        enfa.epsilonTransitions[a].insert(attach + 1);      // repeat the operand
        enfa.epsilonTransitions[a].insert(a + 1);           // leave the operand
        return a + 1;
    }

    /**
     * Textual form of an alternation: `(e|f)`
     */
    string PlusRE::toRe() {
        return "(" + e->toRe() + "|" + f->toRe() + ")";
    }

    /**
     * Build `e` between `attach + 1` and `a`, build `f` between `a + 1` and
     * `b`, and join both branches in the new accepting state `b + 1`.
     *
     * @return the new accepting state, `b + 1`
     */
    State PlusRE::toENFA(ENFA& enfa, State attach) {
        State a = e->toENFA(enfa, attach + 1);
        State b = f->toENFA(enfa, a + 1);
        enfa.numStates = std::max(enfa.numStates, b + 1);
        enfa.epsilonTransitions[attach].insert(attach + 1); // into the first branch
        enfa.epsilonTransitions[attach].insert(a + 1);      // into the second branch
        enfa.epsilonTransitions[a].insert(b + 1);           // out of the first branch
        enfa.epsilonTransitions[b].insert(b + 1);           // out of the second branch
        enfa.accepting.clear();
        enfa.accepting.insert(b + 1);
        return b + 1;
    }

namespace {
    /**
     * Take the two top elements from `stk` and combine them with a ConcatRE
     *
     * `stk` must hold at least two elements.
     */
    void compress(stack<std::shared_ptr<RE>>& stk)
    {
        std::shared_ptr<RE> a = stk.top();
        stk.pop();
        std::shared_ptr<RE> b = stk.top();
        stk.pop();
        stk.push(std::make_shared<ConcatRE>(b, a)); //Attention: reversed order because of stack
    }

    /**
     * Set the top of the stack aside, apply compress to everything below it
     * until a single RE remains there, then put the top back.
     *
     * Afterwards the stack holds at most two elements. The top is kept apart
     * so that a following postfix operator still applies to it alone.
     */
    void compactStack(stack<std::shared_ptr<RE>>& stk)
    {
        if (stk.empty()) return;
        std::shared_ptr<RE> tp = stk.top();
        stk.pop();
        while (stk.size() >= 2) {
            compress(stk);
        }
        stk.push(tp);
    }

    /**
     * Reduce a stack holding one or two expressions to its final expression.
     *
     * @return the resulting expression, or a null pointer when the stack holds
     *         neither one nor two elements
     */
    std::shared_ptr<RE> reduceToSingle(stack<std::shared_ptr<RE>>& stk)
    {
        if (stk.size() == 2) compress(stk);
        if (stk.size() == 1) return stk.top();
        return nullptr;
    }

    /**
     * Parse a character class
     *
     * `input` is the complete token text, including the enclosing brackets.
     * A leading `^` inverts the class. A `]` directly after the opening
     * bracket (or after the `^`) and a `-` at either end are literals. `a-b`
     * in between is a range over byte values.
     *
     * A `-` that cannot form a complete range (no pending start, or nothing
     * left before the closing bracket) is taken literally.
     */
    std::shared_ptr<RE> parseCharacterClass(const string& input)
    {
        std::set<char> used_chars;
        bool invert = false;
        std::size_t start = 1;
        std::size_t end = input.size() - 1;

        if (input[1] == '^') {
            invert = true;
            start = 2;
        }
        if (input[start] == ']') {
            used_chars.insert(']');
            ++start;
        }
        if (input[start] == '-') {
            used_chars.insert('-');
            ++start;
        }
        if (input[end - 1] == '-') {
            used_chars.insert('-');
            --end;
        }

        // Byte value of a pending range start, -1 when there is none.
        // Bytes go through unsigned char so that values above 0x7F are neither
        // negative nor equal to the sentinel.
        int range_start = -1;
        for (std::size_t idx = start; idx < end; ++idx) {
            const int cur = static_cast<unsigned char>(input[idx]);
            if (cur == '-') {
                if (range_start >= 0 && idx + 1 < end) {
                    ++idx;
                    const int range_end = static_cast<unsigned char>(input[idx]);
                    for (int i = range_start; i <= range_end; ++i) {
                        used_chars.insert(static_cast<char>(i));
                    }
                } else {
                    // Incomplete range: keep what was written as literals
                    if (range_start >= 0) used_chars.insert(static_cast<char>(range_start));
                    used_chars.insert('-');
                }
                range_start = -1;
            } else if (idx + 1 < end && input[idx + 1] == '-') {
                range_start = cur;
            } else {
                used_chars.insert(input[idx]);
            }
        }

        std::vector<char> chars;
        for (int i = 0; i < 256; ++i) {
            const bool listed = used_chars.count(static_cast<char>(i)) > 0;
            if (invert != listed) chars.push_back(static_cast<char>(i));
        }
        return std::make_shared<MultiRE>(chars);
    }

    /**
     * Return the RE for the `.` pattern: everything except a newline
     */
    std::shared_ptr<RE> dotChar() {
        std::vector<char> any;
        for (int i = 0; i < 256; ++i) {
            if (static_cast<char>(i) != '\n') //Dot matches anything except newlines
                any.push_back(static_cast<char>(i));
        }
        return std::make_shared<MultiRE>(any);
    }

    /**
     * Parse the actual regex
     *
     * @param lex the token source
     * @param exit_by_closed_paren set to true when this call returned because
     *        it consumed a closing parenthesis, false otherwise
     * @param inside_parens whether a closing parenthesis is acceptable here
     * @return the parsed expression
     * @throws SyntaxError when the token stream is not a valid regex
     */
    std::shared_ptr<RE> parseRE(RegexLexer& lex, bool& exit_by_closed_paren, bool inside_parens=false) {
        stack<std::shared_ptr<RE>> stk;
        try {
            while (true) {
                exit_by_closed_paren = false;
                RegexLexer::Token tok = lex.nextToken();
                std::shared_ptr<RE> n;
                switch (tok.type) {
                    case RegexLexer::TAB:
                        stk.push(std::make_shared<SingleRE>('\t'));
                        break;
                    case RegexLexer::NEWLINE:
                        stk.push(std::make_shared<SingleRE>('\n'));
                        break;
                    case RegexLexer::CARRIAGE_RETURN:
                        stk.push(std::make_shared<SingleRE>('\r'));
                        break;
                    case RegexLexer::BACKSPACE:
                        stk.push(std::make_shared<SingleRE>('\b'));
                        break;
                    case RegexLexer::SPACE:
                        stk.push(std::make_shared<SingleRE>(' '));
                        break;
                    case RegexLexer::BELL:
                        stk.push(std::make_shared<SingleRE>('\a'));
                        break;
                    case RegexLexer::FORMFEED:
                        stk.push(std::make_shared<SingleRE>('\f'));
                        break;
                    case RegexLexer::VTAB:
                        stk.push(std::make_shared<SingleRE>('\v'));
                        break;
                    case RegexLexer::BACKSLASH:
                    case RegexLexer::ESCAPED_STAR:
                    case RegexLexer::ESCAPED_PLUS:
                    case RegexLexer::ESCAPED_PIPE:
                    case RegexLexer::ESCAPED_LPAREN:
                    case RegexLexer::ESCAPED_RPAREN:
                    case RegexLexer::ESCAPED_LBRACKET:
                    case RegexLexer::ESCAPED_RBRACKET:
                    case RegexLexer::ESCAPED_QUESTIONMARK:
                    case RegexLexer::ESCAPED_DOT:
                        // The second character of the token content is used
                        stk.push(std::make_shared<SingleRE>(tok.content[1]));
                        break;
                    case RegexLexer::DOT:
                        stk.push(dotChar());
                        break;
                    case RegexLexer::STAR:
                        if (stk.empty())
                            throw SyntaxError(lex, "Cannot apply kleene star to empty regex");
                        n = std::make_shared<StarRE>(stk.top());
                        stk.pop();
                        stk.push(n);
                        break;
                    case RegexLexer::PLUS:
                        if (stk.empty())
                            throw SyntaxError(lex, "Cannot apply kleene plus to empty regex");
                        n = stk.top();
                        stk.pop();
                        // x+ is built as x followed by x*
                        n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n));
                        stk.push(n);
                        break;
                    case RegexLexer::QUESTIONMARK:
                        if (stk.empty())
                            throw SyntaxError(lex, "Cannot apply '?' to empty regex");
                        // x? is built as the choice between x and the empty string
                        n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>());
                        stk.pop();
                        stk.push(n);
                        break;
                    case RegexLexer::PIPE: {
                        if (stk.empty())
                            throw SyntaxError(lex, "Invalid regex: nothing to the left of '|'");
                        if (stk.size() > 1) {
                            compactStack(stk);
                            compress(stk);
                        }
                        // Everything parsed so far is the left operand; the
                        // right operand is whatever the recursive call yields
                        std::shared_ptr<RE> left = stk.top();
                        stk.pop();
                        std::shared_ptr<RE> right = parseRE(lex, exit_by_closed_paren, inside_parens);
                        stk.push(std::make_shared<PlusRE>(left, right));
                        if (exit_by_closed_paren) {
                            // The recursive call consumed our closing parenthesis
                            n = reduceToSingle(stk);
                            if (!n) throw SyntaxError(lex, "Invalid regex");
                            return n;
                        }
                        break;
                    }
                    case RegexLexer::LPAREN:
                        n = parseRE(lex, exit_by_closed_paren, true);
                        if (!exit_by_closed_paren) {
                            throw SyntaxError(lex, "Unclosed parenthesis");
                        }
                        stk.push(n);
                        break;
                    case RegexLexer::RPAREN:
                        if (!inside_parens)
                            throw SyntaxError(lex, "Unopened parenthesis");
                        exit_by_closed_paren = true;
                        n = reduceToSingle(stk);
                        if (!n) throw SyntaxError(lex, "Could not parse regex, nothing inside parentheses");
                        return n;
                    case RegexLexer::CHAR:
                        stk.push(std::make_shared<SingleRE>(tok.content[0]));
                        break;
                    case RegexLexer::CHAR_CLASS:
                        stk.push(parseCharacterClass(tok.content));
                        break;
                    case RegexLexer::ERROR:
                        throw SyntaxError(lex, "Error on character: " + tok.content);
                    case RegexLexer::ignore:
                    case RegexLexer::nonmatching:
                        //Just ignore these
                        break;
                }
                compactStack(stk);
            }
        } catch (const RegexLexer::NoMoreTokens&) {
            // End of input is reported by the lexer through this exception
            std::shared_ptr<RE> result = reduceToSingle(stk);
            if (!result) throw SyntaxError(lex, "Could not parse regex");
            return result;
        }
    }
}

    /**
     * Parse `input` as a regular expression
     *
     * @return the parsed expression
     * @throws SyntaxError when `input` is not a valid regex
     */
    std::shared_ptr<RE> parseRE(const string& input) {
        std::istringstream inputstream(input);
        RegexLexer lex(inputstream);
        bool exit_by_closed_paren = false;
        std::shared_ptr<RE> res = parseRE(lex, exit_by_closed_paren);
        return res;
    }
}