#include "Lexesis/re.h"

// NOTE(review): the standard headers below are the ones this file actually
// uses; confirm against the project's original include list.
#include <algorithm>
#include <memory>
#include <set>
#include <stack>
#include <string>
#include <vector>

using namespace std;

namespace lxs {

    /**
     * Textual form of the empty language
     */
    string EmptyRE::toRe() {
        return "∅";
    }

    /**
     * Make `attach + 1` the only accepting state without adding any transition
     * to it, and return that state.
     */
    State EmptyRE::toENFA(ENFA& enfa, State attach) {
        // Use max like every other node type so numStates can never shrink.
        enfa.numStates = std::max(attach + 1, enfa.numStates);
        enfa.accepting.clear();
        enfa.accepting.insert(attach + 1);
        return attach + 1;
    }

    /**
     * Textual form of the empty string
     */
    string EpsilonRE::toRe() {
        return "ε";
    }

    /**
     * Connect `attach` to `attach + 1` with an epsilon transition, make
     * `attach + 1` the only accepting state and return it.
     */
    State EpsilonRE::toENFA(ENFA& enfa, State attach) {
        enfa.numStates = std::max(attach + 1, enfa.numStates);
        enfa.accepting.clear();
        enfa.accepting.insert(attach + 1);
        enfa.epsilonTransitions[attach].insert(attach + 1);
        return attach + 1;
    }

    /**
     * Textual form of a single character
     */
    string SingleRE::toRe() {
        return string(1, c);
    }

    /**
     * Connect `attach` to `attach + 1` with a transition on `c`, make
     * `attach + 1` the only accepting state and return it.
     */
    State SingleRE::toENFA(ENFA& enfa, State attach) {
        enfa.numStates = std::max(attach + 1, enfa.numStates);
        enfa.accepting.clear();
        enfa.accepting.insert(attach + 1);
        enfa.delta[attach][c].insert(attach + 1);
        return attach + 1;
    }

    /**
     * Textual form of a character class: all characters between brackets
     */
    string MultiRE::toRe() {
        //FIXME: this does not consider characters that need escaping
        return "[" + string(chars.begin(), chars.end()) + "]";
    }

    /**
     * Connect `attach` to `attach + 1` with one transition per character in
     * `chars`, make `attach + 1` the only accepting state and return it.
     */
    State MultiRE::toENFA(ENFA& enfa, State attach) {
        enfa.numStates = std::max(attach + 1, enfa.numStates);
        enfa.accepting.clear();
        enfa.accepting.insert(attach + 1);
        for (char c : chars) {
            enfa.delta[attach][c].insert(attach + 1);
        }
        return attach + 1;
    }

    /**
     * Textual form of a concatenation: both operands back to back
     */
    string ConcatRE::toRe() {
        return e->toRe() + f->toRe();
    }

    /**
     * Build `e` starting at `attach`, link its end state to a fresh state with
     * an epsilon transition, and build `f` from there.
     * Returns the end state of `f`.
     */
    State ConcatRE::toENFA(ENFA& enfa, State attach) {
        State a = e->toENFA(enfa, attach);
        enfa.epsilonTransitions[a].insert(a + 1);
        return f->toENFA(enfa, a + 1);
    }

    /**
     * Textual form of a kleene star: the parenthesized operand followed by `*`
     */
    string StarRE::toRe() {
        return "(" + e->toRe() + ")*";
    }

    /**
     * Build `e` starting at `attach + 1` and add epsilon transitions so that
     * `e` can be skipped (attach -> end) or repeated (inner end -> inner start).
     * Returns the new end state, which becomes the only accepting state.
     */
    State StarRE::toENFA(ENFA& enfa, State attach) {
        State a = e->toENFA(enfa, attach + 1);
        enfa.numStates = std::max(a + 1, enfa.numStates);
        enfa.accepting.clear();
        enfa.accepting.insert(a + 1);
        enfa.epsilonTransitions[attach].insert(attach + 1);  // enter the loop body
        enfa.epsilonTransitions[attach].insert(a + 1);       // skip the body entirely
        enfa.epsilonTransitions[a].insert(attach + 1);       // repeat the body
        enfa.epsilonTransitions[a].insert(a + 1);            // leave the loop
        return a + 1;
    }

    /**
     * Textual form of an alternation: `(e|f)`
     */
    string PlusRE::toRe() {
        return "(" + e->toRe() + "|" + f->toRe() + ")";
    }

    /**
     * Build `e` starting at `attach + 1` and `f` starting right after the end
     * of `e`; `attach` branches into both with epsilon transitions and both
     * ends are joined in a fresh state, which becomes the only accepting state
     * and is returned.
     */
    State PlusRE::toENFA(ENFA& enfa, State attach) {
        State a = e->toENFA(enfa, attach + 1);
        State b = f->toENFA(enfa, a + 1);
        enfa.numStates = std::max(enfa.numStates, b + 1);
        enfa.epsilonTransitions[attach].insert(attach + 1);  // branch into e
        enfa.epsilonTransitions[attach].insert(a + 1);       // branch into f
        enfa.epsilonTransitions[a].insert(b + 1);            // end of e -> join state
        enfa.epsilonTransitions[b].insert(b + 1);            // end of f -> join state
        enfa.accepting.clear();
        enfa.accepting.insert(b + 1);
        return b + 1;
    }

namespace {
    // NOTE(review): the element type `RE` and the node types passed to
    // std::make_shared are taken from the class names used in this file and
    // from the surrounding comments; confirm against Lexesis/re.h.

    /**
     * Take the two top elements from `stk` and combine them with a ConcatRE
     *
     * The caller must make sure that `stk` holds at least two elements.
     */
    void compress(stack<std::shared_ptr<RE> >& stk) {
        std::shared_ptr<RE> a = stk.top();
        stk.pop();
        std::shared_ptr<RE> b = stk.top();
        stk.pop();
        stk.push(std::make_shared<ConcatRE>(b, a)); //Attention: reversed order because of stack
    }

    /**
     * Apply compress to everything below the top element until at most one RE
     * remains there. The top element itself is kept separate, so that a
     * following postfix operator only applies to that last element.
     */
    void compactStack(stack<std::shared_ptr<RE> >& stk) {
        if (stk.empty()) return;
        std::shared_ptr<RE> tp = stk.top();
        stk.pop();
        while (stk.size() >= 2) {
            compress(stk);
        }
        stk.push(tp);
    }

    /**
     * Get the actual char that should be used when c is placed after a backslash
     *
     * Throws a SyntaxError when `c` is not a known escape character.
     */
    char parseEscapeChar(char c) {
        switch (c) {
            // Meta characters simply stand for themselves
            case '\\':
            case '*':
            case '+':
            case '|':
            case '(':
            case ')':
            case '[':
            case ']':
            case '?':
            case '.':
                break;
            case 'n': c = '\n'; break;
            case 'r': c = '\r'; break;
            case 'b': c = '\b'; break;
            case 't': c = '\t'; break;
            case 's': c = ' '; break;
            case 'a': c = '\a'; break;
            case 'f': c = '\f'; break;
            case 'v': c = '\v'; break;
            default:
                throw SyntaxError(("Invalid escape sequence: \\" + std::string(1, c)).c_str());
        }
        return c;
    }

    /**
     * Parse a character class
     *
     * On entry `idx` points right after the opening `[`, on return it points
     * at the closing `]`. A leading `^` inverts the class; a `]` directly
     * after the (optional) `^`, and a `-` at the start or right before the
     * closing `]`, are taken literally.
     *
     * Throws a SyntaxError when the class is not closed or when a range has
     * no start character.
     */
    std::shared_ptr<RE> parseCharacterClass(const string& input, size_t& idx) {
        if (idx >= input.size())
            throw SyntaxError("Unclosed character class");

        std::set<char> used_chars;
        bool invert = false;
        // Start of a possible range as an unsigned byte value, -1 when there is none.
        int last_char = -1;

        if (input[idx] == '^') {
            invert = true;
            idx++;
        }
        if (idx >= input.size())
            throw SyntaxError("Unclosed character class");

        if (input[idx] == ']') {
            used_chars.insert(']');
            idx++;
            last_char = ']';
        }
        if (idx >= input.size())
            throw SyntaxError("Unclosed character class");

        if (input[idx] == '-') {
            used_chars.insert('-');
            idx++;
            last_char = '-';
        }
        if (idx >= input.size())
            throw SyntaxError("Unclosed character class");

        for (; idx < input.size() && input[idx] != ']'; idx++) {
            if (input[idx] == '-') {
                idx++;
                if (idx >= input.size())
                    throw SyntaxError("Unclosed character class");
                if (input[idx] == ']') {
                    // Trailing '-' is a literal; step back so the loop stops on the ']'
                    used_chars.insert('-');
                    idx--;
                } else {
                    if (last_char == -1)
                        throw SyntaxError("Nothing to apply range to");
                    // Compare as unsigned bytes: with a signed char, values >= 0x80
                    // would be negative and the range would silently be empty.
                    const int range_end = static_cast<unsigned char>(input[idx]);
                    for (int i = last_char + 1; i <= range_end; i++) {
                        used_chars.insert(static_cast<char>(i));
                    }
                    last_char = -1;
                }
            } else {
                used_chars.insert(input[idx]);
                // Go through unsigned char so that byte 0xFF cannot collide with the -1 sentinel
                last_char = static_cast<unsigned char>(input[idx]);
            }
        }
        if (idx >= input.size())
            throw SyntaxError("Unclosed character class");

        std::vector<char> chars;
        for (int i = 0; i < 256; i++) {
            const bool listed = used_chars.count(static_cast<char>(i)) > 0;
            if (invert != listed)
                chars.push_back(static_cast<char>(i));
        }
        return std::make_shared<MultiRE>(chars);
    }

    /**
     * Return the RE for the `.` pattern: everything except a newline
     */
    std::shared_ptr<RE> dotChar() {
        std::vector<char> any;
        for (int i = 0; i < 256; i++) {
            if (static_cast<char>(i) != '\n') //Dot matches anything except newlines
                any.push_back(static_cast<char>(i));
        }
        return std::make_shared<MultiRE>(any);
    }

    /**
     * Parse the actual regex
     *
     * Parsing starts at `idx` and stops either at the end of `input` or on a
     * `)`, in which case `idx` is left pointing at that `)` so the caller can
     * check whether it was expected.
     *
     * compactStack is called after every element, so the stack holds at most
     * two entries: the concatenation of everything parsed so far and the last
     * element, to which a postfix operator can still be applied.
     *
     * Throws a SyntaxError on malformed input.
     */
    std::shared_ptr<RE> parseRE(const string& input, size_t& idx) {
        stack<std::shared_ptr<RE> > stk;
        for (; idx < input.length(); idx++) {
            std::shared_ptr<RE> n;
            switch (input[idx]) {
                case '\\':
                    idx++;
                    if (idx >= input.length())
                        throw SyntaxError("Escape sequence at the end of the string");
                    stk.push(std::make_shared<SingleRE>(parseEscapeChar(input[idx])));
                    break;
                case '[':
                    stk.push(parseCharacterClass(input, ++idx));
                    break;
                case '.':
                    stk.push(dotChar());
                    break;
                case ']':
                    throw SyntaxError("Unopened ']'");
                case '*':
                    if (stk.empty())
                        throw SyntaxError("Cannot apply kleene star to empty regex");
                    n = std::make_shared<StarRE>(stk.top());
                    stk.pop();
                    stk.push(n);
                    break;
                case '+':
                    if (stk.empty())
                        throw SyntaxError("Cannot apply kleene plus to empty regex");
                    // e+ is rewritten as e e*
                    n = stk.top();
                    stk.pop();
                    n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n));
                    stk.push(n);
                    break;
                case '?':
                    if (stk.empty())
                        throw SyntaxError("Cannot apply '?' to empty regex");
                    // e? is rewritten as (e|ε)
                    n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>());
                    stk.pop();
                    stk.push(n);
                    break;
                case '|':
                    if (stk.empty())
                        throw SyntaxError("Invalid regex: nothing to the left of '|'");
                    // Everything parsed so far forms the left operand
                    if (stk.size() > 1) {
                        compactStack(stk);
                        compress(stk);
                    }
                    n = std::make_shared<PlusRE>(stk.top(), parseRE(input, ++idx));
                    stk.pop();
                    stk.push(n);
                    // The recursive call stopped at the end or on a ')': step back so
                    // that the loop increment lands on that position again.
                    idx--;
                    break;
                case '(':
                    n = parseRE(input, ++idx);
                    if (idx >= input.size() || input[idx] != ')')
                        throw SyntaxError("Could not parse regex, unclosed parentheses");
                    stk.push(n);
                    break;
                case ')':
                    if (stk.size() == 2)
                        compress(stk);
                    if (stk.size() == 1)
                        return stk.top();
                    throw SyntaxError("Could not parse regex, nothing inside parentheses");
                default:
                    stk.push(std::make_shared<SingleRE>(input[idx]));
            }
            compactStack(stk);
        }

        if (stk.size() == 2)
            compress(stk);
        if (stk.size() == 1)
            return stk.top();
        throw SyntaxError("Could not parse regex");
    }
}

    /**
     * Parse `input` as a regular expression and return the resulting RE tree.
     *
     * Throws a SyntaxError when `input` is not a valid regex, including when
     * it contains a `)` without a matching `(`.
     */
    std::shared_ptr<RE> parseRE(const string& input) {
        size_t i = 0;
        std::shared_ptr<RE> res = parseRE(input, i);
        // The recursive parser only stops before the end of the input on a ')'.
        // At this level nothing opened it, so any leftover input is an error.
        // (Comparing against length() - 1 let a trailing unmatched ')' slip through.)
        if (i < input.length())
            throw SyntaxError("Incorrect regex");
        return res;
    }

}