From c80fcdb8aa1ebb306e8b7db4489736a62bf4cd8e Mon Sep 17 00:00:00 2001
From: Robin Jadoul
Date: Sun, 24 Apr 2016 16:00:18 +0200
Subject: [PATCH] Import re.cpp and modify ENFA generation to use new ENFA structures

---
 src/re.cpp | 260 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 260 insertions(+)
 create mode 100644 src/re.cpp

diff --git a/src/re.cpp b/src/re.cpp
new file mode 100644
index 0000000..e884ae4
--- /dev/null
+++ b/src/re.cpp
@@ -0,0 +1,260 @@
+#include "Lexesis/re.h"
+
+// NOTE(review): the names of the three standard headers were lost when this
+// patch was extracted; <algorithm>, <stack> and <string> are the ones the
+// code below visibly needs (std::max, stack, string) - confirm upstream.
+#include <algorithm>
+#include <stack>
+#include <string>
+using namespace std;
+
+
+namespace lxs {
+    /// Textual form of the empty language.
+    string EmptyRE::toRe()
+    {
+        return "∅";
+    }
+
+    /// Builds the fragment for the empty language. No transition is added, so
+    /// `attach + 1` is not reachable from `attach`; it still becomes the only
+    /// entry of the accepting set.
+    /// @return the end state `attach + 1`
+    State EmptyRE::toENFA(ENFA& enfa, State attach)
+    {
+        enfa.numStates = std::max(attach + 1, enfa.numStates);
+        enfa.accepting.clear();
+        enfa.accepting.insert(attach + 1);
+        return attach + 1;
+    }
+
+    /// Textual form of the empty word.
+    string EpsilonRE::toRe()
+    {
+        return "ε";
+    }
+
+    /// Builds the fragment for the empty word: an epsilon transition from
+    /// `attach` to `attach + 1`, which becomes the only accepting state.
+    /// @return the end state `attach + 1`
+    State EpsilonRE::toENFA(ENFA& enfa, State attach)
+    {
+        enfa.numStates = std::max(attach + 1, enfa.numStates);
+        enfa.accepting.clear();
+        enfa.accepting.insert(attach + 1);
+        enfa.epsilonTransitions[attach].insert(attach + 1);
+        return attach + 1;
+    }
+
+    /// Textual form of a single character: the character itself.
+    string SingleRE::toRe()
+    {
+        return string(1, c);
+    }
+
+    /// Builds the fragment for one character: a transition on `c` from
+    /// `attach` to `attach + 1`, which becomes the only accepting state.
+    /// @return the end state `attach + 1`
+    State SingleRE::toENFA(ENFA& enfa, State attach)
+    {
+        enfa.numStates = std::max(attach + 1, enfa.numStates);
+        enfa.accepting.clear();
+        enfa.accepting.insert(attach + 1);
+        enfa.delta[attach][c].insert(attach + 1);
+        return attach + 1;
+    }
+
+    /// Textual form of a concatenation: both operands back to back.
+    string ConcatRE::toRe()
+    {
+        return e->toRe() + f->toRe();
+    }
+
+    /// Builds `e` starting at `attach`, links its end state `a` to `a + 1` with
+    /// an epsilon transition and builds `f` from `a + 1`.
+    /// @return the end state reported by `f`
+    State ConcatRE::toENFA(ENFA& enfa, State attach)
+    {
+        State a = e->toENFA(enfa, attach);
+        enfa.epsilonTransitions[a].insert(a + 1);
+        return f->toENFA(enfa, a + 1);
+    }
+
+    /// Textual form of a Kleene star: the operand in parentheses, then '*'.
+    string StarRE::toRe()
+    {
+        return "(" + e->toRe() + ")*";
+    }
+
+    /// Builds the Kleene star of `e`. `e` is built from `attach + 1` and ends
+    /// in `a`; `a + 1` becomes the only accepting state.
+    /// @return the end state `a + 1`
+    State StarRE::toENFA(ENFA& enfa, State attach)
+    {
+        State a = e->toENFA(enfa, attach + 1);
+        enfa.numStates = std::max(a + 1, enfa.numStates);
+        enfa.accepting.clear();
+        enfa.accepting.insert(a + 1);
+        enfa.epsilonTransitions[attach].insert(attach + 1); // enter e
+        enfa.epsilonTransitions[attach].insert(a + 1);      // zero repetitions
+        enfa.epsilonTransitions[a].insert(attach + 1);      // repeat e
+        enfa.epsilonTransitions[a].insert(a + 1);           // leave e
+        return a + 1;
+    }
+
+    /// Textual form of an alternation: "(e+f)".
+    string PlusRE::toRe()
+    {
+        return "(" + e->toRe() + "+" + f->toRe() + ")";
+    }
+
+    /// Builds the alternation of `e` and `f`. `e` is built from `attach + 1`
+    /// and ends in `a`, `f` is built from `a + 1` and ends in `b`; `b + 1`
+    /// becomes the only accepting state.
+    /// @return the end state `b + 1`
+    State PlusRE::toENFA(ENFA& enfa, State attach)
+    {
+        State a = e->toENFA(enfa, attach + 1);
+        State b = f->toENFA(enfa, a + 1);
+        enfa.numStates = std::max(enfa.numStates, b + 1);
+        enfa.epsilonTransitions[attach].insert(attach + 1); // branch into e
+        enfa.epsilonTransitions[attach].insert(a + 1);      // branch into f
+        enfa.epsilonTransitions[a].insert(b + 1);           // e finished
+        enfa.epsilonTransitions[b].insert(b + 1);           // f finished
+        enfa.accepting.clear();
+        enfa.accepting.insert(b + 1);
+        return b + 1;
+    }
+
+    /// Replaces the two topmost entries of `stk` with their concatenation.
+    /// Callers must make sure `stk` holds at least two entries.
+    static void compress(stack<RE*>& stk)
+    {
+        RE* a = stk.top();
+        stk.pop();
+        RE* b = stk.top();
+        stk.pop();
+        stk.push(new ConcatRE(b, a)); //Attention: reversed order because of stack
+    }
+
+    /// Concatenates everything below the topmost entry, leaving at most two
+    /// entries: the expression so far and the most recent operand, which stays
+    /// separate so that a following '*' wraps it alone.
+    static void compactStack(stack<RE*>& stk)
+    {
+        if (stk.empty()) return;
+        RE* tp = stk.top();
+        stk.pop();
+        while (stk.size() >= 2)
+        {
+            compress(stk);
+        }
+        stk.push(tp);
+    }
+
+    namespace {
+        /// Parses `input` from `idx` up to the end of the string or up to a ')'
+        /// that was not opened inside this call.
+        ///
+        /// On return `idx` equals `input.length()` when the string was consumed
+        /// completely, otherwise it is the position of that ')'. Nodes are
+        /// allocated with `new`; nothing in this file frees them.
+        /// @throws SyntaxError on malformed input
+        RE* parseRE(string& input, size_t& idx)
+        {
+            stack<RE*> stk;
+            for (; idx < input.length(); idx++)
+            {
+                RE* n;
+                switch (input[idx])
+                {
+                    case '\n':
+                        // Only a trailing newline is tolerated
+                        if (idx != input.size() - 1)
+                            throw SyntaxError("Cannot have a newline inside of a regex");
+                        break;
+
+                    case '\\':
+                        idx++;
+                        if (idx >= input.length())
+                            throw SyntaxError("Escape sequence at the end of the string");
+                        if (input[idx] == 'e')
+                            stk.push(new EpsilonRE());
+                        else if (input[idx] == 'E')
+                            stk.push(new EmptyRE());
+                        else if (input[idx] == '\\' || input[idx] == '*' || input[idx] == '+' || input[idx] == '(' || input[idx] == ')')
+                            stk.push(new SingleRE(input[idx]));
+                        else
+                            throw SyntaxError(("invalid escape sequence: \\" + string(1, input[idx])).c_str());
+                        break;
+
+                    case '*':
+                        if (stk.empty())
+                            throw SyntaxError("Cannot apply kleene star to empty regex");
+                        n = new StarRE(stk.top());
+                        stk.pop();
+                        stk.push(n);
+                        break;
+
+                    case '+':
+                    {
+                        if (stk.empty())
+                            throw SyntaxError("Invalid regex: nothing to the left of '+'");
+                        if (stk.size() > 1)
+                            compactStack(stk), compress(stk);
+                        PlusRE* alt = new PlusRE(nullptr, nullptr);
+                        alt->e = stk.top();
+                        // The right operand runs to the end or to the enclosing ')'
+                        alt->f = parseRE(input, ++idx);
+                        stk.pop();
+                        stk.push(alt);
+                        // Step back so the loop increment lands where the recursive
+                        // call stopped (a ')' must still be handled by this level)
+                        idx--;
+                        break;
+                    }
+
+                    case '(':
+                        n = parseRE(input, ++idx);
+                        if (idx >= input.size() || input[idx] != ')')
+                            throw SyntaxError("Could not parse regex, unclosed parentheses");
+                        stk.push(n);
+                        break;
+
+                    case ')':
+                        // End of this (sub)expression; idx stays on the ')'
+                        if (stk.size() == 1)
+                            return stk.top();
+                        else if (stk.size() == 2)
+                            return compress(stk), stk.top();
+                        throw SyntaxError("Could not parse regex, nothing inside parentheses");
+
+                    default:
+                        stk.push(new SingleRE(input[idx]));
+                }
+                compactStack(stk);
+            }
+            if (stk.size() == 1)
+                return stk.top();
+            else if (stk.size() == 2)
+                return compress(stk), stk.top();
+            throw SyntaxError("Could not parse regex");
+        }
+
+    }
+
+    /// Parses a complete regular expression.
+    /// @throws SyntaxError on malformed input, including a ')' without a
+    ///         matching '('
+    RE* parseRE(string& input)
+    {
+        size_t i = 0;
+        RE* res = parseRE(input, i);
+        // The helper only stops before the end on a ')' that nobody opened, so
+        // any leftover input is an error. (Comparing against length() - 1 let
+        // such a ')' through when it was the last character.)
+        if (i < input.length())
+            throw SyntaxError("Incorrect regex");
+        return res;
+    }
+}