// Source listing metadata: Lexesis/src/re.cpp (379 lines, 12 KiB, C++)

#include "Lexesis/re.h"
#include <algorithm>
#include <iostream>
#include <stack>
using namespace std;
namespace lxs {
/// Textual form of this node: always the empty string.
string EmptyRE::toRe()
{
    return string();
}
/**
 * Adds this node's fragment to `enfa`, beginning at state `attach`.
 *
 * No transition is added: `attach + 1` simply becomes the only accepting
 * state, so nothing leads from `attach` to it.
 *
 * @param enfa   automaton under construction; its accepting set is replaced
 * @param attach state the fragment starts from
 * @return the fragment's end state, `attach + 1`
 */
State EmptyRE::toENFA(ENFA& enfa, State attach)
{
    // Only ever grow the state count, exactly like the sibling node types do;
    // a plain assignment could shrink it when the automaton is already larger.
    enfa.numStates = std::max(attach + 1, enfa.numStates);
    enfa.accepting.clear();
    enfa.accepting.insert(attach + 1);
    return attach + 1;
}
/// Textual form of this node: the literal "ε".
string EpsilonRE::toRe()
{
    return string("ε");
}
/**
 * Adds this node's fragment to `enfa`: a single epsilon transition
 * from `attach` to `attach + 1`.
 *
 * @param enfa   automaton under construction; its accepting set is replaced
 * @param attach state the fragment starts from
 * @return the fragment's end (and now sole accepting) state, `attach + 1`
 */
State EpsilonRE::toENFA(ENFA& enfa, State attach)
{
    const auto finish = attach + 1;

    enfa.epsilonTransitions[attach].insert(finish);

    enfa.accepting.clear();
    enfa.accepting.insert(finish);
    enfa.numStates = std::max(finish, enfa.numStates);
    return finish;
}
/// Textual form of this node: a one-character string holding `c` (unescaped).
string SingleRE::toRe()
{
    string text;
    text.push_back(c);
    return text;
}
/**
 * Adds this node's fragment to `enfa`: one transition on character `c`
 * from `attach` to `attach + 1`.
 *
 * @param enfa   automaton under construction; its accepting set is replaced
 * @param attach state the fragment starts from
 * @return the fragment's end (and now sole accepting) state, `attach + 1`
 */
State SingleRE::toENFA(ENFA& enfa, State attach)
{
    const auto finish = attach + 1;

    enfa.delta[attach][c].insert(finish);

    enfa.accepting.clear();
    enfa.accepting.insert(finish);
    enfa.numStates = std::max(finish, enfa.numStates);
    return finish;
}
/// Textual form of this node: the text of `e` immediately followed by the text of `f`.
string ConcatRE::toRe()
{
    string text = e->toRe();
    text += f->toRe();
    return text;
}
/**
 * Adds the fragment for `e` followed by the fragment for `f`.
 *
 * `e` is built starting at `attach`; its end state is linked by an epsilon
 * transition to the next state, from which `f` is built.
 *
 * @param enfa   automaton under construction
 * @param attach state the fragment starts from
 * @return the end state reported by `f`
 */
State ConcatRE::toENFA(ENFA& enfa, State attach)
{
    const State firstEnd = e->toENFA(enfa, attach);
    const auto secondStart = firstEnd + 1;

    // Glue the two halves together.
    enfa.epsilonTransitions[firstEnd].insert(secondStart);

    return f->toENFA(enfa, secondStart);
}
/// Textual form of this node: the text of `e` wrapped as "(...)*".
string StarRE::toRe()
{
    string text("(");
    text += e->toRe();
    text += ")*";
    return text;
}
/**
 * Adds the fragment for zero or more repetitions of `e`.
 *
 * `e` is built starting at `attach + 1`. Epsilon transitions are then added
 * to enter the body, to bypass it, to repeat it, and to leave it.
 *
 * @param enfa   automaton under construction; its accepting set is replaced
 * @param attach state the fragment starts from
 * @return the fragment's end (and now sole accepting) state
 */
State StarRE::toENFA(ENFA& enfa, State attach)
{
    const auto bodyStart = attach + 1;
    const State bodyEnd = e->toENFA(enfa, bodyStart);
    const auto finish = bodyEnd + 1;

    enfa.epsilonTransitions[attach].insert(bodyStart);   // enter the body
    enfa.epsilonTransitions[attach].insert(finish);      // skip it entirely
    enfa.epsilonTransitions[bodyEnd].insert(bodyStart);  // go round again
    enfa.epsilonTransitions[bodyEnd].insert(finish);     // leave the loop

    enfa.numStates = std::max(finish, enfa.numStates);
    enfa.accepting.clear();
    enfa.accepting.insert(finish);
    return finish;
}
/// Textual form of this node: "(" + text of `e` + "+" + text of `f` + ")".
string PlusRE::toRe()
{
    string text("(");
    text += e->toRe();
    text += "+";
    text += f->toRe();
    text += ")";
    return text;
}
/**
 * Adds the fragment for the choice between `e` and `f`.
 *
 * `e` is built starting at `attach + 1`, and `f` directly after the end of `e`.
 * `attach` gets an epsilon transition to the start of each branch, and both
 * branch ends get an epsilon transition to a common end state.
 *
 * @param enfa   automaton under construction; its accepting set is replaced
 * @param attach state the fragment starts from
 * @return the common end (and now sole accepting) state
 */
State PlusRE::toENFA(ENFA& enfa, State attach)
{
    const State leftEnd = e->toENFA(enfa, attach + 1);
    const State rightEnd = f->toENFA(enfa, leftEnd + 1);
    const auto join = rightEnd + 1;

    // Fork from the attach point into both branches.
    enfa.epsilonTransitions[attach].insert(attach + 1);
    enfa.epsilonTransitions[attach].insert(leftEnd + 1);

    // Merge both branch ends into the common end state.
    enfa.epsilonTransitions[leftEnd].insert(join);
    enfa.epsilonTransitions[rightEnd].insert(join);

    enfa.numStates = std::max(enfa.numStates, join);
    enfa.accepting.clear();
    enfa.accepting.insert(join);
    return join;
}
namespace {
/// Replaces the two topmost stack entries by their concatenation.
/// The entry that was pushed earlier becomes the left operand.
/// The stack must hold at least two entries.
void compress(stack<std::shared_ptr<RE>>& stk)
{
    const std::shared_ptr<RE> right = stk.top();
    stk.pop();
    // What is on top now is the left operand; overwrite it with the combined node.
    stk.top() = std::make_shared<ConcatRE>(stk.top(), right);
}
/// Concatenates everything below the topmost entry into a single node while
/// leaving the topmost entry untouched, so the stack ends up with at most two
/// entries. Does nothing on an empty stack.
void compactStack(stack<std::shared_ptr<RE> >& stk)
{
    if (!stk.empty())
    {
        // Set the newest entry aside so it is not merged into the rest.
        const std::shared_ptr<RE> newest = stk.top();
        stk.pop();

        while (stk.size() > 1)
            compress(stk);

        stk.push(newest);
    }
}
/// Translates the character following a backslash into the character it denotes.
/// Listed punctuation maps to itself, listed letters map to the matching
/// control character (and 's' to a space); anything else raises SyntaxError.
char parseEscapeChar(char c) {
    switch (c)
    {
        // Letter escapes.
        case 'n': return '\n';
        case 'r': return '\r';
        case 'b': return '\b';
        case 't': return '\t';
        case 's': return ' ';
        case 'a': return '\a';
        case 'f': return '\f';
        case 'v': return '\v';

        // Characters that simply stand for themselves once escaped.
        case '\\':
        case '*':
        case '+':
        case '|':
        case '(':
        case ')':
        case '[':
        case ']':
        case '?':
        case '.':
        case '\'':
        case '"':
        case '-':
            return c;

        default:
            break;
    }
    throw SyntaxError(("Invalid escape sequence: \\" + std::string(1, c)).c_str());
}
/// Combines all entries of `res` into one PlusRE tree stored in `res[0]`,
/// pairing neighbours at doubling distances so the tree stays balanced.
/// An empty or single-element vector is left unchanged.
void sumREs(std::vector<std::shared_ptr<RE> >& res)
{
    const std::size_t count = res.size();
    for (std::size_t stride = 1; stride < count; stride *= 2)
    {
        // Merge each slot with its partner `stride` positions further on;
        // once a partner falls off the end, all later ones do too.
        for (std::size_t left = 0; left + stride < count; left += 2 * stride)
        {
            res[left] = std::make_shared<PlusRE>(res[left], res[left + stride]);
        }
    }
}
std::shared_ptr<RE> parseCharacterClass(const string& input, size_t& idx) {
if (idx >= input.size())
throw SyntaxError("Unclosed character class");
std::vector<std::shared_ptr<RE> > chars;
std::set<char> used_chars;
bool invert = false;
int last_char = -1;
if (input[idx] == '^')
{
invert = true;
idx++;
}
if (idx >= input.size())
throw SyntaxError("Unclosed character class");
if (input[idx] == ']')
{
used_chars.insert(']');
idx++;
last_char = ']';
}
if (idx >= input.size())
throw SyntaxError("Unclosed character class");
if (input[idx] == '-')
{
used_chars.insert('-');
idx++;
last_char = '-';
}
if (idx >= input.size())
throw SyntaxError("Unclosed character class");
for (; idx < input.size() && input[idx] != ']'; idx++)
{
if (input[idx] == '-')
{
idx++;
if (idx >= input.size())
throw SyntaxError("Unclosed character class");
if (input[idx] == ']')
{
used_chars.insert('-');
}
else
{
if (last_char == -1)
throw SyntaxError("Nothing to apply range to");
for (int i = last_char + 1; i <= input[idx]; i++)
{
used_chars.insert((char) i);
}
last_char = -1;
}
}
else if (input[idx] == '\\')
{
idx++;
if (idx >= input.size())
throw SyntaxError("Unclosed character classe");
last_char = parseEscapeChar(input[idx]);
used_chars.insert(last_char);
}
else
{
used_chars.insert(input[idx]);
last_char = input[idx];
}
}
if (idx >= input.size())
throw SyntaxError("Unclosed character class");
idx++; //Eat the ]
for (int i = 0; i < 256; i++)
{
if (invert ^ (used_chars.count((char) i) > 0))
chars.push_back(std::make_shared<SingleRE>((char) i));
}
sumREs(chars);
return chars[0];
}
std::shared_ptr<RE> dotChar() {
std::vector<std::shared_ptr<RE> > any;
for (int i = 0; i < 256; i++)
if ((char) i != '\n') //Dot matches anything except newlines
any.push_back(std::make_shared<SingleRE>((char) i));
sumREs(any);
return any[0];
}
/**
 * Parses one (sub)expression of a regex, starting at `input[idx]`.
 *
 * Parsing stops either at the end of the input or at a ')' that closes the
 * current level; in the latter case `idx` is left ON that ')'.
 *
 * The working stack is kept at no more than two entries between characters:
 * the concatenation of everything seen so far, and the most recent atom, so
 * that a postfix operator ('*', '+', '?') applies to that atom only.
 *
 * @param input the full regex text
 * @param idx   current position; advanced as input is consumed
 * @return the parsed expression
 * @throws SyntaxError on malformed input
 */
std::shared_ptr<RE> parseRE(const string& input, size_t& idx)
{
    stack<std::shared_ptr<RE> > stk;
    for (; idx < input.length(); idx++)
    {
        std::shared_ptr<RE> n;
        switch (input[idx])
        {
            case '\\':
                idx++;
                if (idx >= input.length())
                    throw SyntaxError("Escape sequence at the end of the string");
                stk.push(std::make_shared<SingleRE>(parseEscapeChar(input[idx])));
                break;
            case '[':
                stk.push(parseCharacterClass(input, ++idx));
                // parseCharacterClass leaves idx just past the ']'. Step back so
                // the loop increment lands on the next unread character instead
                // of skipping it.
                idx--;
                break;
            case '.':
                // A dot is one "any character" atom: push it exactly once.
                stk.push(dotChar());
                break;
            case ']':
                throw SyntaxError("Unopened ']'");
            case '*':
                if (stk.empty())
                    throw SyntaxError("Cannot apply kleene star to empty regex");
                n = std::make_shared<StarRE>(stk.top());
                stk.pop();
                stk.push(n);
                break;
            case '+':
                if (stk.empty())
                    throw SyntaxError("Cannot apply kleene plus to empty regex");
                n = stk.top();
                stk.pop();
                // x+ is expressed as x followed by x*.
                n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n));
                stk.push(n);
                break;
            case '?':
                if (stk.empty())
                    throw SyntaxError("Cannot apply '?' to empty regex");
                // x? is expressed as the choice between x and epsilon.
                n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>());
                stk.pop();
                stk.push(n);
                break;
            case '|':
                if (stk.empty())
                    throw SyntaxError("Invalid regex: nothing to the left of '|'");
                // Fold everything seen so far into a single left operand.
                if (stk.size() > 1)
                {
                    compactStack(stk);
                    compress(stk);
                }
                n = std::make_shared<PlusRE>(stk.top(), parseRE(input, ++idx));
                stk.pop();
                stk.push(n);
                // The recursive call stopped on a ')' or at the end of the input;
                // undo the upcoming loop increment so this level sees that position.
                idx--;
                break;
            case '(':
                n = parseRE(input, ++idx);
                if (idx >= input.size() || input[idx] != ')')
                    throw SyntaxError("Could not parse regex, unclosed parentheses");
                stk.push(n);
                break;
            case ')':
                // End of this nesting level; idx stays on the ')'.
                if (stk.size() == 1)
                    return stk.top();
                if (stk.size() == 2)
                {
                    compress(stk);
                    return stk.top();
                }
                throw SyntaxError("Could not parse regex, nothing inside parentheses");
            default:
                stk.push(std::make_shared<SingleRE>(input[idx]));
        }
        compactStack(stk);
    }

    if (stk.size() == 1)
        return stk.top();
    if (stk.size() == 2)
    {
        compress(stk);
        return stk.top();
    }
    throw SyntaxError("Could not parse regex");
}
}
/**
 * Parses a complete regular expression.
 *
 * @param input the regex text
 * @return the parsed expression
 * @throws SyntaxError if the text is malformed or is not consumed entirely
 */
std::shared_ptr<RE> parseRE(const string& input)
{
    size_t i = 0;
    std::shared_ptr<RE> res = parseRE(input, i);
    // The recursive parser only stops early when it meets a ')'. At top level
    // nothing opened it, so any unread input, including a ')' that is the very
    // last character, is an error.
    if (i < input.length())
        throw SyntaxError("Incorrect regex");
    return res;
}
}