385 lines
12 KiB
C++
385 lines
12 KiB
C++
#include "Lexesis/re.h"
|
|
|
|
#include <algorithm>
|
|
#include <iostream>
|
|
#include <stack>
|
|
using namespace std;
|
|
|
|
|
|
namespace lxs {
|
|
/**
 * Textual form of the empty language: the symbol "∅".
 */
string EmptyRE::toRe()
{
    static const char kEmptyLanguage[] = "∅";
    return kEmptyLanguage;
}
|
|
|
|
/**
 * Build the ENFA fragment for the empty language.
 *
 * State `attach + 1` becomes the only accepting state, but this function adds
 * no transition leading into it.
 *
 * @param enfa   automaton being extended
 * @param attach state the fragment is attached to
 * @return the fragment's accepting state, `attach + 1`
 */
State EmptyRE::toENFA(ENFA& enfa, State attach)
{
    // Only ever grow the state count, exactly like the other RE nodes do;
    // a plain assignment could shrink an automaton that already has more states.
    enfa.numStates = std::max(attach + 1, enfa.numStates);
    enfa.accepting.clear();
    enfa.accepting.insert(attach + 1);
    return attach + 1;
}
|
|
|
|
/**
 * Textual form of the empty string: the symbol "ε".
 */
string EpsilonRE::toRe()
{
    return string("ε");
}
|
|
|
|
/**
 * Build the ENFA fragment for the empty string: a single epsilon transition
 * from `attach` to `attach + 1`, which becomes the only accepting state.
 *
 * @param enfa   automaton being extended
 * @param attach state the fragment is attached to
 * @return the fragment's accepting state, `attach + 1`
 */
State EpsilonRE::toENFA(ENFA& enfa, State attach)
{
    const auto accept = attach + 1;

    enfa.numStates = std::max(accept, enfa.numStates);

    enfa.accepting.clear();
    enfa.accepting.insert(accept);

    enfa.epsilonTransitions[attach].insert(accept);
    return accept;
}
|
|
|
|
/**
 * Textual form of a single-character regex: a one-character string holding `c`.
 */
string SingleRE::toRe()
{
    string rendered;
    rendered.push_back(c);
    return rendered;
}
|
|
|
|
/**
 * Build the ENFA fragment for one character: a transition on `c` from
 * `attach` to `attach + 1`, which becomes the only accepting state.
 *
 * @param enfa   automaton being extended
 * @param attach state the fragment is attached to
 * @return the fragment's accepting state, `attach + 1`
 */
State SingleRE::toENFA(ENFA& enfa, State attach)
{
    const auto accept = attach + 1;

    enfa.numStates = std::max(accept, enfa.numStates);

    enfa.accepting.clear();
    enfa.accepting.insert(accept);

    enfa.delta[attach][c].insert(accept);
    return accept;
}
|
|
|
|
/**
 * Textual form of a character set: all characters of `chars`, in order,
 * wrapped in square brackets.
 */
string MultiRE::toRe()
{
    //FIXME: this does not consider characters that need escaping
    string rendered("[");
    rendered.append(chars.begin(), chars.end());
    rendered += ']';
    return rendered;
}
|
|
|
|
/**
 * Build the ENFA fragment for a character set: one transition from `attach`
 * to `attach + 1` for every character in `chars`. `attach + 1` becomes the
 * only accepting state.
 *
 * @param enfa   automaton being extended
 * @param attach state the fragment is attached to
 * @return the fragment's accepting state, `attach + 1`
 */
State MultiRE::toENFA(ENFA& enfa, State attach)
{
    const auto accept = attach + 1;

    enfa.numStates = std::max(accept, enfa.numStates);

    enfa.accepting.clear();
    enfa.accepting.insert(accept);

    for (auto it = chars.begin(); it != chars.end(); ++it)
        enfa.delta[attach][*it].insert(accept);

    return accept;
}
|
|
|
|
/**
 * Textual form of a concatenation: the text of `e` directly followed by the
 * text of `f`.
 */
string ConcatRE::toRe()
{
    string rendered = e->toRe();
    rendered += f->toRe();
    return rendered;
}
|
|
|
|
/**
 * Build the ENFA fragment for a concatenation: `e` is built on `attach`, its
 * end state is linked by an epsilon transition to the next state, and `f` is
 * built from there.
 *
 * @param enfa   automaton being extended
 * @param attach state the fragment is attached to
 * @return the state returned by building `f`
 */
State ConcatRE::toENFA(ENFA& enfa, State attach)
{
    const State leftEnd = e->toENFA(enfa, attach);
    const auto rightStart = leftEnd + 1;

    // Glue the end of the left operand to the start of the right operand
    enfa.epsilonTransitions[leftEnd].insert(rightStart);

    return f->toENFA(enfa, rightStart);
}
|
|
|
|
/**
 * Textual form of a Kleene star: the text of `e` in parentheses, followed
 * by `*`.
 */
string StarRE::toRe()
{
    string rendered("(");
    rendered += e->toRe();
    rendered += ")*";
    return rendered;
}
|
|
|
|
/**
 * Build the ENFA fragment for a Kleene star.
 *
 * The operand is built starting at `attach + 1`; the state after the
 * operand's end becomes the only accepting state. Epsilon transitions allow
 * skipping the operand, entering it, repeating it and leaving it.
 *
 * @param enfa   automaton being extended
 * @param attach state the fragment is attached to
 * @return the fragment's accepting state (operand end + 1)
 */
State StarRE::toENFA(ENFA& enfa, State attach)
{
    const auto bodyEntry = attach + 1;
    const State bodyExit = e->toENFA(enfa, bodyEntry);
    const auto accept = bodyExit + 1;

    enfa.numStates = std::max(accept, enfa.numStates);

    enfa.accepting.clear();
    enfa.accepting.insert(accept);

    // From the attach point: enter the operand, or skip it entirely
    enfa.epsilonTransitions[attach].insert(bodyEntry);
    enfa.epsilonTransitions[attach].insert(accept);

    // From the operand's end: go around again, or leave
    enfa.epsilonTransitions[bodyExit].insert(bodyEntry);
    enfa.epsilonTransitions[bodyExit].insert(accept);

    return accept;
}
|
|
|
|
/**
 * Textual form of an alternation: the texts of `e` and `f` separated by `|`
 * and wrapped in parentheses.
 */
string PlusRE::toRe()
{
    string rendered("(");
    rendered += e->toRe();
    rendered += '|';
    rendered += f->toRe();
    rendered += ')';
    return rendered;
}
|
|
|
|
/**
 * Build the ENFA fragment for an alternation.
 *
 * `e` is built starting at `attach + 1`, `f` right after the end of `e`.
 * `attach` gets an epsilon transition to the start of each operand, and the
 * end of each operand gets an epsilon transition to a common state, which
 * becomes the only accepting state.
 *
 * @param enfa   automaton being extended
 * @param attach state the fragment is attached to
 * @return the fragment's accepting state (end of `f` + 1)
 */
State PlusRE::toENFA(ENFA& enfa, State attach)
{
    const auto firstEntry = attach + 1;
    const State firstExit = e->toENFA(enfa, firstEntry);

    const auto secondEntry = firstExit + 1;
    const State secondExit = f->toENFA(enfa, secondEntry);

    const auto accept = secondExit + 1;
    enfa.numStates = std::max(enfa.numStates, accept);

    // Fork into both alternatives
    enfa.epsilonTransitions[attach].insert(firstEntry);
    enfa.epsilonTransitions[attach].insert(secondEntry);

    // Join both alternatives on the common end state
    enfa.epsilonTransitions[firstExit].insert(accept);
    enfa.epsilonTransitions[secondExit].insert(accept);

    enfa.accepting.clear();
    enfa.accepting.insert(accept);

    return accept;
}
|
|
|
|
namespace {
|
|
/**
|
|
* Take the two top elements from `stk` and combine them with a ConcatRE
|
|
*/
|
|
void compress(stack<std::shared_ptr<RE>>& stk)
|
|
{
|
|
std::shared_ptr<RE> a = stk.top();
|
|
stk.pop();
|
|
std::shared_ptr<RE> b = stk.top();
|
|
stk.pop();
|
|
stk.push(std::make_shared<ConcatRE>(b, a)); //Attention: reversed order because of stack
|
|
}
|
|
|
|
/**
|
|
* Apply compress until only one RE remains on the stack
|
|
*/
|
|
void compactStack(stack<std::shared_ptr<RE> >& stk)
|
|
{
|
|
if (stk.empty()) return;
|
|
std::shared_ptr<RE> tp = stk.top();
|
|
stk.pop();
|
|
while (stk.size() >= 2)
|
|
{
|
|
compress(stk);
|
|
}
|
|
stk.push(tp);
|
|
}
|
|
|
|
/**
|
|
* Get the actual char that should be used when c is placed after a backslash
|
|
*/
|
|
char parseEscapeChar(char c) {
|
|
switch (c)
|
|
{
|
|
case '\\':
|
|
case '*':
|
|
case '+':
|
|
case '|':
|
|
case '(':
|
|
case ')':
|
|
case '[':
|
|
case ']':
|
|
case '?':
|
|
case '.':
|
|
break;
|
|
case 'n':
|
|
c = '\n'; break;
|
|
case 'r':
|
|
c = '\r'; break;
|
|
case 'b':
|
|
c = '\b'; break;
|
|
case 't':
|
|
c = '\t'; break;
|
|
case 's':
|
|
c = ' '; break;
|
|
case 'a':
|
|
c = '\a'; break;
|
|
case 'f':
|
|
c = '\f'; break;
|
|
case 'v':
|
|
c = '\v'; break;
|
|
default:
|
|
throw SyntaxError(("Invalid escape sequence: \\" + std::string(1, c)).c_str());
|
|
}
|
|
return c;
|
|
}
|
|
|
|
/**
|
|
* Parse a character class
|
|
*/
|
|
std::shared_ptr<RE> parseCharacterClass(const string& input, size_t& idx) {
|
|
if (idx >= input.size())
|
|
throw SyntaxError("Unclosed character class");
|
|
std::set<char> used_chars;
|
|
|
|
bool invert = false;
|
|
int last_char = -1;
|
|
|
|
if (input[idx] == '^')
|
|
{
|
|
invert = true;
|
|
idx++;
|
|
}
|
|
|
|
if (idx >= input.size())
|
|
throw SyntaxError("Unclosed character class");
|
|
|
|
|
|
if (input[idx] == ']')
|
|
{
|
|
used_chars.insert(']');
|
|
idx++;
|
|
last_char = ']';
|
|
}
|
|
|
|
if (idx >= input.size())
|
|
throw SyntaxError("Unclosed character class");
|
|
|
|
if (input[idx] == '-')
|
|
{
|
|
used_chars.insert('-');
|
|
idx++;
|
|
last_char = '-';
|
|
}
|
|
|
|
if (idx >= input.size())
|
|
throw SyntaxError("Unclosed character class");
|
|
|
|
for (; idx < input.size() && input[idx] != ']'; idx++)
|
|
{
|
|
if (input[idx] == '-')
|
|
{
|
|
idx++;
|
|
|
|
if (idx >= input.size())
|
|
throw SyntaxError("Unclosed character class");
|
|
|
|
if (input[idx] == ']')
|
|
{
|
|
used_chars.insert('-');
|
|
idx--;
|
|
}
|
|
else
|
|
{
|
|
if (last_char == -1)
|
|
throw SyntaxError("Nothing to apply range to");
|
|
for (int i = last_char + 1; i <= input[idx]; i++)
|
|
{
|
|
used_chars.insert((char) i);
|
|
}
|
|
last_char = -1;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
used_chars.insert(input[idx]);
|
|
last_char = input[idx];
|
|
}
|
|
}
|
|
|
|
if (idx >= input.size())
|
|
throw SyntaxError("Unclosed character class");
|
|
|
|
std::vector<char> chars;
|
|
for (int i = 0; i < 256; i++)
|
|
{
|
|
if (invert ^ (used_chars.count((char) i) > 0))
|
|
chars.push_back((char) i);
|
|
}
|
|
|
|
return std::make_shared<MultiRE>(chars);
|
|
}
|
|
|
|
/**
|
|
* Return the RE for the `.` pattern: everything except a newline
|
|
*/
|
|
std::shared_ptr<RE> dotChar() {
|
|
std::vector<char> any;
|
|
for (int i = 0; i < 256; i++)
|
|
if ((char) i != '\n') //Dot matches anything except newlines
|
|
any.push_back((char) i);
|
|
return std::make_shared<MultiRE>(any);
|
|
}
|
|
|
|
/**
|
|
* Parse the actual regex
|
|
*/
|
|
std::shared_ptr<RE> parseRE(const string& input, size_t& idx)
|
|
{
|
|
stack<std::shared_ptr<RE> > stk;
|
|
for (; idx < input.length(); idx++)
|
|
{
|
|
std::shared_ptr<RE> n;
|
|
switch (input[idx])
|
|
{
|
|
case '\\':
|
|
idx++;
|
|
if (idx >= input.length())
|
|
throw SyntaxError("Escape sequence at the end of the string");
|
|
else
|
|
stk.push(std::make_shared<SingleRE>(parseEscapeChar(input[idx])));
|
|
break;
|
|
|
|
case '[':
|
|
stk.push(parseCharacterClass(input, ++idx));
|
|
break;
|
|
|
|
case '.':
|
|
stk.push(dotChar());
|
|
break;
|
|
|
|
case ']':
|
|
throw SyntaxError("Unopened ']'");
|
|
break;
|
|
|
|
case '*':
|
|
if (stk.empty())
|
|
throw SyntaxError("Cannot apply kleene star to empty regex");
|
|
n = std::make_shared<StarRE>(stk.top());
|
|
stk.pop();
|
|
stk.push(n);
|
|
break;
|
|
|
|
case '+':
|
|
if (stk.empty())
|
|
throw SyntaxError("Cannot apply kleene plus to empty regex");
|
|
n = stk.top();
|
|
stk.pop();
|
|
n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n));
|
|
stk.push(n);
|
|
break;
|
|
|
|
case '?':
|
|
if (stk.empty())
|
|
throw SyntaxError("Cannot apply '?' to empty regex");
|
|
n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>());
|
|
stk.pop();
|
|
stk.push(n);
|
|
break;
|
|
|
|
case '|':
|
|
if (stk.empty())
|
|
throw SyntaxError("Invalid regex: nothing to the left of '|'");
|
|
if (stk.size() > 1)
|
|
compactStack(stk), compress(stk);
|
|
n = std::make_shared<PlusRE>(stk.top(), parseRE(input, ++idx));
|
|
stk.pop();
|
|
stk.push(n);
|
|
idx--;
|
|
break;
|
|
|
|
case '(':
|
|
n = parseRE(input, ++idx);
|
|
if (idx >= input.size() || input[idx] != ')')
|
|
throw SyntaxError("Could not parse regex, unclosed parentheses");
|
|
stk.push(n);
|
|
break;
|
|
|
|
case ')':
|
|
if (stk.size() == 1)
|
|
return stk.top();
|
|
else if (stk.size() == 2)
|
|
return compress(stk), stk.top();
|
|
throw SyntaxError("Could not parse regex, nothing inside parentheses");
|
|
|
|
default:
|
|
stk.push(std::make_shared<SingleRE>(input[idx]));
|
|
}
|
|
compactStack(stk);
|
|
}
|
|
if (stk.size() == 1)
|
|
return stk.top();
|
|
else if (stk.size() == 2)
|
|
return compress(stk), stk.top();
|
|
throw SyntaxError("Could not parse regex");
|
|
}
|
|
|
|
}
|
|
|
|
std::shared_ptr<RE> parseRE(const string& input)
|
|
{
|
|
size_t i = 0;
|
|
std::shared_ptr<RE> res = parseRE(input, i);
|
|
if (i < input.length() - 1)
|
|
throw SyntaxError("Incorrect regex");
|
|
return res;
|
|
}
|
|
}
|