Lexesis/src/re.cpp

370 lines
13 KiB
C++

#include "Lexesis/re.h"
#include "RegexLexer.h"
#include <algorithm>
#include <iostream>
#include <sstream>
#include <stack>
using namespace std;
namespace lxs {
string EmptyRE::toRe()
{
return "";
}
/**
 * Attach the automaton fragment for the empty regular expression to `enfa`.
 *
 * State `attach + 1` becomes the only accepting state. No transition is added
 * from `attach`, so this fragment itself offers no path to that state.
 *
 * @param enfa   automaton under construction; its accepting set is replaced
 * @param attach state at which this fragment starts
 * @return the accepting state of the fragment, `attach + 1`
 */
State EmptyRE::toENFA(ENFA& enfa, State attach)
{
    // Only ever grow numStates, exactly like every other toENFA in this file.
    // A plain assignment could shrink the bound below states already in use.
    enfa.numStates = std::max(attach + 1, enfa.numStates);
    enfa.accepting.clear();
    enfa.accepting.insert(attach + 1);
    return attach + 1;
}
string EpsilonRE::toRe()
{
return "ε";
}
/**
 * Attach the fragment for the empty word to `enfa`.
 *
 * Adds a single epsilon transition `attach -> attach + 1` and makes
 * `attach + 1` the only accepting state.
 *
 * @param enfa   automaton under construction; its accepting set is replaced
 * @param attach state at which this fragment starts
 * @return the accepting state of the fragment, `attach + 1`
 */
State EpsilonRE::toENFA(ENFA& enfa, State attach)
{
    const State accept = attach + 1;

    // numStates never shrinks, it is only raised when needed
    if (enfa.numStates < accept)
        enfa.numStates = accept;

    enfa.epsilonTransitions[attach].insert(accept);

    enfa.accepting.clear();
    enfa.accepting.insert(accept);
    return accept;
}
string SingleRE::toRe()
{
return string(1, c);
}
/**
 * Attach the fragment for a single character to `enfa`.
 *
 * Adds one transition `attach --c--> attach + 1` and makes `attach + 1`
 * the only accepting state.
 *
 * @param enfa   automaton under construction; its accepting set is replaced
 * @param attach state at which this fragment starts
 * @return the accepting state of the fragment, `attach + 1`
 */
State SingleRE::toENFA(ENFA& enfa, State attach)
{
    const State accept = attach + 1;

    // numStates never shrinks, it is only raised when needed
    if (enfa.numStates < accept)
        enfa.numStates = accept;

    enfa.delta[attach][c].insert(accept);

    enfa.accepting.clear();
    enfa.accepting.insert(accept);
    return accept;
}
string MultiRE::toRe()
{
return "[" + string(chars.begin(), chars.end()) + "]";
}
/**
 * Attach the fragment for a character set to `enfa`.
 *
 * For every character in `chars` a transition `attach --ch--> attach + 1`
 * is added; `attach + 1` becomes the only accepting state.
 *
 * @param enfa   automaton under construction; its accepting set is replaced
 * @param attach state at which this fragment starts
 * @return the accepting state of the fragment, `attach + 1`
 */
State MultiRE::toENFA(ENFA& enfa, State attach)
{
    const State accept = attach + 1;

    // numStates never shrinks, it is only raised when needed
    if (enfa.numStates < accept)
        enfa.numStates = accept;

    enfa.accepting.clear();
    enfa.accepting.insert(accept);

    for (auto it = chars.begin(); it != chars.end(); ++it) {
        const char symbol = *it;
        enfa.delta[attach][symbol].insert(accept);
    }
    return accept;
}
string ConcatRE::toRe()
{
return e->toRe() + f->toRe();
}
/**
 * Attach the fragment for a concatenation to `enfa`.
 *
 * `e` is built starting at `attach`; its final state is linked by an epsilon
 * transition to the next state number, where `f` is built.
 *
 * @param enfa   automaton under construction
 * @param attach state at which this fragment starts
 * @return the state returned by building `f`
 */
State ConcatRE::toENFA(ENFA& enfa, State attach)
{
    const State leftEnd = e->toENFA(enfa, attach);
    const State rightStart = leftEnd + 1;

    // Glue the end of the left operand to the start of the right operand
    enfa.epsilonTransitions[leftEnd].insert(rightStart);

    return f->toENFA(enfa, rightStart);
}
string StarRE::toRe()
{
return "(" + e->toRe() + ")*";
}
/**
 * Attach the fragment for a Kleene star to `enfa`.
 *
 * The operand `e` is built starting at `attach + 1`. With `innerEnd` the state
 * returned for `e` and `accept = innerEnd + 1`, these epsilon transitions are
 * added:
 *   attach   -> attach + 1   (enter the operand)
 *   attach   -> accept       (skip the operand entirely)
 *   innerEnd -> attach + 1   (repeat the operand)
 *   innerEnd -> accept       (leave the loop)
 *
 * @param enfa   automaton under construction; its accepting set is replaced
 * @param attach state at which this fragment starts
 * @return the accepting state of the fragment
 */
State StarRE::toENFA(ENFA& enfa, State attach)
{
    const State innerStart = attach + 1;
    const State innerEnd = e->toENFA(enfa, innerStart);
    const State accept = innerEnd + 1;

    // numStates never shrinks, it is only raised when needed
    if (enfa.numStates < accept)
        enfa.numStates = accept;

    enfa.accepting.clear();
    enfa.accepting.insert(accept);

    enfa.epsilonTransitions[attach].insert(innerStart);
    enfa.epsilonTransitions[attach].insert(accept);
    enfa.epsilonTransitions[innerEnd].insert(innerStart);
    enfa.epsilonTransitions[innerEnd].insert(accept);

    return accept;
}
string PlusRE::toRe()
{
return "(" + e->toRe() + "|" + f->toRe() + ")";
}
/**
 * Attach the fragment for an alternation (`e|f`) to `enfa`.
 *
 * `e` is built starting at `attach + 1`, `f` right after the state returned
 * for `e`. Epsilon transitions lead from `attach` into both branches, and from
 * the end of both branches into a fresh accepting state.
 *
 * @param enfa   automaton under construction; its accepting set is replaced
 * @param attach state at which this fragment starts
 * @return the accepting state of the fragment
 */
State PlusRE::toENFA(ENFA& enfa, State attach)
{
    // Build both branches first; the order matters because the second branch
    // is numbered after the first one.
    const State firstStart = attach + 1;
    const State firstEnd = e->toENFA(enfa, firstStart);
    const State secondStart = firstEnd + 1;
    const State secondEnd = f->toENFA(enfa, secondStart);
    const State accept = secondEnd + 1;

    // numStates never shrinks, it is only raised when needed
    if (enfa.numStates < accept)
        enfa.numStates = accept;

    // Fork into the two branches ...
    enfa.epsilonTransitions[attach].insert(firstStart);
    enfa.epsilonTransitions[attach].insert(secondStart);
    // ... and join them again in the common accepting state
    enfa.epsilonTransitions[firstEnd].insert(accept);
    enfa.epsilonTransitions[secondEnd].insert(accept);

    enfa.accepting.clear();
    enfa.accepting.insert(accept);
    return accept;
}
namespace {
/**
* Take the two top elements from `stk` and combine them with a ConcatRE
*/
void compress(stack<std::shared_ptr<RE>>& stk)
{
std::shared_ptr<RE> a = stk.top();
stk.pop();
std::shared_ptr<RE> b = stk.top();
stk.pop();
stk.push(std::make_shared<ConcatRE>(b, a)); //Attention: reversed order because of stack
}
/**
* Apply compress until only one RE remains on the stack
*/
void compactStack(stack<std::shared_ptr<RE> >& stk)
{
if (stk.empty()) return;
std::shared_ptr<RE> tp = stk.top();
stk.pop();
while (stk.size() >= 2)
{
compress(stk);
}
stk.push(tp);
}
/**
 * Parse a bracketed character class such as `[abc]`, `[a-z0-9]` or `[^x]`.
 *
 * `input` is the complete token text including the surrounding brackets.
 * Syntax as implemented below:
 *  - a `^` directly after `[` inverts the class,
 *  - a `]` directly after the (optional) `^` is a literal `]`,
 *  - a `-` at the very start or the very end of the class is a literal `-`,
 *  - `x-y` adds every character from `x` up to and including `y`; a reversed
 *    range (`y` smaller than `x`) adds nothing,
 *  - a `-` without a pending range start (for example the second one in
 *    `a-c-e`) is a literal `-`.
 *
 * Range bounds are compared by unsigned byte value, so ranges that involve
 * bytes >= 0x80 behave the same whether or not plain `char` is signed.
 *
 * @param input the character class token; assumed to have the form `[...]`
 *              (not validated here)
 * @return a MultiRE holding exactly the characters of the (possibly inverted) class
 */
std::shared_ptr<RE> parseCharacterClass(const std::string& input) {
    std::set<char> used_chars;
    bool invert = false;
    std::size_t start = 1;
    std::size_t end = input.size() - 1; // index of the closing ']'

    if (input[1] == '^') {
        invert = true;
        start = 2;
    }
    if (input[start] == ']') {
        used_chars.insert(']');
        ++start;
    }
    if (input[start] == '-') {
        used_chars.insert('-');
        ++start;
    }
    if (input[end - 1] == '-') {
        used_chars.insert('-');
        --end;
    }

    // Unsigned byte value of a pending range start, or -1 when there is none.
    // Storing the unsigned value keeps the sentinel distinct from byte 0xFF.
    int range_start = -1;
    for (std::size_t idx = start; idx < end; idx++)
    {
        const char current = input[idx];

        if (current != '-')
        {
            const bool starts_range = idx + 1 < end && input[idx + 1] == '-';
            if (starts_range)
                range_start = static_cast<unsigned char>(current);
            else
                used_chars.insert(current);
            continue;
        }

        if (range_start < 0)
        {
            // No left endpoint available: previously the -1 sentinel itself was
            // used as lower bound, which added byte 0xFF and everything from 0 on.
            used_chars.insert('-');
            continue;
        }

        // Consume the right endpoint. It may be the trailing '-' that was
        // already recorded above (as in `[+--]`); that index is still inside
        // the string because `end` never exceeds the index of the closing ']'.
        idx++;
        const int range_end = static_cast<unsigned char>(input[idx]);
        for (int code = range_start; code <= range_end; code++) {
            used_chars.insert(static_cast<char>(code));
        }
        range_start = -1;
    }

    std::vector<char> chars;
    for (int code = 0; code < 256; code++) {
        const char ch = static_cast<char>(code);
        const bool listed = used_chars.count(ch) > 0;
        // Keep listed characters normally, unlisted ones when inverted
        if (invert != listed)
            chars.push_back(ch);
    }
    return std::make_shared<MultiRE>(chars);
}
/**
 * Build the RE for the `.` pattern.
 *
 * @return a MultiRE containing all 256 byte values except the newline character
 */
std::shared_ptr<RE> dotChar() {
    std::vector<char> any;
    for (int code = 0; code < 256; ++code) {
        const char ch = static_cast<char>(code);
        if (ch == '\n') {
            // Dot matches anything except newlines
            continue;
        }
        any.push_back(ch);
    }
    return std::make_shared<MultiRE>(any);
}
/**
 * Parse a regular expression from the tokens of `lex` into an RE tree.
 *
 * Tokens are read one by one. Operands are pushed on a local stack, postfix
 * operators (`*`, `+`, `?`) rewrite the top of that stack, and `compactStack`
 * at the end of every iteration folds all older operands into one
 * concatenation. `|` and `(` are handled by recursive calls on the same lexer.
 *
 * The function returns either when a `)` is read (only legal when
 * `inside_parens` is true) or when the RegexLexer::NoMoreTokens exception is
 * caught, which is how the end of the input is detected here.
 *
 * @param lex                   token source; shared with all recursive calls
 * @param exit_by_closed_paren  output flag: reset to false before every token and
 *                              set to true when the call returns because of a `)`
 * @param inside_parens         whether this call parses the inside of a
 *                              parenthesised group, i.e. whether `)` is allowed
 * @return the parsed expression
 * @throws SyntaxError on malformed input (operator without operand, unbalanced
 *         parentheses, empty group or expression, lexer ERROR token)
 */
std::shared_ptr<RE> parseRE(RegexLexer& lex, bool& exit_by_closed_paren, bool inside_parens=false)
{
// Operand stack. After each loop iteration it holds at most two entries:
// the concatenation of everything older and, separately, the newest operand,
// so that a following postfix operator applies to that newest operand only.
stack<std::shared_ptr<RE> > stk;
try {
while (true) {
// The flag is only meaningful at the moment a call returns; clear any
// value left behind by a nested call that has already been handled.
exit_by_closed_paren = false;
RegexLexer::Token tok = lex.nextToken();
std::shared_ptr<RE> n;
switch (tok.type) {
// Escape sequences that stand for one fixed character
case RegexLexer::TAB:
stk.push(std::make_shared<SingleRE>('\t'));
break;
case RegexLexer::NEWLINE:
stk.push(std::make_shared<SingleRE>('\n'));
break;
case RegexLexer::CARRIAGE_RETURN:
stk.push(std::make_shared<SingleRE>('\r'));
break;
case RegexLexer::BACKSPACE:
stk.push(std::make_shared<SingleRE>('\b'));
break;
case RegexLexer::SPACE:
stk.push(std::make_shared<SingleRE>(' '));
break;
case RegexLexer::BELL:
stk.push(std::make_shared<SingleRE>('\a'));
break;
case RegexLexer::FORMFEED:
stk.push(std::make_shared<SingleRE>('\f'));
break;
case RegexLexer::VTAB:
stk.push(std::make_shared<SingleRE>('\v'));
break;
// Escaped metacharacters: the second character of the token text is taken
// literally (presumably the first one is the backslash - confirm in the lexer)
case RegexLexer::BACKSLASH:
case RegexLexer::ESCAPED_STAR:
case RegexLexer::ESCAPED_PLUS:
case RegexLexer::ESCAPED_PIPE:
case RegexLexer::ESCAPED_LPAREN:
case RegexLexer::ESCAPED_RPAREN:
case RegexLexer::ESCAPED_LBRACKET:
case RegexLexer::ESCAPED_RBRACKET:
case RegexLexer::ESCAPED_QUESTIONMARK:
case RegexLexer::ESCAPED_DOT:
stk.push(std::make_shared<SingleRE>(tok.content[1]));
break;
case RegexLexer::DOT:
stk.push(dotChar());
break;
// e* : wrap the newest operand in a StarRE
case RegexLexer::STAR:
if (stk.empty())
throw SyntaxError(lex, "Cannot apply kleene star to empty regex");
n = std::make_shared<StarRE>(stk.top());
stk.pop();
stk.push(n);
break;
// e+ : expressed as e followed by e*, both sharing the same operand node
case RegexLexer::PLUS:
if (stk.empty())
throw SyntaxError(lex, "Cannot apply kleene plus to empty regex");
n = stk.top();
stk.pop();
n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n));
stk.push(n);
break;
// e? : expressed as the alternation of e and the empty word
case RegexLexer::QUESTIONMARK:
if (stk.empty())
throw SyntaxError(lex, "Cannot apply '?' to empty regex");
n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>());
stk.pop();
stk.push(n);
break;
case RegexLexer::PIPE:
if (stk.empty())
throw SyntaxError(lex, "Invalid regex: nothing to the left of '|'");
// Everything parsed so far is the left operand: merge it into one entry
if (stk.size() > 1)
compactStack(stk), compress(stk);
// The right operand is parsed by a recursive call, which consumes tokens
// until it hits a ')' (if allowed) or runs out of input
n = std::make_shared<PlusRE>(stk.top(), parseRE(lex, exit_by_closed_paren, inside_parens));
stk.pop();
stk.push(n);
// If the recursive call stopped at ')', that ')' also closes this level
if (exit_by_closed_paren) {
if (stk.size() == 1)
return stk.top();
else if (stk.size() == 2)
return compress(stk), stk.top();
else
throw SyntaxError(lex, "Invalid regex");
}
break;
case RegexLexer::LPAREN:
// Parse the group recursively; it must end with a ')'
n = parseRE(lex, exit_by_closed_paren, true);
if (!exit_by_closed_paren) {
throw SyntaxError(lex, "Unclosed parenthesis");
}
stk.push(n);
break;
case RegexLexer::RPAREN:
if (!inside_parens)
throw SyntaxError(lex, "Unopened parenthesis");
// Report to the caller that this level ended on a ')'
exit_by_closed_paren = true;
if (stk.size() == 1)
return stk.top();
else if (stk.size() == 2)
return compress(stk), stk.top();
throw SyntaxError(lex, "Could not parse regex, nothing inside parentheses");
case RegexLexer::CHAR:
stk.push(std::make_shared<SingleRE>(tok.content[0]));
break;
case RegexLexer::CHAR_CLASS:
stk.push(parseCharacterClass(tok.content));
break;
case RegexLexer::ERROR:
throw SyntaxError(lex, "Error on character: " + tok.content);
case RegexLexer::ignore: case RegexLexer::nonmatching:
// These token types contribute nothing to the expression
break;
}
// Keep the stack small: fold everything below the newest operand
compactStack(stk);
}
} catch (RegexLexer::NoMoreTokens& err) {
// End of input: whatever is on the stack is the result
if (stk.size() == 1)
return stk.top();
else if (stk.size() == 2)
return compress(stk), stk.top();
throw SyntaxError(lex, "Could not parse regex");
}
}
}
std::shared_ptr<RE> parseRE(const string& input)
{
std::istringstream inputstream(input);
RegexLexer lex(inputstream);
bool exit_by_closed_paren = false;
std::shared_ptr<RE> res = parseRE(lex, exit_by_closed_paren);
return res;
}
}