370 lines
13 KiB
C++
370 lines
13 KiB
C++
#include "Lexesis/re.h"
|
|
#include "RegexLexer.h"
|
|
|
|
#include <algorithm>
|
|
#include <iostream>
|
|
#include <sstream>
|
|
#include <stack>
|
|
using namespace std;
|
|
|
|
|
|
namespace lxs {
|
|
/**
 * Render the empty-language expression as text.
 *
 * @return the empty-set symbol
 */
string EmptyRE::toRe()
{
    string rendered("∅");
    return rendered;
}
|
|
|
|
/**
 * Add the automaton fragment for the empty language to `enfa`.
 *
 * The state `attach + 1` becomes the sole accepting state, but no transition
 * (neither in `delta` nor in `epsilonTransitions`) is added towards it.
 *
 * @param enfa   automaton under construction; `numStates` and `accepting` are updated
 * @param attach state the fragment is attached to
 * @return the fragment's final state, `attach + 1`
 */
State EmptyRE::toENFA(ENFA& enfa, State attach)
{
    // Only ever grow the state count, like every other toENFA in this file;
    // a plain assignment could shrink a count that is already larger.
    enfa.numStates = std::max(attach + 1, enfa.numStates);
    enfa.accepting.clear();
    enfa.accepting.insert(attach + 1);
    return attach + 1;
}
|
|
|
|
/**
 * Render the empty-string expression as text.
 *
 * @return the epsilon symbol
 */
string EpsilonRE::toRe()
{
    string rendered("ε");
    return rendered;
}
|
|
|
|
/**
 * Add the automaton fragment matching the empty string to `enfa`.
 *
 * An epsilon transition leads from `attach` to `attach + 1`, which becomes
 * the sole accepting state.
 *
 * @param enfa   automaton under construction
 * @param attach state the fragment is attached to
 * @return the fragment's final state, `attach + 1`
 */
State EpsilonRE::toENFA(ENFA& enfa, State attach)
{
    const auto exit_state = attach + 1;

    enfa.epsilonTransitions[attach].insert(exit_state);

    enfa.accepting.clear();
    enfa.accepting.insert(exit_state);

    // Grow, never shrink, the recorded state count.
    if (enfa.numStates < exit_state)
        enfa.numStates = exit_state;

    return exit_state;
}
|
|
|
|
/**
 * Render this single-character expression as text.
 *
 * @return a one-character string holding `c`
 */
string SingleRE::toRe()
{
    string rendered;
    rendered.push_back(c);
    return rendered;
}
|
|
|
|
/**
 * Add the automaton fragment matching the single character `c` to `enfa`.
 *
 * A transition on `c` leads from `attach` to `attach + 1`, which becomes the
 * sole accepting state.
 *
 * @param enfa   automaton under construction
 * @param attach state the fragment is attached to
 * @return the fragment's final state, `attach + 1`
 */
State SingleRE::toENFA(ENFA& enfa, State attach)
{
    const auto exit_state = attach + 1;

    enfa.delta[attach][c].insert(exit_state);

    enfa.accepting.clear();
    enfa.accepting.insert(exit_state);

    // Grow, never shrink, the recorded state count.
    if (enfa.numStates < exit_state)
        enfa.numStates = exit_state;

    return exit_state;
}
|
|
|
|
/**
 * Render this character-set expression as text.
 *
 * @return every element of `chars`, in stored order, wrapped in square brackets
 */
string MultiRE::toRe()
{
    string rendered(1, '[');
    rendered.append(chars.begin(), chars.end());
    rendered.push_back(']');
    return rendered;
}
|
|
|
|
/**
 * Add the automaton fragment matching any one character of `chars` to `enfa`.
 *
 * For every element of `chars` a transition leads from `attach` to
 * `attach + 1`, which becomes the sole accepting state.
 *
 * @param enfa   automaton under construction
 * @param attach state the fragment is attached to
 * @return the fragment's final state, `attach + 1`
 */
State MultiRE::toENFA(ENFA& enfa, State attach)
{
    const auto exit_state = attach + 1;

    if (enfa.numStates < exit_state)
        enfa.numStates = exit_state;

    enfa.accepting.clear();
    enfa.accepting.insert(exit_state);

    // One parallel edge per accepted symbol.
    for (char symbol : chars)
        enfa.delta[attach][symbol].insert(exit_state);

    return exit_state;
}
|
|
|
|
/**
 * Render this concatenation as text.
 *
 * @return the rendering of `e` immediately followed by the rendering of `f`
 */
string ConcatRE::toRe()
{
    string rendered = e->toRe();
    rendered += f->toRe();
    return rendered;
}
|
|
|
|
/**
 * Add the automaton fragment for `e` followed by `f` to `enfa`.
 *
 * The fragment of `e` is built from `attach`; its final state is linked by an
 * epsilon transition to the next state number, from which the fragment of
 * `f` is built.
 *
 * @param enfa   automaton under construction
 * @param attach state the fragment is attached to
 * @return the final state reported by the fragment of `f`
 */
State ConcatRE::toENFA(ENFA& enfa, State attach)
{
    const State left_exit = e->toENFA(enfa, attach);
    const auto right_entry = left_exit + 1;

    // Glue the two halves together.
    enfa.epsilonTransitions[left_exit].insert(right_entry);

    return f->toENFA(enfa, right_entry);
}
|
|
|
|
/**
 * Render this Kleene-star expression as text.
 *
 * @return the rendering of `e`, parenthesised and followed by `*`
 */
string StarRE::toRe()
{
    string rendered("(");
    rendered += e->toRe();
    rendered += ")*";
    return rendered;
}
|
|
|
|
/**
 * Add the automaton fragment for zero or more repetitions of `e` to `enfa`.
 *
 * The fragment of `e` is built from `attach + 1`; with `a` its final state,
 * the new final state is `a + 1`. Epsilon transitions are added for:
 *   attach -> attach + 1  (enter the body)
 *   attach -> a + 1       (skip the body)
 *   a      -> attach + 1  (repeat the body)
 *   a      -> a + 1       (leave the body)
 *
 * @param enfa   automaton under construction
 * @param attach state the fragment is attached to
 * @return the fragment's final (and sole accepting) state
 */
State StarRE::toENFA(ENFA& enfa, State attach)
{
    const auto body_entry = attach + 1;
    const State body_exit = e->toENFA(enfa, body_entry);
    const auto exit_state = body_exit + 1;

    if (enfa.numStates < exit_state)
        enfa.numStates = exit_state;

    enfa.accepting.clear();
    enfa.accepting.insert(exit_state);

    // Edges leaving the attachment point: enter or skip the body.
    enfa.epsilonTransitions[attach].insert(body_entry);
    enfa.epsilonTransitions[attach].insert(exit_state);

    // Edges leaving the body: loop back or move on.
    enfa.epsilonTransitions[body_exit].insert(body_entry);
    enfa.epsilonTransitions[body_exit].insert(exit_state);

    return exit_state;
}
|
|
|
|
/**
 * Render this alternation as text.
 *
 * @return the renderings of `e` and `f`, separated by `|` and parenthesised
 */
string PlusRE::toRe()
{
    string rendered("(");
    rendered += e->toRe();
    rendered += "|";
    rendered += f->toRe();
    rendered += ")";
    return rendered;
}
|
|
|
|
/**
 * Add the automaton fragment for the alternation of `e` and `f` to `enfa`.
 *
 * The fragment of `e` is built from `attach + 1` (final state `a`), the
 * fragment of `f` from `a + 1` (final state `b`); the new final state is
 * `b + 1`. Epsilon transitions are added for:
 *   attach -> attach + 1  (take the first branch)
 *   attach -> a + 1       (take the second branch)
 *   a      -> b + 1       (first branch done)
 *   b      -> b + 1       (second branch done)
 *
 * @param enfa   automaton under construction
 * @param attach state the fragment is attached to
 * @return the fragment's final (and sole accepting) state
 */
State PlusRE::toENFA(ENFA& enfa, State attach)
{
    const auto first_entry = attach + 1;
    const State first_exit = e->toENFA(enfa, first_entry);

    const auto second_entry = first_exit + 1;
    const State second_exit = f->toENFA(enfa, second_entry);

    const auto exit_state = second_exit + 1;
    if (enfa.numStates < exit_state)
        enfa.numStates = exit_state;

    // Fork into both branches ...
    enfa.epsilonTransitions[attach].insert(first_entry);
    enfa.epsilonTransitions[attach].insert(second_entry);

    // ... and join them again in the common final state.
    enfa.epsilonTransitions[first_exit].insert(exit_state);
    enfa.epsilonTransitions[second_exit].insert(exit_state);

    enfa.accepting.clear();
    enfa.accepting.insert(exit_state);

    return exit_state;
}
|
|
|
|
namespace {
|
|
/**
|
|
* Take the two top elements from `stk` and combine them with a ConcatRE
|
|
*/
|
|
void compress(stack<std::shared_ptr<RE>>& stk)
|
|
{
|
|
std::shared_ptr<RE> a = stk.top();
|
|
stk.pop();
|
|
std::shared_ptr<RE> b = stk.top();
|
|
stk.pop();
|
|
stk.push(std::make_shared<ConcatRE>(b, a)); //Attention: reversed order because of stack
|
|
}
|
|
|
|
/**
|
|
* Apply compress until only one RE remains on the stack
|
|
*/
|
|
void compactStack(stack<std::shared_ptr<RE> >& stk)
|
|
{
|
|
if (stk.empty()) return;
|
|
std::shared_ptr<RE> tp = stk.top();
|
|
stk.pop();
|
|
while (stk.size() >= 2)
|
|
{
|
|
compress(stk);
|
|
}
|
|
stk.push(tp);
|
|
}
|
|
|
|
/**
|
|
* Parse a character class
|
|
*/
|
|
std::shared_ptr<RE> parseCharacterClass(const string& input) {
|
|
std::set<char> used_chars;
|
|
|
|
bool invert = false;
|
|
std::size_t start = 1;
|
|
std::size_t end = input.size() - 1;
|
|
|
|
if (input[1] == '^') {
|
|
invert = true;
|
|
start = 2;
|
|
}
|
|
|
|
if (input[start] == ']') {
|
|
used_chars.insert(']');
|
|
++start;
|
|
}
|
|
|
|
if (input[start] == '-') {
|
|
used_chars.insert('-');
|
|
++start;
|
|
}
|
|
|
|
if (input[end - 1] == '-') {
|
|
used_chars.insert('-');
|
|
--end;
|
|
}
|
|
|
|
int last_char = -1;
|
|
for (std::size_t idx = start; idx < end; idx++)
|
|
{
|
|
if (input[idx] == '-')
|
|
{
|
|
idx++;
|
|
for (int i = last_char; i <= input[idx]; i++) {
|
|
used_chars.insert((char) i);
|
|
}
|
|
last_char = -1;
|
|
}
|
|
else
|
|
{
|
|
if (idx == end - 1 || (idx < end - 1 && input[idx + 1] != '-'))
|
|
used_chars.insert(input[idx]);
|
|
else
|
|
last_char = input[idx];
|
|
}
|
|
}
|
|
|
|
std::vector<char> chars;
|
|
for (int i = 0; i < 256; i++) {
|
|
if (invert ^ (used_chars.count((char) i) > 0))
|
|
chars.push_back((char) i);
|
|
}
|
|
|
|
return std::make_shared<MultiRE>(chars);
|
|
}
|
|
|
|
/**
|
|
* Return the RE for the `.` pattern: everything except a newline
|
|
*/
|
|
std::shared_ptr<RE> dotChar() {
|
|
std::vector<char> any;
|
|
for (int i = 0; i < 256; i++)
|
|
if ((char) i != '\n') //Dot matches anything except newlines
|
|
any.push_back((char) i);
|
|
return std::make_shared<MultiRE>(any);
|
|
}
|
|
|
|
/**
|
|
* Parse the actual regex
|
|
*/
|
|
std::shared_ptr<RE> parseRE(RegexLexer& lex, bool& exit_by_closed_paren, bool inside_parens=false)
|
|
{
|
|
stack<std::shared_ptr<RE> > stk;
|
|
|
|
try {
|
|
while (true) {
|
|
exit_by_closed_paren = false;
|
|
RegexLexer::Token tok = lex.nextToken();
|
|
std::shared_ptr<RE> n;
|
|
switch (tok.type) {
|
|
case RegexLexer::TAB:
|
|
stk.push(std::make_shared<SingleRE>('\t'));
|
|
break;
|
|
case RegexLexer::NEWLINE:
|
|
stk.push(std::make_shared<SingleRE>('\n'));
|
|
break;
|
|
case RegexLexer::CARRIAGE_RETURN:
|
|
stk.push(std::make_shared<SingleRE>('\r'));
|
|
break;
|
|
case RegexLexer::BACKSPACE:
|
|
stk.push(std::make_shared<SingleRE>('\b'));
|
|
break;
|
|
case RegexLexer::SPACE:
|
|
stk.push(std::make_shared<SingleRE>(' '));
|
|
break;
|
|
case RegexLexer::BELL:
|
|
stk.push(std::make_shared<SingleRE>('\a'));
|
|
break;
|
|
case RegexLexer::FORMFEED:
|
|
stk.push(std::make_shared<SingleRE>('\f'));
|
|
break;
|
|
case RegexLexer::VTAB:
|
|
stk.push(std::make_shared<SingleRE>('\v'));
|
|
break;
|
|
|
|
case RegexLexer::BACKSLASH:
|
|
case RegexLexer::ESCAPED_STAR:
|
|
case RegexLexer::ESCAPED_PLUS:
|
|
case RegexLexer::ESCAPED_PIPE:
|
|
case RegexLexer::ESCAPED_LPAREN:
|
|
case RegexLexer::ESCAPED_RPAREN:
|
|
case RegexLexer::ESCAPED_LBRACKET:
|
|
case RegexLexer::ESCAPED_RBRACKET:
|
|
case RegexLexer::ESCAPED_QUESTIONMARK:
|
|
case RegexLexer::ESCAPED_DOT:
|
|
stk.push(std::make_shared<SingleRE>(tok.content[1]));
|
|
break;
|
|
|
|
case RegexLexer::DOT:
|
|
stk.push(dotChar());
|
|
break;
|
|
|
|
case RegexLexer::STAR:
|
|
if (stk.empty())
|
|
throw SyntaxError(lex, "Cannot apply kleene star to empty regex");
|
|
n = std::make_shared<StarRE>(stk.top());
|
|
stk.pop();
|
|
stk.push(n);
|
|
break;
|
|
|
|
case RegexLexer::PLUS:
|
|
if (stk.empty())
|
|
throw SyntaxError(lex, "Cannot apply kleene plus to empty regex");
|
|
n = stk.top();
|
|
stk.pop();
|
|
n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n));
|
|
stk.push(n);
|
|
break;
|
|
|
|
case RegexLexer::QUESTIONMARK:
|
|
if (stk.empty())
|
|
throw SyntaxError(lex, "Cannot apply '?' to empty regex");
|
|
n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>());
|
|
stk.pop();
|
|
stk.push(n);
|
|
break;
|
|
|
|
case RegexLexer::PIPE:
|
|
if (stk.empty())
|
|
throw SyntaxError(lex, "Invalid regex: nothing to the left of '|'");
|
|
if (stk.size() > 1)
|
|
compactStack(stk), compress(stk);
|
|
n = std::make_shared<PlusRE>(stk.top(), parseRE(lex, exit_by_closed_paren, inside_parens));
|
|
stk.pop();
|
|
stk.push(n);
|
|
if (exit_by_closed_paren) {
|
|
if (stk.size() == 1)
|
|
return stk.top();
|
|
else if (stk.size() == 2)
|
|
return compress(stk), stk.top();
|
|
else
|
|
throw SyntaxError(lex, "Invalid regex");
|
|
}
|
|
break;
|
|
|
|
case RegexLexer::LPAREN:
|
|
n = parseRE(lex, exit_by_closed_paren, true);
|
|
if (!exit_by_closed_paren) {
|
|
throw SyntaxError(lex, "Unclosed parenthesis");
|
|
}
|
|
stk.push(n);
|
|
break;
|
|
|
|
case RegexLexer::RPAREN:
|
|
if (!inside_parens)
|
|
throw SyntaxError(lex, "Unopened parenthesis");
|
|
|
|
exit_by_closed_paren = true;
|
|
if (stk.size() == 1)
|
|
return stk.top();
|
|
else if (stk.size() == 2)
|
|
return compress(stk), stk.top();
|
|
throw SyntaxError(lex, "Could not parse regex, nothing inside parentheses");
|
|
|
|
case RegexLexer::CHAR:
|
|
stk.push(std::make_shared<SingleRE>(tok.content[0]));
|
|
break;
|
|
|
|
case RegexLexer::CHAR_CLASS:
|
|
stk.push(parseCharacterClass(tok.content));
|
|
break;
|
|
|
|
case RegexLexer::ERROR:
|
|
throw SyntaxError(lex, "Error on character: " + tok.content);
|
|
|
|
case RegexLexer::ignore: case RegexLexer::nonmatching:
|
|
//Just ignore these
|
|
break;
|
|
}
|
|
compactStack(stk);
|
|
}
|
|
} catch (RegexLexer::NoMoreTokens& err) {
|
|
if (stk.size() == 1)
|
|
return stk.top();
|
|
else if (stk.size() == 2)
|
|
return compress(stk), stk.top();
|
|
throw SyntaxError(lex, "Could not parse regex");
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
std::shared_ptr<RE> parseRE(const string& input)
|
|
{
|
|
std::istringstream inputstream(input);
|
|
RegexLexer lex(inputstream);
|
|
bool exit_by_closed_paren = false;
|
|
std::shared_ptr<RE> res = parseRE(lex, exit_by_closed_paren);
|
|
return res;
|
|
}
|
|
}
|