This commit is contained in:
Robin Jadoul 2016-05-29 22:47:10 +02:00
parent 3fd362bf2d
commit 7f9c7aed44
4 changed files with 406 additions and 165 deletions

View File

@ -10,6 +10,7 @@ add_library(lxs
backend.cpp backend.cpp
backendmanager.cpp backendmanager.cpp
driver.cpp driver.cpp
RegexLexer.cpp
re.cpp re.cpp
inputparser.cpp inputparser.cpp
template.cpp template.cpp

166
src/RegexLexer.cpp Normal file
View File

@ -0,0 +1,166 @@
#include "RegexLexer.h"
#include <sstream>
#include <iostream>
namespace { //The automaton data
// Index of a state of the lexer automaton (used as row index into TABLE and as index into TOKENS).
typedef std::size_t State;
// Sentinel state: the scanning loop stops as soon as it is reached. It has no row in TABLE.
// Declared const because it is a fixed property of the generated automaton and must never be reassigned.
const State REJECT = 39;
unsigned char TRANS_IDX[256] = { (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)1, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)2, (unsigned char)3, (unsigned char)4, (unsigned char)5, (unsigned char)0, (unsigned char)6, (unsigned char)7, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)8, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)9, (unsigned char)10, (unsigned char)11, (unsigned char)12, (unsigned char)0, (unsigned char)0, (unsigned char)13, (unsigned char)14, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)15, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned 
char)0, (unsigned char)0, (unsigned char)16, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)17, (unsigned char)18, (unsigned char)19, (unsigned char)0, (unsigned char)20, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)21, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, 
(unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, };
// Transition table of the automaton: TABLE[state][column] is the next state,
// where column is TRANS_IDX[byte]. Scanning starts in state 0.
// There are only 40 - 1 rows: the value 39 (REJECT) appears as a target but has no row of its own,
// so a state equal to REJECT must never be used as a row index (nextToken() stops before doing so).
State TABLE[40 - 1][22] = {
{ 37, 16, 35, 36, 31, 32, 37, 30, 33, 1, 11, 38, 37, 37, 37, 37, 37, 37, 37, 37, 37, 34, },
{ 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 4, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
{ 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
{ 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
{ 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
{ 6, 6, 6, 6, 6, 6, 39, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
{ 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
{ 8, 8, 8, 8, 8, 8, 39, 8, 8, 8, 8, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, },
{ 6, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 10, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 24, 25, 21, 22, 39, 29, 28, 26, 20, 27, 39, 17, 15, 18, 13, 14, 16, 12, 19, 23, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
};
// Token type recognised when the automaton is in a given state, indexed by state (0-39, REJECT included).
// RegexLexer::nonmatching marks states in which nothing is accepted; nextToken() tests the entry for
// being non-zero to decide whether the text read so far is a match.
RegexLexer::TokenType TOKENS[40] = { RegexLexer::nonmatching, RegexLexer::ERROR, RegexLexer::nonmatching, RegexLexer::CHAR_CLASS, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::CHAR_CLASS, RegexLexer::ERROR, RegexLexer::TAB, RegexLexer::NEWLINE, RegexLexer::CARRIAGE_RETURN, RegexLexer::BACKSPACE, RegexLexer::SPACE, RegexLexer::BELL, RegexLexer::FORMFEED, RegexLexer::VTAB, RegexLexer::BACKSLASH, RegexLexer::ESCAPED_STAR, RegexLexer::ESCAPED_PLUS, RegexLexer::ESCAPED_PIPE, RegexLexer::ESCAPED_LPAREN, RegexLexer::ESCAPED_RPAREN, RegexLexer::ESCAPED_LBRACKET, RegexLexer::ESCAPED_RBRACKET, RegexLexer::ESCAPED_QUESTIONMARK, RegexLexer::ESCAPED_DOT, RegexLexer::DOT, RegexLexer::STAR, RegexLexer::PLUS, RegexLexer::QUESTIONMARK, RegexLexer::PIPE, RegexLexer::LPAREN, RegexLexer::RPAREN, RegexLexer::CHAR, RegexLexer::ERROR, RegexLexer::nonmatching, };
}
/**
 * Create a lexer that reads from `in`, with the byte offset starting at 0.
 *
 * @param in The stream to read from; it is kept by reference
 */
RegexLexer::RegexLexer(std::istream& in)
    : m_offset(0)
    , m_input(in)
{}
/**
 * Nothing to release: the lexer only holds a reference to the stream and a counter.
 */
RegexLexer::~RegexLexer() = default;
/**
 * Produce the next token from the input stream.
 *
 * The automaton is run from state 0 over the upcoming bytes while remembering
 * the longest prefix that ended in a state with a token type. Every byte read
 * beyond that prefix is given back to the stream afterwards.
 *
 * @return the recognised token (its type and the matched text)
 * @throws NoMoreTokens if nothing was matched and the stream reports eof
 * @throws NoMatch if nothing was matched otherwise
 */
RegexLexer::Token RegexLexer::nextToken() {
    Token result;
    result.type = ignore;
    std::string& text = result.content;
    do {
        State current = 0;
        std::size_t accepted = 0; // length of the longest accepted prefix so far
        text.clear();
        for (;;) {
            if (m_input.eof() || current == REJECT)
                break;
            const char next = m_input.peek();
            if (m_input.eof()) // the peek ran into the end of the input
                break;
            text.push_back(next);
            current = TABLE[current][TRANS_IDX[static_cast<unsigned char>(next)]];
            const TokenType reached = TOKENS[current];
            if (reached != nonmatching) {
                accepted = text.length();
                result.type = reached;
            }
            m_input.get();
            ++m_offset;
        }

        // Hand back everything consumed after the accepted prefix, last byte first.
        const std::size_t surplus = text.length() - accepted;
        for (std::size_t pos = text.length(); pos > accepted; --pos)
            m_input.putback(text[pos - 1]);
        m_offset -= surplus;

        if (result.type == nonmatching || accepted == 0) {
            if (m_input.eof())
                throw NoMoreTokens();
            throw NoMatch();
        }
        text.erase(accepted);
    } while (result.type == ignore); // tokens of type `ignore` are dropped and scanning restarts
    return result;
}
/**
 * Skip the following `n` bytes of the input.
 *
 * The byte offset only advances for bytes that were actually read: if the
 * input ends before `n` bytes have been skipped, skipping stops there and the
 * offset stays at the real end of the input instead of drifting past it.
 *
 * @param n The number of bytes to skip
 */
void RegexLexer::skip(std::size_t n) {
    typedef std::istream::traits_type Traits;
    for (std::size_t skipped = 0; skipped < n; ++skipped) {
        // get() returns eof() when no byte could be extracted; further calls would fail as well
        if (Traits::eq_int_type(m_input.get(), Traits::eof()))
            break;
        ++m_offset;
    }
}
/**
 * Look at the next byte of the input without consuming it.
 *
 * @return the next byte of the input
 * @throws NoMoreTokens if the input stream is at an end
 */
char RegexLexer::peek() {
    typedef std::istream::traits_type Traits;
    if (m_input.eof())
        throw NoMoreTokens();
    // eof() only becomes true after a read has hit the end, so a stream that sits exactly at
    // its end is detected through the value returned by peek() itself.
    const Traits::int_type next = m_input.peek();
    if (Traits::eq_int_type(next, Traits::eof()))
        throw NoMoreTokens();
    return Traits::to_char_type(next);
}
std::size_t RegexLexer::getByteOffset() {
return m_offset;
}

89
src/RegexLexer.h Normal file
View File

@ -0,0 +1,89 @@
#pragma once
#ifndef LEXER_RegexLexer_H
#define LEXER_RegexLexer_H
#include <exception>
#include <istream>
#include <string>
/**
 * Lexer that reads bytes from an input stream and groups them into tokens
 * (see TokenType for the kinds of token it can report).
 */
class RegexLexer {
    public:
        /** Thrown when a token or byte is requested but the input is exhausted. */
        class NoMoreTokens : public std::exception {};

        /** Thrown when the upcoming input does not start with any token. */
        class NoMatch : public std::exception {};

        /**
         * The kinds of token. The numeric values follow the declaration order,
         * with `nonmatching` being 0, so the order must not be changed.
         */
        enum TokenType {
            nonmatching,
            BACKSLASH, BACKSPACE, BELL, CARRIAGE_RETURN,
            CHAR, CHAR_CLASS, DOT, ERROR,
            ESCAPED_DOT, ESCAPED_LBRACKET, ESCAPED_LPAREN, ESCAPED_PIPE,
            ESCAPED_PLUS, ESCAPED_QUESTIONMARK, ESCAPED_RBRACKET, ESCAPED_RPAREN,
            ESCAPED_STAR,
            FORMFEED, LPAREN, NEWLINE, PIPE, PLUS, QUESTIONMARK, RPAREN,
            SPACE, STAR, TAB, VTAB,
            ignore,
        };

        /** A token: its kind together with the text that was matched. */
        struct Token {
            TokenType type;
            std::string content;
        };

        /**
         * Create a lexer reading from `in`.
         *
         * @param in The stream to take the input from
         */
        RegexLexer(std::istream& in);

        /** Lexers cannot be copied. */
        RegexLexer(const RegexLexer&) = delete;

        ~RegexLexer();

        /**
         * Read the next token from the input.
         *
         * @throws NoMoreTokens when the input holds no further tokens
         * @throws NoMatch when the input matches no token
         */
        Token nextToken();

        /**
         * Advance the input by `n` bytes without producing tokens.
         *
         * @param n How many bytes to skip
         */
        void skip(std::size_t n);

        /**
         * Return the byte at the head of the input without consuming it,
         * e.g. to report the offending character after a mismatch.
         *
         * @throws NoMoreTokens when the input is at its end
         */
        char peek();

        /**
         * Return the current byte offset into the input.
         */
        std::size_t getByteOffset();

    private:
        std::size_t m_offset;   ///< Current byte offset
        std::istream& m_input;  ///< The stream the input is read from
};
#endif //LEXER_RegexLexer_H

View File

@ -1,7 +1,9 @@
#include "Lexesis/re.h" #include "Lexesis/re.h"
#include "RegexLexer.h"
#include <algorithm> #include <algorithm>
#include <iostream> #include <iostream>
#include <sstream>
#include <stack> #include <stack>
using namespace std; using namespace std;
@ -142,110 +144,46 @@ namespace lxs {
stk.push(tp); stk.push(tp);
} }
/**
* Get the actual char that should be used when c is placed after a backslash
*/
char parseEscapeChar(char c) {
switch (c)
{
case '\\':
case '*':
case '+':
case '|':
case '(':
case ')':
case '[':
case ']':
case '?':
case '.':
break;
case 'n':
c = '\n'; break;
case 'r':
c = '\r'; break;
case 'b':
c = '\b'; break;
case 't':
c = '\t'; break;
case 's':
c = ' '; break;
case 'a':
c = '\a'; break;
case 'f':
c = '\f'; break;
case 'v':
c = '\v'; break;
default:
throw SyntaxError(("Invalid escape sequence: \\" + std::string(1, c)).c_str());
}
return c;
}
/** /**
* Parse a character class * Parse a character class
*/ */
std::shared_ptr<RE> parseCharacterClass(const string& input, size_t& idx) { std::shared_ptr<RE> parseCharacterClass(const string& input) {
if (idx >= input.size())
throw SyntaxError("Unclosed character class");
std::set<char> used_chars; std::set<char> used_chars;
bool invert = false; bool invert = false;
int last_char = -1; std::size_t start = 1;
std::size_t end = input.size() - 1;
if (input[idx] == '^') if (input[1] == '^') {
{
invert = true; invert = true;
idx++; start = 2;
} }
if (idx >= input.size()) if (input[start] == ']') {
throw SyntaxError("Unclosed character class");
if (input[idx] == ']')
{
used_chars.insert(']'); used_chars.insert(']');
idx++; ++start;
last_char = ']';
} }
if (idx >= input.size())
throw SyntaxError("Unclosed character class");
if (input[idx] == '-') if (input[start] == '-') {
{
used_chars.insert('-'); used_chars.insert('-');
idx++; ++start;
last_char = '-';
} }
if (idx >= input.size()) if (input[end] == '-') {
throw SyntaxError("Unclosed character class"); used_chars.insert('-');
--end;
}
for (; idx < input.size() && input[idx] != ']'; idx++) int last_char = -1;
for (std::size_t idx = start; idx < end; idx++)
{ {
if (input[idx] == '-') if (input[idx] == '-')
{ {
idx++; idx++;
for (int i = last_char + 1; i <= input[idx]; i++) {
if (idx >= input.size()) used_chars.insert((char) i);
throw SyntaxError("Unclosed character class");
if (input[idx] == ']')
{
used_chars.insert('-');
idx--;
}
else
{
if (last_char == -1)
throw SyntaxError("Nothing to apply range to");
for (int i = last_char + 1; i <= input[idx]; i++)
{
used_chars.insert((char) i);
}
last_char = -1;
} }
last_char = -1;
} }
else else
{ {
@ -253,13 +191,9 @@ namespace lxs {
last_char = input[idx]; last_char = input[idx];
} }
} }
if (idx >= input.size())
throw SyntaxError("Unclosed character class");
std::vector<char> chars; std::vector<char> chars;
for (int i = 0; i < 256; i++) for (int i = 0; i < 256; i++) {
{
if (invert ^ (used_chars.count((char) i) > 0)) if (invert ^ (used_chars.count((char) i) > 0))
chars.push_back((char) i); chars.push_back((char) i);
} }
@ -281,104 +215,155 @@ namespace lxs {
/** /**
* Parse the actual regex * Parse the actual regex
*/ */
std::shared_ptr<RE> parseRE(const string& input, size_t& idx) std::shared_ptr<RE> parseRE(RegexLexer& lex, bool& exit_by_closed_paren, bool inside_parens=false)
{ {
stack<std::shared_ptr<RE> > stk; stack<std::shared_ptr<RE> > stk;
for (; idx < input.length(); idx++) //TODO: report location in regex on error
{
std::shared_ptr<RE> n;
switch (input[idx])
{
case '\\':
idx++;
if (idx >= input.length())
throw SyntaxError("Escape sequence at the end of the string");
else
stk.push(std::make_shared<SingleRE>(parseEscapeChar(input[idx])));
break;
case '[': try {
stk.push(parseCharacterClass(input, ++idx)); while (true) {
break; exit_by_closed_paren = false;
RegexLexer::Token tok = lex.nextToken();
std::shared_ptr<RE> n;
switch (tok.type) {
case RegexLexer::TAB:
stk.push(std::make_shared<SingleRE>('\t'));
break;
case RegexLexer::NEWLINE:
stk.push(std::make_shared<SingleRE>('\n'));
break;
case RegexLexer::CARRIAGE_RETURN:
stk.push(std::make_shared<SingleRE>('\r'));
break;
case RegexLexer::BACKSPACE:
stk.push(std::make_shared<SingleRE>('\b'));
break;
case RegexLexer::SPACE:
stk.push(std::make_shared<SingleRE>(' '));
break;
case RegexLexer::BELL:
stk.push(std::make_shared<SingleRE>('\a'));
break;
case RegexLexer::FORMFEED:
stk.push(std::make_shared<SingleRE>('\f'));
break;
case RegexLexer::VTAB:
stk.push(std::make_shared<SingleRE>('\v'));
break;
case '.': case RegexLexer::BACKSLASH:
stk.push(dotChar()); case RegexLexer::ESCAPED_STAR:
break; case RegexLexer::ESCAPED_PLUS:
case RegexLexer::ESCAPED_PIPE:
case RegexLexer::ESCAPED_LPAREN:
case RegexLexer::ESCAPED_RPAREN:
case RegexLexer::ESCAPED_LBRACKET:
case RegexLexer::ESCAPED_RBRACKET:
case RegexLexer::ESCAPED_QUESTIONMARK:
case RegexLexer::ESCAPED_DOT:
stk.push(std::make_shared<SingleRE>(tok.content[1]));
break;
case RegexLexer::DOT:
stk.push(dotChar());
break;
case ']': case RegexLexer::STAR:
throw SyntaxError("Unopened ']'"); if (stk.empty())
break; throw SyntaxError("Cannot apply kleene star to empty regex");
n = std::make_shared<StarRE>(stk.top());
stk.pop();
stk.push(n);
break;
case '*': case RegexLexer::PLUS:
if (stk.empty()) if (stk.empty())
throw SyntaxError("Cannot apply kleene star to empty regex"); throw SyntaxError("Cannot apply kleene plus to empty regex");
n = std::make_shared<StarRE>(stk.top()); n = stk.top();
stk.pop(); stk.pop();
stk.push(n); n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n));
break; stk.push(n);
break;
case '+': case RegexLexer::QUESTIONMARK:
if (stk.empty()) if (stk.empty())
throw SyntaxError("Cannot apply kleene plus to empty regex"); throw SyntaxError("Cannot apply '?' to empty regex");
n = stk.top(); n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>());
stk.pop(); stk.pop();
n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n)); stk.push(n);
stk.push(n); break;
break;
case '?': case RegexLexer::PIPE:
if (stk.empty()) if (stk.empty())
throw SyntaxError("Cannot apply '?' to empty regex"); throw SyntaxError("Invalid regex: nothing to the left of '|'");
n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>()); if (stk.size() > 1)
stk.pop(); compactStack(stk), compress(stk);
stk.push(n); n = std::make_shared<PlusRE>(stk.top(), parseRE(lex, exit_by_closed_paren, inside_parens));
break; stk.pop();
stk.push(n);
if (exit_by_closed_paren) {
if (stk.size() == 1)
return stk.top();
else if (stk.size() == 2)
return compress(stk), stk.top();
else
throw SyntaxError("Invalid regex");
}
break;
case '|': case RegexLexer::LPAREN:
if (stk.empty()) n = parseRE(lex, exit_by_closed_paren, true);
throw SyntaxError("Invalid regex: nothing to the left of '|'"); if (!exit_by_closed_paren) {
if (stk.size() > 1) throw SyntaxError("Unclosed parenthesis");
compactStack(stk), compress(stk); }
n = std::make_shared<PlusRE>(stk.top(), parseRE(input, ++idx)); stk.push(n);
stk.pop(); break;
stk.push(n);
idx--;
break;
case '(': case RegexLexer::RPAREN:
n = parseRE(input, ++idx); if (!inside_parens)
if (idx >= input.size() || input[idx] != ')') throw SyntaxError("Unopened parenthesis");
throw SyntaxError("Could not parse regex, unclosed parentheses");
stk.push(n);
break;
case ')': exit_by_closed_paren = true;
if (stk.size() == 1) if (stk.size() == 1)
return stk.top(); return stk.top();
else if (stk.size() == 2) else if (stk.size() == 2)
return compress(stk), stk.top(); return compress(stk), stk.top();
throw SyntaxError("Could not parse regex, nothing inside parentheses"); throw SyntaxError("Could not parse regex, nothing inside parentheses");
default: case RegexLexer::CHAR:
stk.push(std::make_shared<SingleRE>(input[idx])); stk.push(std::make_shared<SingleRE>(tok.content[0]));
break;
case RegexLexer::CHAR_CLASS:
stk.push(parseCharacterClass(tok.content));
break;
case RegexLexer::ERROR:
throw SyntaxError(("Error on character: " + tok.content).c_str());
case RegexLexer::ignore: case RegexLexer::nonmatching:
//Just ignore these
break;
}
compactStack(stk);
} }
compactStack(stk); } catch (RegexLexer::NoMoreTokens& err) {
if (stk.size() == 1)
return stk.top();
else if (stk.size() == 2)
return compress(stk), stk.top();
throw SyntaxError("Could not parse regex");
} }
if (stk.size() == 1)
return stk.top();
else if (stk.size() == 2)
return compress(stk), stk.top();
throw SyntaxError("Could not parse regex");
} }
} }
std::shared_ptr<RE> parseRE(const string& input) std::shared_ptr<RE> parseRE(const string& input)
{ {
size_t i = 0; std::istringstream inputstream(input);
std::shared_ptr<RE> res = parseRE(input, i); RegexLexer lex(inputstream);
if (i < input.length() - 1) bool exit_by_closed_paren = false;
throw SyntaxError("Incorrect regex"); std::shared_ptr<RE> res = parseRE(lex, exit_by_closed_paren);
return res; return res;
} }
} }