WIP
This commit is contained in:
parent
3fd362bf2d
commit
7f9c7aed44
|
@ -10,6 +10,7 @@ add_library(lxs
|
||||||
backend.cpp
|
backend.cpp
|
||||||
backendmanager.cpp
|
backendmanager.cpp
|
||||||
driver.cpp
|
driver.cpp
|
||||||
|
RegexLexer.cpp
|
||||||
re.cpp
|
re.cpp
|
||||||
inputparser.cpp
|
inputparser.cpp
|
||||||
template.cpp
|
template.cpp
|
||||||
|
|
|
@ -0,0 +1,166 @@
|
||||||
|
#include "RegexLexer.h"
|
||||||
|
|
||||||
|
#include <sstream>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
namespace { //The automaton data
|
||||||
|
typedef std::size_t State;
|
||||||
|
|
||||||
|
State REJECT = 39;
|
||||||
|
|
||||||
|
unsigned char TRANS_IDX[256] = { (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)1, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)2, (unsigned char)3, (unsigned char)4, (unsigned char)5, (unsigned char)0, (unsigned char)6, (unsigned char)7, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)8, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)9, (unsigned char)10, (unsigned char)11, (unsigned char)12, (unsigned char)0, (unsigned char)0, (unsigned char)13, (unsigned char)14, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)15, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned 
char)0, (unsigned char)0, (unsigned char)16, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)17, (unsigned char)18, (unsigned char)19, (unsigned char)0, (unsigned char)20, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)21, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, 
(unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, };
|
||||||
|
|
||||||
|
State TABLE[40 - 1][22] = {
|
||||||
|
{ 37, 16, 35, 36, 31, 32, 37, 30, 33, 1, 11, 38, 37, 37, 37, 37, 37, 37, 37, 37, 37, 34, },
|
||||||
|
|
||||||
|
{ 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 4, 2, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
|
||||||
|
|
||||||
|
{ 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 3, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
|
||||||
|
|
||||||
|
{ 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
|
||||||
|
|
||||||
|
{ 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
|
||||||
|
|
||||||
|
{ 6, 6, 6, 6, 6, 6, 39, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
|
||||||
|
|
||||||
|
{ 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
|
||||||
|
|
||||||
|
{ 8, 8, 8, 8, 8, 8, 39, 8, 8, 8, 8, 10, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, },
|
||||||
|
|
||||||
|
{ 6, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 10, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 10, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 24, 25, 21, 22, 39, 29, 28, 26, 20, 27, 39, 17, 15, 18, 13, 14, 16, 12, 19, 23, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
|
||||||
|
{ 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, },
|
||||||
|
};
|
||||||
|
|
||||||
|
RegexLexer::TokenType TOKENS[40] = { RegexLexer::nonmatching, RegexLexer::ERROR, RegexLexer::nonmatching, RegexLexer::CHAR_CLASS, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::CHAR_CLASS, RegexLexer::ERROR, RegexLexer::TAB, RegexLexer::NEWLINE, RegexLexer::CARRIAGE_RETURN, RegexLexer::BACKSPACE, RegexLexer::SPACE, RegexLexer::BELL, RegexLexer::FORMFEED, RegexLexer::VTAB, RegexLexer::BACKSLASH, RegexLexer::ESCAPED_STAR, RegexLexer::ESCAPED_PLUS, RegexLexer::ESCAPED_PIPE, RegexLexer::ESCAPED_LPAREN, RegexLexer::ESCAPED_RPAREN, RegexLexer::ESCAPED_LBRACKET, RegexLexer::ESCAPED_RBRACKET, RegexLexer::ESCAPED_QUESTIONMARK, RegexLexer::ESCAPED_DOT, RegexLexer::DOT, RegexLexer::STAR, RegexLexer::PLUS, RegexLexer::QUESTIONMARK, RegexLexer::PIPE, RegexLexer::LPAREN, RegexLexer::RPAREN, RegexLexer::CHAR, RegexLexer::ERROR, RegexLexer::nonmatching, };
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Build a lexer on top of `in`.
 *
 * The byte offset starts at zero and the stream is held by reference.
 *
 * @param in The stream the tokens are read from
 */
RegexLexer::RegexLexer(std::istream& in)
    : m_offset(0),
      m_input(in)
{}
|
||||||
|
|
||||||
|
/**
 * Nothing to release: the destructor body is empty.
 */
RegexLexer::~RegexLexer() = default;
|
||||||
|
|
||||||
|
/**
 * Scan the next token from the input stream using the automaton tables.
 *
 * Characters are fed through TRANS_IDX/TABLE until the automaton reaches
 * REJECT or no further character can be read. The longest prefix that ended
 * in an accepting state becomes the token; every character read beyond that
 * prefix is put back on the stream and the byte offset is rewound to match.
 * Tokens whose type is `ignore` are dropped and scanning starts over.
 *
 * @return The matched token (its type and the matched text)
 * @throws NoMoreTokens if there is no input left to read
 * @throws NoMatch if the upcoming input does not start with any token
 */
RegexLexer::Token RegexLexer::nextToken() {
    typedef std::istream::traits_type Traits;

    TokenType type = ignore;
    std::string token;

    while (type == ignore) {
        State state = 0;
        std::size_t match_length = 0;
        token.clear();

        while (!m_input.eof() && state != REJECT) {
            // Compare the untruncated peek() result with eof(): relying on the
            // eofbit alone would turn the sentinel returned by a failed stream
            // into a bogus '\xff' character that lexes as a CHAR token forever.
            const Traits::int_type next = m_input.peek();
            if (Traits::eq_int_type(next, Traits::eof()))
                break;

            const char c = Traits::to_char_type(next);
            token += c;

            state = TABLE[state][TRANS_IDX[static_cast<unsigned char>(c)]];
            if (TOKENS[state]) {
                // Remember the longest accepting prefix seen so far.
                match_length = token.length();
                type = TOKENS[state];
            }
            m_input.get();
            ++m_offset;
        }

        // Hand the lookahead past the longest match back to the stream,
        // last character first so the original order is restored.
        const std::size_t sdiff = token.length() - match_length;
        for (std::size_t i = 0; i < sdiff; i++) {
            m_input.putback(token[token.length() - i - 1]);
        }
        m_offset -= sdiff;

        if (!type || !match_length) {
            // An empty `token` means not a single character could be read.
            if (token.empty() || m_input.eof())
                throw NoMoreTokens();
            throw NoMatch();
        }

        token.resize(match_length);
    }

    Token t;
    t.type = type;
    t.content = token;
    return t;
}
|
||||||
|
|
||||||
|
void RegexLexer::skip(std::size_t n) {
|
||||||
|
for (size_t i = 0; i < n; i++) {
|
||||||
|
m_input.get();
|
||||||
|
++m_offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
char RegexLexer::peek() {
|
||||||
|
if (m_input.eof())
|
||||||
|
throw NoMoreTokens();
|
||||||
|
return m_input.peek();
|
||||||
|
}
|
||||||
|
|
||||||
|
std::size_t RegexLexer::getByteOffset() {
|
||||||
|
return m_offset;
|
||||||
|
}
|
|
@ -0,0 +1,89 @@
|
||||||
|
#pragma once
|
||||||
|
#ifndef LEXER_RegexLexer_H
|
||||||
|
#define LEXER_RegexLexer_H
|
||||||
|
|
||||||
|
#include <exception>
|
||||||
|
#include <istream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
/**
 * Lexer that splits the text of a regular expression into tokens, reading
 * from a caller-supplied input stream.
 */
class RegexLexer {
    public:
        /// Thrown when the input stream has nothing left to tokenize.
        struct NoMoreTokens : std::exception {};

        /// Thrown when the upcoming input matches no token.
        struct NoMatch : std::exception {};

        /**
         * All token kinds the lexer knows about.
         *
         * `nonmatching` has to stay first (value 0): the scanner tests token
         * types for truthiness. Tokens of type `ignore` are skipped by
         * nextToken().
         */
        enum TokenType {
            nonmatching,
            BACKSLASH, BACKSPACE, BELL, CARRIAGE_RETURN,
            CHAR, CHAR_CLASS, DOT, ERROR,
            ESCAPED_DOT, ESCAPED_LBRACKET, ESCAPED_LPAREN, ESCAPED_PIPE,
            ESCAPED_PLUS, ESCAPED_QUESTIONMARK, ESCAPED_RBRACKET,
            ESCAPED_RPAREN, ESCAPED_STAR,
            FORMFEED, LPAREN, NEWLINE, PIPE, PLUS, QUESTIONMARK, RPAREN,
            SPACE, STAR, TAB, VTAB,
            ignore,
        };

        /// A single lexed token: its kind plus the text that was matched.
        struct Token {
            TokenType type;
            std::string content;
        };

        /// Lexers cannot be copy-constructed.
        RegexLexer(const RegexLexer&) = delete;

        /**
         * @param in The stream to read the tokens from
         */
        RegexLexer(std::istream& in);

        ~RegexLexer();

        /**
         * Fetch the next token from the input.
         *
         * @throws NoMoreTokens when the input holds no further tokens
         * @throws NoMatch when nothing matches the upcoming input
         */
        Token nextToken();

        /**
         * Advance past the next `n` bytes of input.
         *
         * @param n How many bytes to skip
         */
        void skip(std::size_t n);

        /**
         * Inspect the character at the head of the input stream, e.g. to
         * report which character caused a mismatch.
         *
         * @throws NoMoreTokens when the input stream has ended
         */
        char peek();

        /**
         * @return The current byte offset
         */
        std::size_t getByteOffset();

    private:
        std::size_t m_offset;   ///< Current byte offset
        std::istream& m_input;  ///< Stream the tokens are read from
};
|
||||||
|
|
||||||
|
#endif //LEXER_RegexLexer_H
|
315
src/re.cpp
315
src/re.cpp
|
@ -1,7 +1,9 @@
|
||||||
#include "Lexesis/re.h"
|
#include "Lexesis/re.h"
|
||||||
|
#include "RegexLexer.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
#include <stack>
|
#include <stack>
|
||||||
using namespace std;
|
using namespace std;
|
||||||
|
|
||||||
|
@ -142,110 +144,46 @@ namespace lxs {
|
||||||
stk.push(tp);
|
stk.push(tp);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Get the actual char that should be used when c is placed after a backslash
|
|
||||||
*/
|
|
||||||
char parseEscapeChar(char c) {
|
|
||||||
switch (c)
|
|
||||||
{
|
|
||||||
case '\\':
|
|
||||||
case '*':
|
|
||||||
case '+':
|
|
||||||
case '|':
|
|
||||||
case '(':
|
|
||||||
case ')':
|
|
||||||
case '[':
|
|
||||||
case ']':
|
|
||||||
case '?':
|
|
||||||
case '.':
|
|
||||||
break;
|
|
||||||
case 'n':
|
|
||||||
c = '\n'; break;
|
|
||||||
case 'r':
|
|
||||||
c = '\r'; break;
|
|
||||||
case 'b':
|
|
||||||
c = '\b'; break;
|
|
||||||
case 't':
|
|
||||||
c = '\t'; break;
|
|
||||||
case 's':
|
|
||||||
c = ' '; break;
|
|
||||||
case 'a':
|
|
||||||
c = '\a'; break;
|
|
||||||
case 'f':
|
|
||||||
c = '\f'; break;
|
|
||||||
case 'v':
|
|
||||||
c = '\v'; break;
|
|
||||||
default:
|
|
||||||
throw SyntaxError(("Invalid escape sequence: \\" + std::string(1, c)).c_str());
|
|
||||||
}
|
|
||||||
return c;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse a character class
|
* Parse a character class
|
||||||
*/
|
*/
|
||||||
std::shared_ptr<RE> parseCharacterClass(const string& input, size_t& idx) {
|
std::shared_ptr<RE> parseCharacterClass(const string& input) {
|
||||||
if (idx >= input.size())
|
|
||||||
throw SyntaxError("Unclosed character class");
|
|
||||||
std::set<char> used_chars;
|
std::set<char> used_chars;
|
||||||
|
|
||||||
bool invert = false;
|
bool invert = false;
|
||||||
int last_char = -1;
|
std::size_t start = 1;
|
||||||
|
std::size_t end = input.size() - 1;
|
||||||
|
|
||||||
if (input[idx] == '^')
|
if (input[1] == '^') {
|
||||||
{
|
|
||||||
invert = true;
|
invert = true;
|
||||||
idx++;
|
start = 2;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (idx >= input.size())
|
if (input[start] == ']') {
|
||||||
throw SyntaxError("Unclosed character class");
|
|
||||||
|
|
||||||
|
|
||||||
if (input[idx] == ']')
|
|
||||||
{
|
|
||||||
used_chars.insert(']');
|
used_chars.insert(']');
|
||||||
idx++;
|
++start;
|
||||||
last_char = ']';
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (idx >= input.size())
|
|
||||||
throw SyntaxError("Unclosed character class");
|
|
||||||
|
|
||||||
if (input[idx] == '-')
|
if (input[start] == '-') {
|
||||||
{
|
|
||||||
used_chars.insert('-');
|
used_chars.insert('-');
|
||||||
idx++;
|
++start;
|
||||||
last_char = '-';
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (idx >= input.size())
|
if (input[end] == '-') {
|
||||||
throw SyntaxError("Unclosed character class");
|
used_chars.insert('-');
|
||||||
|
--end;
|
||||||
|
}
|
||||||
|
|
||||||
for (; idx < input.size() && input[idx] != ']'; idx++)
|
int last_char = -1;
|
||||||
|
for (std::size_t idx = start; idx < end; idx++)
|
||||||
{
|
{
|
||||||
if (input[idx] == '-')
|
if (input[idx] == '-')
|
||||||
{
|
{
|
||||||
idx++;
|
idx++;
|
||||||
|
for (int i = last_char + 1; i <= input[idx]; i++) {
|
||||||
if (idx >= input.size())
|
used_chars.insert((char) i);
|
||||||
throw SyntaxError("Unclosed character class");
|
|
||||||
|
|
||||||
if (input[idx] == ']')
|
|
||||||
{
|
|
||||||
used_chars.insert('-');
|
|
||||||
idx--;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
{
|
|
||||||
if (last_char == -1)
|
|
||||||
throw SyntaxError("Nothing to apply range to");
|
|
||||||
for (int i = last_char + 1; i <= input[idx]; i++)
|
|
||||||
{
|
|
||||||
used_chars.insert((char) i);
|
|
||||||
}
|
|
||||||
last_char = -1;
|
|
||||||
}
|
}
|
||||||
|
last_char = -1;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -253,13 +191,9 @@ namespace lxs {
|
||||||
last_char = input[idx];
|
last_char = input[idx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (idx >= input.size())
|
|
||||||
throw SyntaxError("Unclosed character class");
|
|
||||||
|
|
||||||
std::vector<char> chars;
|
std::vector<char> chars;
|
||||||
for (int i = 0; i < 256; i++)
|
for (int i = 0; i < 256; i++) {
|
||||||
{
|
|
||||||
if (invert ^ (used_chars.count((char) i) > 0))
|
if (invert ^ (used_chars.count((char) i) > 0))
|
||||||
chars.push_back((char) i);
|
chars.push_back((char) i);
|
||||||
}
|
}
|
||||||
|
@ -281,104 +215,155 @@ namespace lxs {
|
||||||
/**
|
/**
|
||||||
* Parse the actual regex
|
* Parse the actual regex
|
||||||
*/
|
*/
|
||||||
std::shared_ptr<RE> parseRE(const string& input, size_t& idx)
|
std::shared_ptr<RE> parseRE(RegexLexer& lex, bool& exit_by_closed_paren, bool inside_parens=false)
|
||||||
{
|
{
|
||||||
stack<std::shared_ptr<RE> > stk;
|
stack<std::shared_ptr<RE> > stk;
|
||||||
for (; idx < input.length(); idx++)
|
//TODO: report location in regex on error
|
||||||
{
|
|
||||||
std::shared_ptr<RE> n;
|
|
||||||
switch (input[idx])
|
|
||||||
{
|
|
||||||
case '\\':
|
|
||||||
idx++;
|
|
||||||
if (idx >= input.length())
|
|
||||||
throw SyntaxError("Escape sequence at the end of the string");
|
|
||||||
else
|
|
||||||
stk.push(std::make_shared<SingleRE>(parseEscapeChar(input[idx])));
|
|
||||||
break;
|
|
||||||
|
|
||||||
case '[':
|
try {
|
||||||
stk.push(parseCharacterClass(input, ++idx));
|
while (true) {
|
||||||
break;
|
exit_by_closed_paren = false;
|
||||||
|
RegexLexer::Token tok = lex.nextToken();
|
||||||
|
std::shared_ptr<RE> n;
|
||||||
|
switch (tok.type) {
|
||||||
|
case RegexLexer::TAB:
|
||||||
|
stk.push(std::make_shared<SingleRE>('\t'));
|
||||||
|
break;
|
||||||
|
case RegexLexer::NEWLINE:
|
||||||
|
stk.push(std::make_shared<SingleRE>('\n'));
|
||||||
|
break;
|
||||||
|
case RegexLexer::CARRIAGE_RETURN:
|
||||||
|
stk.push(std::make_shared<SingleRE>('\r'));
|
||||||
|
break;
|
||||||
|
case RegexLexer::BACKSPACE:
|
||||||
|
stk.push(std::make_shared<SingleRE>('\b'));
|
||||||
|
break;
|
||||||
|
case RegexLexer::SPACE:
|
||||||
|
stk.push(std::make_shared<SingleRE>(' '));
|
||||||
|
break;
|
||||||
|
case RegexLexer::BELL:
|
||||||
|
stk.push(std::make_shared<SingleRE>('\a'));
|
||||||
|
break;
|
||||||
|
case RegexLexer::FORMFEED:
|
||||||
|
stk.push(std::make_shared<SingleRE>('\f'));
|
||||||
|
break;
|
||||||
|
case RegexLexer::VTAB:
|
||||||
|
stk.push(std::make_shared<SingleRE>('\v'));
|
||||||
|
break;
|
||||||
|
|
||||||
case '.':
|
case RegexLexer::BACKSLASH:
|
||||||
stk.push(dotChar());
|
case RegexLexer::ESCAPED_STAR:
|
||||||
break;
|
case RegexLexer::ESCAPED_PLUS:
|
||||||
|
case RegexLexer::ESCAPED_PIPE:
|
||||||
|
case RegexLexer::ESCAPED_LPAREN:
|
||||||
|
case RegexLexer::ESCAPED_RPAREN:
|
||||||
|
case RegexLexer::ESCAPED_LBRACKET:
|
||||||
|
case RegexLexer::ESCAPED_RBRACKET:
|
||||||
|
case RegexLexer::ESCAPED_QUESTIONMARK:
|
||||||
|
case RegexLexer::ESCAPED_DOT:
|
||||||
|
stk.push(std::make_shared<SingleRE>(tok.content[1]));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case RegexLexer::DOT:
|
||||||
|
stk.push(dotChar());
|
||||||
|
break;
|
||||||
|
|
||||||
case ']':
|
case RegexLexer::STAR:
|
||||||
throw SyntaxError("Unopened ']'");
|
if (stk.empty())
|
||||||
break;
|
throw SyntaxError("Cannot apply kleene star to empty regex");
|
||||||
|
n = std::make_shared<StarRE>(stk.top());
|
||||||
|
stk.pop();
|
||||||
|
stk.push(n);
|
||||||
|
break;
|
||||||
|
|
||||||
case '*':
|
case RegexLexer::PLUS:
|
||||||
if (stk.empty())
|
if (stk.empty())
|
||||||
throw SyntaxError("Cannot apply kleene star to empty regex");
|
throw SyntaxError("Cannot apply kleene plus to empty regex");
|
||||||
n = std::make_shared<StarRE>(stk.top());
|
n = stk.top();
|
||||||
stk.pop();
|
stk.pop();
|
||||||
stk.push(n);
|
n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n));
|
||||||
break;
|
stk.push(n);
|
||||||
|
break;
|
||||||
|
|
||||||
case '+':
|
case RegexLexer::QUESTIONMARK:
|
||||||
if (stk.empty())
|
if (stk.empty())
|
||||||
throw SyntaxError("Cannot apply kleene plus to empty regex");
|
throw SyntaxError("Cannot apply '?' to empty regex");
|
||||||
n = stk.top();
|
n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>());
|
||||||
stk.pop();
|
stk.pop();
|
||||||
n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n));
|
stk.push(n);
|
||||||
stk.push(n);
|
break;
|
||||||
break;
|
|
||||||
|
|
||||||
case '?':
|
case RegexLexer::PIPE:
|
||||||
if (stk.empty())
|
if (stk.empty())
|
||||||
throw SyntaxError("Cannot apply '?' to empty regex");
|
throw SyntaxError("Invalid regex: nothing to the left of '|'");
|
||||||
n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>());
|
if (stk.size() > 1)
|
||||||
stk.pop();
|
compactStack(stk), compress(stk);
|
||||||
stk.push(n);
|
n = std::make_shared<PlusRE>(stk.top(), parseRE(lex, exit_by_closed_paren, inside_parens));
|
||||||
break;
|
stk.pop();
|
||||||
|
stk.push(n);
|
||||||
|
if (exit_by_closed_paren) {
|
||||||
|
if (stk.size() == 1)
|
||||||
|
return stk.top();
|
||||||
|
else if (stk.size() == 2)
|
||||||
|
return compress(stk), stk.top();
|
||||||
|
else
|
||||||
|
throw SyntaxError("Invalid regex");
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
case '|':
|
case RegexLexer::LPAREN:
|
||||||
if (stk.empty())
|
n = parseRE(lex, exit_by_closed_paren, true);
|
||||||
throw SyntaxError("Invalid regex: nothing to the left of '|'");
|
if (!exit_by_closed_paren) {
|
||||||
if (stk.size() > 1)
|
throw SyntaxError("Unclosed parenthesis");
|
||||||
compactStack(stk), compress(stk);
|
}
|
||||||
n = std::make_shared<PlusRE>(stk.top(), parseRE(input, ++idx));
|
stk.push(n);
|
||||||
stk.pop();
|
break;
|
||||||
stk.push(n);
|
|
||||||
idx--;
|
|
||||||
break;
|
|
||||||
|
|
||||||
case '(':
|
case RegexLexer::RPAREN:
|
||||||
n = parseRE(input, ++idx);
|
if (!inside_parens)
|
||||||
if (idx >= input.size() || input[idx] != ')')
|
throw SyntaxError("Unopened parenthesis");
|
||||||
throw SyntaxError("Could not parse regex, unclosed parentheses");
|
|
||||||
stk.push(n);
|
|
||||||
break;
|
|
||||||
|
|
||||||
case ')':
|
exit_by_closed_paren = true;
|
||||||
if (stk.size() == 1)
|
if (stk.size() == 1)
|
||||||
return stk.top();
|
return stk.top();
|
||||||
else if (stk.size() == 2)
|
else if (stk.size() == 2)
|
||||||
return compress(stk), stk.top();
|
return compress(stk), stk.top();
|
||||||
throw SyntaxError("Could not parse regex, nothing inside parentheses");
|
throw SyntaxError("Could not parse regex, nothing inside parentheses");
|
||||||
|
|
||||||
default:
|
case RegexLexer::CHAR:
|
||||||
stk.push(std::make_shared<SingleRE>(input[idx]));
|
stk.push(std::make_shared<SingleRE>(tok.content[0]));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case RegexLexer::CHAR_CLASS:
|
||||||
|
stk.push(parseCharacterClass(tok.content));
|
||||||
|
break;
|
||||||
|
|
||||||
|
case RegexLexer::ERROR:
|
||||||
|
throw SyntaxError(("Error on character: " + tok.content).c_str());
|
||||||
|
|
||||||
|
case RegexLexer::ignore: case RegexLexer::nonmatching:
|
||||||
|
//Just ignore these
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
compactStack(stk);
|
||||||
}
|
}
|
||||||
compactStack(stk);
|
} catch (RegexLexer::NoMoreTokens& err) {
|
||||||
|
if (stk.size() == 1)
|
||||||
|
return stk.top();
|
||||||
|
else if (stk.size() == 2)
|
||||||
|
return compress(stk), stk.top();
|
||||||
|
throw SyntaxError("Could not parse regex");
|
||||||
}
|
}
|
||||||
if (stk.size() == 1)
|
|
||||||
return stk.top();
|
|
||||||
else if (stk.size() == 2)
|
|
||||||
return compress(stk), stk.top();
|
|
||||||
throw SyntaxError("Could not parse regex");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
std::shared_ptr<RE> parseRE(const string& input)
|
std::shared_ptr<RE> parseRE(const string& input)
|
||||||
{
|
{
|
||||||
size_t i = 0;
|
std::istringstream inputstream(input);
|
||||||
std::shared_ptr<RE> res = parseRE(input, i);
|
RegexLexer lex(inputstream);
|
||||||
if (i < input.length() - 1)
|
bool exit_by_closed_paren = false;
|
||||||
throw SyntaxError("Incorrect regex");
|
std::shared_ptr<RE> res = parseRE(lex, exit_by_closed_paren);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue