Merge branch 'bootstrapping'
This commit is contained in:
commit
e19e48f150
|
@ -3,6 +3,7 @@
|
|||
#define RE_H
|
||||
|
||||
#include "Lexesis/automata.h"
|
||||
#include "RegexLexer.h"
|
||||
|
||||
#include <memory>
|
||||
#include <stdexcept>
|
||||
|
@ -154,7 +155,7 @@ namespace lxs {
|
|||
class SyntaxError : public std::runtime_error
|
||||
{
|
||||
public:
|
||||
SyntaxError(const char* w) : std::runtime_error(w) {}
|
||||
SyntaxError(RegexLexer& lex, const std::string w) : std::runtime_error((std::to_string(lex.getByteOffset()) + ": " + w)) {}
|
||||
};
|
||||
|
||||
} //namespace lxs
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/config.h @ONLY)
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR})
|
||||
include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
|
||||
add_library(Lexesis-backends
|
||||
backends/cpp.cpp
|
||||
|
@ -10,6 +10,7 @@ add_library(lxs
|
|||
backend.cpp
|
||||
backendmanager.cpp
|
||||
driver.cpp
|
||||
RegexLexer.cpp
|
||||
re.cpp
|
||||
inputparser.cpp
|
||||
template.cpp
|
||||
|
|
|
@ -0,0 +1,162 @@
|
|||
#include "RegexLexer.h"
|
||||
|
||||
#include <sstream>
|
||||
#include <iostream>
|
||||
|
||||
namespace { //The automaton data
|
||||
typedef std::size_t State;
|
||||
|
||||
// Sentinel state meaning "no further match is possible". Its value equals the
// number of rows in TABLE (38 - 1), so it must never be used as a row index;
// nextToken() stops scanning as soon as this state is reached.
State REJECT = 37;
|
||||
|
||||
unsigned char TRANS_IDX[256] = { (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)1, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)2, (unsigned char)3, (unsigned char)4, (unsigned char)5, (unsigned char)0, (unsigned char)6, (unsigned char)7, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)8, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)9, (unsigned char)10, (unsigned char)11, (unsigned char)12, (unsigned char)0, (unsigned char)0, (unsigned char)13, (unsigned char)14, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)15, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned 
char)0, (unsigned char)0, (unsigned char)16, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)17, (unsigned char)18, (unsigned char)19, (unsigned char)0, (unsigned char)20, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)21, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, 
(unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, };
|
||||
|
||||
State TABLE[38 - 1][22] = {
|
||||
{ 35, 14, 33, 34, 29, 30, 35, 28, 31, 1, 9, 36, 35, 35, 35, 35, 35, 35, 35, 35, 35, 32, },
|
||||
|
||||
{ 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 3, 2, 5, 5, 5, 5, 5, 5, 5, 5, 5, },
|
||||
|
||||
{ 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, },
|
||||
|
||||
{ 5, 5, 5, 5, 5, 5, 4, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, },
|
||||
|
||||
{ 5, 5, 5, 5, 5, 5, 7, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, },
|
||||
|
||||
{ 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 5, 8, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, },
|
||||
|
||||
{ 4, 4, 4, 4, 4, 4, 37, 4, 4, 4, 4, 8, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 8, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 22, 23, 19, 20, 37, 27, 26, 24, 18, 25, 37, 15, 13, 16, 11, 12, 14, 10, 17, 21, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
|
||||
{ 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, 37, },
|
||||
};
|
||||
|
||||
// Token type accepted in each automaton state, indexed by state number (the
// last entry, index 37, belongs to REJECT). RegexLexer::nonmatching marks
// states that accept nothing; nextToken() tests these entries for truthiness,
// which relies on nonmatching being the zero value of the enum.
RegexLexer::TokenType TOKENS[38] = { RegexLexer::nonmatching, RegexLexer::ERROR, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::nonmatching, RegexLexer::CHAR_CLASS, RegexLexer::ERROR, RegexLexer::TAB, RegexLexer::NEWLINE, RegexLexer::CARRIAGE_RETURN, RegexLexer::BACKSPACE, RegexLexer::SPACE, RegexLexer::BELL, RegexLexer::FORMFEED, RegexLexer::VTAB, RegexLexer::BACKSLASH, RegexLexer::ESCAPED_STAR, RegexLexer::ESCAPED_PLUS, RegexLexer::ESCAPED_PIPE, RegexLexer::ESCAPED_LPAREN, RegexLexer::ESCAPED_RPAREN, RegexLexer::ESCAPED_LBRACKET, RegexLexer::ESCAPED_RBRACKET, RegexLexer::ESCAPED_QUESTIONMARK, RegexLexer::ESCAPED_DOT, RegexLexer::DOT, RegexLexer::STAR, RegexLexer::PLUS, RegexLexer::QUESTIONMARK, RegexLexer::PIPE, RegexLexer::LPAREN, RegexLexer::RPAREN, RegexLexer::CHAR, RegexLexer::ERROR, RegexLexer::nonmatching, };
|
||||
}
|
||||
|
||||
/**
 * Create a lexer reading from `in`.
 *
 * The stream is stored by reference and the byte offset starts at 0.
 */
RegexLexer::RegexLexer(std::istream& in)
    : m_offset(0)
    , m_input(in)
{}
|
||||
|
||||
/**
 * Nothing to clean up: the members are a counter and a reference to the stream.
 */
RegexLexer::~RegexLexer() = default;
|
||||
|
||||
/**
 * Get the next token from the input stream.
 *
 * The automaton is run for as long as it does not reject; the longest prefix
 * that ended in an accepting state becomes the token and every character read
 * beyond it is put back on the stream. Tokens of type `ignore` are dropped and
 * scanning restarts.
 *
 * @return The matched token (its type and the matched text)
 * @throws NoMoreTokens if nothing is left to read
 * @throws NoMatch if the input at the current position matches no token
 */
RegexLexer::Token RegexLexer::nextToken() {
    typedef std::istream::traits_type Traits;

    TokenType type = ignore;
    std::string token;

    while (type == ignore) {
        State state = 0;
        std::size_t match_length = 0;
        token = "";

        while (!m_input.eof() && state != REJECT) {
            // Test the value returned by peek() rather than only eofbit: a
            // stream that has failed without reaching eof keeps returning EOF,
            // which would otherwise be narrowed to '\xff' and lexed as a
            // character, producing bogus tokens forever.
            const Traits::int_type next = m_input.peek();
            if (Traits::eq_int_type(next, Traits::eof()))
                break;
            const char c = Traits::to_char_type(next);

            token += c;

            state = TABLE[state][TRANS_IDX[(unsigned char)c]];
            if (TOKENS[state])
            {
                // Remember the most recent accepting position
                match_length = token.length();
                type = TOKENS[state];
            }
            m_input.get();
            ++m_offset;
        }

        // Give back everything that was read past the last accepting position,
        // last character first, and rewind the offset accordingly
        std::size_t sdiff = token.length() - match_length;
        for (std::size_t i = 0; i < sdiff; i++)
        {
            m_input.putback(token[token.length() - i - 1]);
        }
        m_offset -= sdiff;

        if (!type || !match_length) {
            // Nothing could be read at all (end of input or unreadable stream)
            if (m_input.eof() || token.empty())
                throw NoMoreTokens();
            throw NoMatch();
        }

        token = token.substr(0, match_length);
    }

    Token t;
    t.type = type;
    t.content = token;
    return t;
}
|
||||
|
||||
void RegexLexer::skip(std::size_t n) {
|
||||
for (size_t i = 0; i < n; i++) {
|
||||
m_input.get();
|
||||
++m_offset;
|
||||
}
|
||||
}
|
||||
|
||||
char RegexLexer::peek() {
|
||||
if (m_input.eof())
|
||||
throw NoMoreTokens();
|
||||
return m_input.peek();
|
||||
}
|
||||
|
||||
std::size_t RegexLexer::getByteOffset() {
|
||||
return m_offset;
|
||||
}
|
|
@ -0,0 +1,87 @@
|
|||
#pragma once
|
||||
#ifndef LEXER_RegexLexer_H
|
||||
#define LEXER_RegexLexer_H
|
||||
|
||||
#include <exception>
|
||||
#include <istream>
|
||||
#include <string>
|
||||
|
||||
/// Lexer that splits a regular expression, read from an input stream, into tokens.
class RegexLexer {
    public:
        /// Thrown when no more tokens are available
        class NoMoreTokens : public std::exception {};

        /// Thrown when the input at the current position matches no token
        class NoMatch : public std::exception {};

        /// All token types. `nonmatching` is the first enumerator and therefore has value 0.
        enum TokenType {
            nonmatching,

            BACKSLASH, BACKSPACE, BELL,
            CARRIAGE_RETURN, CHAR, CHAR_CLASS,
            DOT,
            ERROR,
            ESCAPED_DOT, ESCAPED_LBRACKET, ESCAPED_LPAREN, ESCAPED_PIPE, ESCAPED_PLUS,
            ESCAPED_QUESTIONMARK, ESCAPED_RBRACKET, ESCAPED_RPAREN, ESCAPED_STAR,
            FORMFEED,
            LPAREN,
            NEWLINE,
            PIPE, PLUS,
            QUESTIONMARK,
            RPAREN,
            SPACE, STAR,
            TAB,
            VTAB,

            ignore,
        };

        /// A single token: its type together with the text that was matched
        struct Token {
            TokenType type;
            std::string content;
        };

        /// Create a lexer on top of `in`; the stream is kept by reference
        RegexLexer(std::istream& in);
        ~RegexLexer();

        /// Get the next token.
        ///
        /// @throws NoMoreTokens if no more tokens are available
        /// @throws NoMatch if no match was found
        Token nextToken();

        /// Skip the following `n` bytes.
        ///
        /// @param n The number of bytes to skip
        void skip(std::size_t n);

        /// Peek at the current head of the input stream, useful in error
        /// reporting when a character mismatches for example.
        ///
        /// @throws NoMoreTokens if the input stream is at an end
        char peek();

        /// Get the current byte offset
        std::size_t getByteOffset();

    private:
        std::size_t m_offset;   ///< Byte offset reported by getByteOffset()
        std::istream& m_input;  ///< The stream being read
};
|
||||
|
||||
#endif //LEXER_RegexLexer_H
|
324
src/re.cpp
324
src/re.cpp
|
@ -1,7 +1,9 @@
|
|||
#include "Lexesis/re.h"
|
||||
#include "RegexLexer.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <stack>
|
||||
using namespace std;
|
||||
|
||||
|
@ -142,127 +144,58 @@ namespace lxs {
|
|||
stk.push(tp);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the actual char that should be used when c is placed after a backslash
|
||||
*/
|
||||
char parseEscapeChar(char c) {
|
||||
switch (c)
|
||||
{
|
||||
case '\\':
|
||||
case '*':
|
||||
case '+':
|
||||
case '|':
|
||||
case '(':
|
||||
case ')':
|
||||
case '[':
|
||||
case ']':
|
||||
case '?':
|
||||
case '.':
|
||||
case '\'':
|
||||
case '"':
|
||||
case '-':
|
||||
break;
|
||||
case 'n':
|
||||
c = '\n'; break;
|
||||
case 'r':
|
||||
c = '\r'; break;
|
||||
case 'b':
|
||||
c = '\b'; break;
|
||||
case 't':
|
||||
c = '\t'; break;
|
||||
case 's':
|
||||
c = ' '; break;
|
||||
case 'a':
|
||||
c = '\a'; break;
|
||||
case 'f':
|
||||
c = '\f'; break;
|
||||
case 'v':
|
||||
c = '\v'; break;
|
||||
default:
|
||||
throw SyntaxError(("Invalid escape sequence: \\" + std::string(1, c)).c_str());
|
||||
}
|
||||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a character class
|
||||
*/
|
||||
std::shared_ptr<RE> parseCharacterClass(const string& input, size_t& idx) {
|
||||
if (idx >= input.size())
|
||||
throw SyntaxError("Unclosed character class");
|
||||
std::shared_ptr<RE> parseCharacterClass(const string& input) {
|
||||
std::set<char> used_chars;
|
||||
|
||||
bool invert = false;
|
||||
int last_char = -1;
|
||||
std::size_t start = 1;
|
||||
std::size_t end = input.size() - 1;
|
||||
|
||||
if (input[idx] == '^')
|
||||
{
|
||||
if (input[1] == '^') {
|
||||
invert = true;
|
||||
idx++;
|
||||
start = 2;
|
||||
}
|
||||
|
||||
if (idx >= input.size())
|
||||
throw SyntaxError("Unclosed character class");
|
||||
|
||||
|
||||
if (input[idx] == ']')
|
||||
{
|
||||
if (input[start] == ']') {
|
||||
used_chars.insert(']');
|
||||
idx++;
|
||||
last_char = ']';
|
||||
++start;
|
||||
}
|
||||
|
||||
if (idx >= input.size())
|
||||
throw SyntaxError("Unclosed character class");
|
||||
|
||||
if (input[idx] == '-')
|
||||
{
|
||||
if (input[start] == '-') {
|
||||
used_chars.insert('-');
|
||||
idx++;
|
||||
last_char = '-';
|
||||
++start;
|
||||
}
|
||||
|
||||
if (idx >= input.size())
|
||||
throw SyntaxError("Unclosed character class");
|
||||
if (input[end - 1] == '-') {
|
||||
used_chars.insert('-');
|
||||
--end;
|
||||
}
|
||||
|
||||
for (; idx < input.size() && input[idx] != ']'; idx++)
|
||||
int last_char = -1;
|
||||
for (std::size_t idx = start; idx < end; idx++)
|
||||
{
|
||||
if (input[idx] == '-')
|
||||
{
|
||||
idx++;
|
||||
|
||||
if (idx >= input.size())
|
||||
throw SyntaxError("Unclosed character class");
|
||||
|
||||
if (input[idx] == ']')
|
||||
{
|
||||
used_chars.insert('-');
|
||||
idx--;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (last_char == -1)
|
||||
throw SyntaxError("Nothing to apply range to");
|
||||
for (int i = last_char + 1; i <= input[idx]; i++)
|
||||
{
|
||||
used_chars.insert((char) i);
|
||||
}
|
||||
last_char = -1;
|
||||
for (int i = last_char + 1; i <= input[idx]; i++) {
|
||||
used_chars.insert((char) i);
|
||||
}
|
||||
last_char = -1;
|
||||
}
|
||||
else
|
||||
{
|
||||
used_chars.insert(input[idx]);
|
||||
last_char = input[idx];
|
||||
if (idx == end - 1 || (idx < end - 1 && input[idx + 1] != '-'))
|
||||
used_chars.insert(input[idx]);
|
||||
else
|
||||
last_char = input[idx];
|
||||
}
|
||||
}
|
||||
|
||||
if (idx >= input.size())
|
||||
throw SyntaxError("Unclosed character class");
|
||||
|
||||
std::vector<char> chars;
|
||||
for (int i = 0; i < 256; i++)
|
||||
{
|
||||
for (int i = 0; i < 256; i++) {
|
||||
if (invert ^ (used_chars.count((char) i) > 0))
|
||||
chars.push_back((char) i);
|
||||
}
|
||||
|
@ -284,104 +217,155 @@ namespace lxs {
|
|||
/**
|
||||
* Parse the actual regex
|
||||
*/
|
||||
std::shared_ptr<RE> parseRE(const string& input, size_t& idx)
|
||||
std::shared_ptr<RE> parseRE(RegexLexer& lex, bool& exit_by_closed_paren, bool inside_parens=false)
|
||||
{
|
||||
stack<std::shared_ptr<RE> > stk;
|
||||
for (; idx < input.length(); idx++)
|
||||
{
|
||||
std::shared_ptr<RE> n;
|
||||
switch (input[idx])
|
||||
{
|
||||
case '\\':
|
||||
idx++;
|
||||
if (idx >= input.length())
|
||||
throw SyntaxError("Escape sequence at the end of the string");
|
||||
else
|
||||
stk.push(std::make_shared<SingleRE>(parseEscapeChar(input[idx])));
|
||||
break;
|
||||
//TODO: report location in regex on error
|
||||
|
||||
case '[':
|
||||
stk.push(parseCharacterClass(input, ++idx));
|
||||
break;
|
||||
try {
|
||||
while (true) {
|
||||
exit_by_closed_paren = false;
|
||||
RegexLexer::Token tok = lex.nextToken();
|
||||
std::shared_ptr<RE> n;
|
||||
switch (tok.type) {
|
||||
case RegexLexer::TAB:
|
||||
stk.push(std::make_shared<SingleRE>('\t'));
|
||||
break;
|
||||
case RegexLexer::NEWLINE:
|
||||
stk.push(std::make_shared<SingleRE>('\n'));
|
||||
break;
|
||||
case RegexLexer::CARRIAGE_RETURN:
|
||||
stk.push(std::make_shared<SingleRE>('\r'));
|
||||
break;
|
||||
case RegexLexer::BACKSPACE:
|
||||
stk.push(std::make_shared<SingleRE>('\b'));
|
||||
break;
|
||||
case RegexLexer::SPACE:
|
||||
stk.push(std::make_shared<SingleRE>(' '));
|
||||
break;
|
||||
case RegexLexer::BELL:
|
||||
stk.push(std::make_shared<SingleRE>('\a'));
|
||||
break;
|
||||
case RegexLexer::FORMFEED:
|
||||
stk.push(std::make_shared<SingleRE>('\f'));
|
||||
break;
|
||||
case RegexLexer::VTAB:
|
||||
stk.push(std::make_shared<SingleRE>('\v'));
|
||||
break;
|
||||
|
||||
case '.':
|
||||
stk.push(dotChar());
|
||||
break;
|
||||
case RegexLexer::BACKSLASH:
|
||||
case RegexLexer::ESCAPED_STAR:
|
||||
case RegexLexer::ESCAPED_PLUS:
|
||||
case RegexLexer::ESCAPED_PIPE:
|
||||
case RegexLexer::ESCAPED_LPAREN:
|
||||
case RegexLexer::ESCAPED_RPAREN:
|
||||
case RegexLexer::ESCAPED_LBRACKET:
|
||||
case RegexLexer::ESCAPED_RBRACKET:
|
||||
case RegexLexer::ESCAPED_QUESTIONMARK:
|
||||
case RegexLexer::ESCAPED_DOT:
|
||||
stk.push(std::make_shared<SingleRE>(tok.content[1]));
|
||||
break;
|
||||
|
||||
case RegexLexer::DOT:
|
||||
stk.push(dotChar());
|
||||
break;
|
||||
|
||||
case ']':
|
||||
throw SyntaxError("Unopened ']'");
|
||||
break;
|
||||
case RegexLexer::STAR:
|
||||
if (stk.empty())
|
||||
throw SyntaxError(lex, "Cannot apply kleene star to empty regex");
|
||||
n = std::make_shared<StarRE>(stk.top());
|
||||
stk.pop();
|
||||
stk.push(n);
|
||||
break;
|
||||
|
||||
case '*':
|
||||
if (stk.empty())
|
||||
throw SyntaxError("Cannot apply kleene star to empty regex");
|
||||
n = std::make_shared<StarRE>(stk.top());
|
||||
stk.pop();
|
||||
stk.push(n);
|
||||
break;
|
||||
case RegexLexer::PLUS:
|
||||
if (stk.empty())
|
||||
throw SyntaxError(lex, "Cannot apply kleene plus to empty regex");
|
||||
n = stk.top();
|
||||
stk.pop();
|
||||
n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n));
|
||||
stk.push(n);
|
||||
break;
|
||||
|
||||
case '+':
|
||||
if (stk.empty())
|
||||
throw SyntaxError("Cannot apply kleene plus to empty regex");
|
||||
n = stk.top();
|
||||
stk.pop();
|
||||
n = std::make_shared<ConcatRE>(n, std::make_shared<StarRE>(n));
|
||||
stk.push(n);
|
||||
break;
|
||||
case RegexLexer::QUESTIONMARK:
|
||||
if (stk.empty())
|
||||
throw SyntaxError(lex, "Cannot apply '?' to empty regex");
|
||||
n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>());
|
||||
stk.pop();
|
||||
stk.push(n);
|
||||
break;
|
||||
|
||||
case '?':
|
||||
if (stk.empty())
|
||||
throw SyntaxError("Cannot apply '?' to empty regex");
|
||||
n = std::make_shared<PlusRE>(stk.top(), std::make_shared<EpsilonRE>());
|
||||
stk.pop();
|
||||
stk.push(n);
|
||||
break;
|
||||
case RegexLexer::PIPE:
|
||||
if (stk.empty())
|
||||
throw SyntaxError(lex, "Invalid regex: nothing to the left of '|'");
|
||||
if (stk.size() > 1)
|
||||
compactStack(stk), compress(stk);
|
||||
n = std::make_shared<PlusRE>(stk.top(), parseRE(lex, exit_by_closed_paren, inside_parens));
|
||||
stk.pop();
|
||||
stk.push(n);
|
||||
if (exit_by_closed_paren) {
|
||||
if (stk.size() == 1)
|
||||
return stk.top();
|
||||
else if (stk.size() == 2)
|
||||
return compress(stk), stk.top();
|
||||
else
|
||||
throw SyntaxError(lex, "Invalid regex");
|
||||
}
|
||||
break;
|
||||
|
||||
case '|':
|
||||
if (stk.empty())
|
||||
throw SyntaxError("Invalid regex: nothing to the left of '|'");
|
||||
if (stk.size() > 1)
|
||||
compactStack(stk), compress(stk);
|
||||
n = std::make_shared<PlusRE>(stk.top(), parseRE(input, ++idx));
|
||||
stk.pop();
|
||||
stk.push(n);
|
||||
idx--;
|
||||
break;
|
||||
case RegexLexer::LPAREN:
|
||||
n = parseRE(lex, exit_by_closed_paren, true);
|
||||
if (!exit_by_closed_paren) {
|
||||
throw SyntaxError(lex, "Unclosed parenthesis");
|
||||
}
|
||||
stk.push(n);
|
||||
break;
|
||||
|
||||
case '(':
|
||||
n = parseRE(input, ++idx);
|
||||
if (idx >= input.size() || input[idx] != ')')
|
||||
throw SyntaxError("Could not parse regex, unclosed parentheses");
|
||||
stk.push(n);
|
||||
break;
|
||||
case RegexLexer::RPAREN:
|
||||
if (!inside_parens)
|
||||
throw SyntaxError(lex, "Unopened parenthesis");
|
||||
|
||||
case ')':
|
||||
if (stk.size() == 1)
|
||||
return stk.top();
|
||||
else if (stk.size() == 2)
|
||||
return compress(stk), stk.top();
|
||||
throw SyntaxError("Could not parse regex, nothing inside parentheses");
|
||||
exit_by_closed_paren = true;
|
||||
if (stk.size() == 1)
|
||||
return stk.top();
|
||||
else if (stk.size() == 2)
|
||||
return compress(stk), stk.top();
|
||||
throw SyntaxError(lex, "Could not parse regex, nothing inside parentheses");
|
||||
|
||||
default:
|
||||
stk.push(std::make_shared<SingleRE>(input[idx]));
|
||||
case RegexLexer::CHAR:
|
||||
stk.push(std::make_shared<SingleRE>(tok.content[0]));
|
||||
break;
|
||||
|
||||
case RegexLexer::CHAR_CLASS:
|
||||
stk.push(parseCharacterClass(tok.content));
|
||||
break;
|
||||
|
||||
case RegexLexer::ERROR:
|
||||
throw SyntaxError(lex, "Error on character: " + tok.content);
|
||||
|
||||
case RegexLexer::ignore: case RegexLexer::nonmatching:
|
||||
//Just ignore these
|
||||
break;
|
||||
}
|
||||
compactStack(stk);
|
||||
}
|
||||
compactStack(stk);
|
||||
} catch (RegexLexer::NoMoreTokens& err) {
|
||||
if (stk.size() == 1)
|
||||
return stk.top();
|
||||
else if (stk.size() == 2)
|
||||
return compress(stk), stk.top();
|
||||
throw SyntaxError(lex, "Could not parse regex");
|
||||
}
|
||||
if (stk.size() == 1)
|
||||
return stk.top();
|
||||
else if (stk.size() == 2)
|
||||
return compress(stk), stk.top();
|
||||
throw SyntaxError("Could not parse regex");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
std::shared_ptr<RE> parseRE(const string& input)
|
||||
{
|
||||
size_t i = 0;
|
||||
std::shared_ptr<RE> res = parseRE(input, i);
|
||||
if (i < input.length())
|
||||
throw SyntaxError("Incorrect regex");
|
||||
std::istringstream inputstream(input);
|
||||
RegexLexer lex(inputstream);
|
||||
bool exit_by_closed_paren = false;
|
||||
std::shared_ptr<RE> res = parseRE(lex, exit_by_closed_paren);
|
||||
return res;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,49 @@
|
|||
# \[
|
||||
# ( ^ ( \]-? | - | X' )
|
||||
# | \]-? | - | Y' )
|
||||
# X'*
|
||||
# -? \]
|
||||
#
|
||||
# With X' = (X - X | X)
|
||||
# Y' = (Y - X | Y)
|
||||
# X = [^]-]
|
||||
# Y = [^]-^]
|
||||
#
|
||||
CHAR_CLASS = \[(^(\]-?|-|([^]-]-[^]-]|[^]-]))|\]-?|-|([^]-^]-[^]-]|[^]-^]))([^]-]-[^]-]|[^]-])*-?\]
|
||||
|
||||
#All the escape sequences
|
||||
TAB = \\t
|
||||
NEWLINE = \\n
|
||||
CARRIAGE_RETURN = \\r
|
||||
BACKSPACE = \\b
|
||||
SPACE = \\s|\s
|
||||
BELL = \\a
|
||||
FORMFEED = \\f
|
||||
VTAB = \\v
|
||||
|
||||
BACKSLASH = \\\\
|
||||
ESCAPED_STAR = \\\*
|
||||
ESCAPED_PLUS = \\\+
|
||||
ESCAPED_PIPE = \\\|
|
||||
ESCAPED_LPAREN = \\\(
|
||||
ESCAPED_RPAREN = \\\)
|
||||
ESCAPED_LBRACKET = \\\[
|
||||
ESCAPED_RBRACKET = \\\]
|
||||
ESCAPED_QUESTIONMARK = \\\?
|
||||
ESCAPED_DOT = \\\.
|
||||
|
||||
# The special chars
|
||||
DOT = \.
|
||||
STAR = \*
|
||||
PLUS = \+
|
||||
QUESTIONMARK = \?
|
||||
PIPE = \|
|
||||
LPAREN = \(
|
||||
RPAREN = \)
|
||||
|
||||
# Anything other than these is valid
|
||||
# It cannot be ] or [, since those should be handled by CHAR_CLASS
|
||||
# If it is a \, it is an invalid escape sequence
|
||||
CHAR = [^][\]
|
||||
|
||||
ERROR = .
|
Loading…
Reference in New Issue