/*
 * This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
 *
 * 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
 *
 * 3. This notice may not be removed or altered from any source distribution.
 */

#pragma once
#ifndef PARSODUS_PARSER_RegexParser_H
#define PARSODUS_PARSER_RegexParser_H

#include <cassert>
#include <cstdint>
#include <deque>
#include <stack>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

/**
 * Represents the type of the symbol (both terminals and nonterminals).
 *
 * The numeric value of an enumerator is used as the column index into the
 * parse table, so the order of the enumerators must not be changed.
 * Enumerators T_EOF through T_VTAB are the terminals (the parser scans exactly
 * that range when it lists the expected tokens for an error); V_error and
 * V_re are the symbols pushed by error recovery and by reductions.
 */
enum class RegexParser_Symbol : std::uint64_t {
    T_EOF,
    T_BACKSLASH,
    T_BACKSPACE,
    T_BELL,
    T_CARRIAGE_RETURN,
    T_CHAR,
    T_CHAR_CLASS,
    T_DOT,
    T_ERROR,
    T_ESCAPED_DOT,
    T_ESCAPED_LBRACKET,
    T_ESCAPED_LPAREN,
    T_ESCAPED_PIPE,
    T_ESCAPED_PLUS,
    T_ESCAPED_QUESTIONMARK,
    T_ESCAPED_RBRACKET,
    T_ESCAPED_RPAREN,
    T_ESCAPED_STAR,
    T_FORMFEED,
    T_LPAREN,
    T_NEWLINE,
    T_PIPE,
    T_PLUS,
    T_QUESTIONMARK,
    T_RPAREN,
    T_SPACE,
    T_STAR,
    T_TAB,
    T_VTAB,
    V_error,
    V_re,
};

/**
 * Exception thrown when the input cannot be parsed.
 *
 * It is thrown by the default RegexParser::error() handler and by
 * RegexParser::parse() when error recovery fails.
 */
class SyntaxError : public std::runtime_error {
public:
    /**
     * @param c the message later returned by what()
     */
    SyntaxError(const char* c) : std::runtime_error(c) {}

    /**
     * Convenience overload so that callers holding a std::string do not have
     * to go through c_str().
     *
     * @param msg the message later returned by what()
     */
    explicit SyntaxError(const std::string& msg) : std::runtime_error(msg) {}
};

template <typename Value>
|
|
class RegexParser {
|
|
public:
|
|
RegexParser() {}
|
|
virtual ~RegexParser() {}
|
|
|
|
/**
|
|
* Parse it
|
|
*/
|
|
Value parse();
|
|
|
|
protected:
|
|
/**
|
|
* A token, consisting of a Symbol type (should be a terminal) and a Value
|
|
*/
|
|
struct Token {
|
|
Token(const RegexParser_Symbol& sym, const Value& val) : symbol(sym), value(val) {}
|
|
Token(const RegexParser_Symbol& sym, Value&& val) : symbol(sym), value(std::move(val)) {}
|
|
RegexParser_Symbol symbol;
|
|
Value value;
|
|
};
|
|
|
|
|
|
/******************************************
|
|
* Functions to be supplied by the user *
|
|
******************************************/
|
|
|
|
/**
|
|
* Handle an error
|
|
* current is the current Token, one that has no action associated in the current state
|
|
* expected is a listing of all terminals that do have an action
|
|
*
|
|
* By default throws an error
|
|
*/
|
|
virtual Value error(Token current, const std::vector<RegexParser_Symbol>& expected);
|
|
|
|
/**
|
|
* Get the next token from the lexer
|
|
*/
|
|
virtual Token lex() = 0;
|
|
|
|
/**
|
|
* Apply a reduction (a grammar rule in reverse)
|
|
*/
|
|
virtual Value reduce_char(std::deque<Token> subparts) = 0;
|
|
virtual Value reduce_concat(std::deque<Token> subparts) = 0;
|
|
virtual Value reduce_optional(std::deque<Token> subparts) = 0;
|
|
virtual Value reduce_or(std::deque<Token> subparts) = 0;
|
|
virtual Value reduce_paren(std::deque<Token> subparts) = 0;
|
|
virtual Value reduce_plus(std::deque<Token> subparts) = 0;
|
|
virtual Value reduce_star(std::deque<Token> subparts) = 0;
|
|
|
|
private:
|
|
};
|
|
|
|
template <>
|
|
class RegexParser<bool> {
|
|
public:
|
|
RegexParser() {}
|
|
virtual ~RegexParser() {}
|
|
|
|
/**
|
|
* Parse it
|
|
*/
|
|
bool parse();
|
|
|
|
protected:
|
|
/******************************************
|
|
* Functions to be supplied by the user *
|
|
******************************************/
|
|
|
|
/**
|
|
* Get the next token from the lexer
|
|
*/
|
|
virtual RegexParser_Symbol lex() = 0;
|
|
};
|
|
|
|
// Short local aliases for the generated table names; both macros are
// #undef'ed again at the end of this header.
#define TABLE RegexParser___Table___RegexParser
#define REDUCE_COUNT RegexParser___Num_Reduces___RegexParser

// Not a static member because the table should not be replicated for different instantiations of the parser
//
// TABLE is indexed as TABLE[state][symbol]. The low two bits of an entry hold
// the Action; the remaining bits (entry >> 2) hold its operand:
//   - for a shift, and for the lookup done after a reduction or during error
//     recovery, the state to push;
//   - for a reduce, (symbol << 31) | rule.
// An entry of 0 means there is no action for that state/symbol pair.
extern const std::uint64_t TABLE[32][31];

// REDUCE_COUNT[rule] is the number of stack entries popped when reducing by
// that rule.
extern const unsigned char REDUCE_COUNT[27];

/**
 * Kind of parser action, as stored in the low two bits of a parse table entry.
 */
enum Action {
    ERROR = 0,   // no action for this state/symbol pair
    SHIFT = 1,   // push the current token and move to the encoded state
    REDUCE = 2,  // apply the encoded grammar rule
    ACCEPT = 3   // parsing finished successfully
};

/*********************************************
|
|
* Translate a Symbol to a readable string *
|
|
*********************************************/
|
|
inline std::string to_string(RegexParser_Symbol s) {
|
|
switch (s) {
|
|
case RegexParser_Symbol::T_EOF:
|
|
return "T_EOF";
|
|
case RegexParser_Symbol::T_BACKSLASH:
|
|
return "T_BACKSLASH";
|
|
case RegexParser_Symbol::T_BACKSPACE:
|
|
return "T_BACKSPACE";
|
|
case RegexParser_Symbol::T_BELL:
|
|
return "T_BELL";
|
|
case RegexParser_Symbol::T_CARRIAGE_RETURN:
|
|
return "T_CARRIAGE_RETURN";
|
|
case RegexParser_Symbol::T_CHAR:
|
|
return "T_CHAR";
|
|
case RegexParser_Symbol::T_CHAR_CLASS:
|
|
return "T_CHAR_CLASS";
|
|
case RegexParser_Symbol::T_DOT:
|
|
return "T_DOT";
|
|
case RegexParser_Symbol::T_ERROR:
|
|
return "T_ERROR";
|
|
case RegexParser_Symbol::T_ESCAPED_DOT:
|
|
return "T_ESCAPED_DOT";
|
|
case RegexParser_Symbol::T_ESCAPED_LBRACKET:
|
|
return "T_ESCAPED_LBRACKET";
|
|
case RegexParser_Symbol::T_ESCAPED_LPAREN:
|
|
return "T_ESCAPED_LPAREN";
|
|
case RegexParser_Symbol::T_ESCAPED_PIPE:
|
|
return "T_ESCAPED_PIPE";
|
|
case RegexParser_Symbol::T_ESCAPED_PLUS:
|
|
return "T_ESCAPED_PLUS";
|
|
case RegexParser_Symbol::T_ESCAPED_QUESTIONMARK:
|
|
return "T_ESCAPED_QUESTIONMARK";
|
|
case RegexParser_Symbol::T_ESCAPED_RBRACKET:
|
|
return "T_ESCAPED_RBRACKET";
|
|
case RegexParser_Symbol::T_ESCAPED_RPAREN:
|
|
return "T_ESCAPED_RPAREN";
|
|
case RegexParser_Symbol::T_ESCAPED_STAR:
|
|
return "T_ESCAPED_STAR";
|
|
case RegexParser_Symbol::T_FORMFEED:
|
|
return "T_FORMFEED";
|
|
case RegexParser_Symbol::T_LPAREN:
|
|
return "T_LPAREN";
|
|
case RegexParser_Symbol::T_NEWLINE:
|
|
return "T_NEWLINE";
|
|
case RegexParser_Symbol::T_PIPE:
|
|
return "T_PIPE";
|
|
case RegexParser_Symbol::T_PLUS:
|
|
return "T_PLUS";
|
|
case RegexParser_Symbol::T_QUESTIONMARK:
|
|
return "T_QUESTIONMARK";
|
|
case RegexParser_Symbol::T_RPAREN:
|
|
return "T_RPAREN";
|
|
case RegexParser_Symbol::T_SPACE:
|
|
return "T_SPACE";
|
|
case RegexParser_Symbol::T_STAR:
|
|
return "T_STAR";
|
|
case RegexParser_Symbol::T_TAB:
|
|
return "T_TAB";
|
|
case RegexParser_Symbol::T_VTAB:
|
|
return "T_VTAB";
|
|
case RegexParser_Symbol::V_error:
|
|
return "V_error";
|
|
case RegexParser_Symbol::V_re:
|
|
return "V_re";
|
|
}
|
|
}
|
|
|
|
|
|
/**************************
|
|
* Default error method *
|
|
**************************/
|
|
template <typename Value>
|
|
Value RegexParser<Value>::error(Token current, const std::vector<RegexParser_Symbol>& expected) {
|
|
std::string msg = "Syntax Error: got " + to_string(current.symbol) + "\n Expected any of:";
|
|
for (auto& s : expected) {
|
|
msg += "\n " + to_string(s);
|
|
}
|
|
throw SyntaxError(msg.c_str());
|
|
}
|
|
|
|
|
|
/***************************
|
|
* Parser implementation *
|
|
***************************/
|
|
template <typename Value>
|
|
Value RegexParser<Value>::parse() {
|
|
std::stack<Token> valueStack;
|
|
std::stack<std::uint64_t> stateStack;
|
|
|
|
stateStack.push(0);
|
|
Token tok = lex();
|
|
|
|
while (true) {
|
|
std::uint64_t act = TABLE[stateStack.top()][static_cast<std::uint64_t>(tok.symbol)];
|
|
|
|
switch (act & 0x3) {
|
|
case ERROR:
|
|
{
|
|
constexpr std::uint64_t verr = static_cast<std::uint64_t>(RegexParser_Symbol::V_error);
|
|
std::vector<RegexParser_Symbol> expected;
|
|
std::uint64_t top = stateStack.top();
|
|
for (std::uint64_t i = 0; i <= static_cast<std::uint64_t>(RegexParser_Symbol::T_VTAB); i++) {
|
|
if ((TABLE[top][i] & 0x3) != ERROR)
|
|
expected.emplace_back(static_cast<RegexParser_Symbol>(i));
|
|
}
|
|
Token report = Token{tok.symbol, std::move(tok.value)};
|
|
Value errorVal = error(std::move(report), expected);
|
|
|
|
while (!valueStack.empty() && !TABLE[stateStack.top()][verr]) {
|
|
valueStack.pop();
|
|
stateStack.pop();
|
|
}
|
|
if (!TABLE[stateStack.top()][verr]) {
|
|
throw SyntaxError("Syntax error: could not recover");
|
|
}
|
|
|
|
stateStack.push(TABLE[stateStack.top()][verr] >> 2);
|
|
valueStack.emplace(Token{ RegexParser_Symbol::V_error, std::move(errorVal)});
|
|
|
|
while (tok.symbol != RegexParser_Symbol::T_EOF && (TABLE[stateStack.top()][static_cast<std::uint64_t>(tok.symbol)] & 0x3) == ERROR) {
|
|
tok = lex();
|
|
}
|
|
if ((TABLE[stateStack.top()][static_cast<std::uint64_t>(tok.symbol)] & 0x3) == ERROR) {
|
|
throw SyntaxError("Syntax error: could not recover");
|
|
}
|
|
}
|
|
break;
|
|
case SHIFT:
|
|
valueStack.emplace(std::move(tok));
|
|
stateStack.push(act >> 2);
|
|
tok = lex();
|
|
break;
|
|
case REDUCE:
|
|
{
|
|
std::uint64_t tmp = act >> 2;
|
|
RegexParser_Symbol symbol = static_cast<RegexParser_Symbol>(tmp >> 31);
|
|
std::uint32_t rule = tmp & ((1ull << 31) - 1);
|
|
|
|
std::deque<Token> dq;
|
|
for (unsigned char i = 0; i < REDUCE_COUNT[rule]; i++) {
|
|
dq.emplace_front(std::move(valueStack.top()));
|
|
valueStack.pop();
|
|
stateStack.pop();
|
|
}
|
|
|
|
switch (rule) {
|
|
case 0:
|
|
valueStack.emplace(symbol, reduce_or(std::move(dq)));
|
|
|
|
break;
|
|
case 1:
|
|
valueStack.emplace(symbol, reduce_paren(std::move(dq)));
|
|
|
|
break;
|
|
case 2:
|
|
valueStack.emplace(symbol, reduce_star(std::move(dq)));
|
|
|
|
break;
|
|
case 3:
|
|
valueStack.emplace(symbol, reduce_plus(std::move(dq)));
|
|
|
|
break;
|
|
case 4:
|
|
valueStack.emplace(symbol, reduce_optional(std::move(dq)));
|
|
|
|
break;
|
|
case 5:
|
|
valueStack.emplace(symbol, reduce_concat(std::move(dq)));
|
|
|
|
break;
|
|
case 6:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 7:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 8:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 9:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 10:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 11:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 12:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 13:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 14:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 15:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 16:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 17:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 18:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 19:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 20:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 21:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 22:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 23:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 24:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 25:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
case 26:
|
|
valueStack.emplace(symbol, reduce_char(std::move(dq)));
|
|
|
|
break;
|
|
default:
|
|
assert(false); //There should be no such rule
|
|
break;
|
|
}
|
|
|
|
stateStack.push(TABLE[stateStack.top()][static_cast<std::uint64_t>(valueStack.top().symbol)] >> 2);
|
|
}
|
|
break;
|
|
case ACCEPT:
|
|
assert(stateStack.size() == 2);
|
|
assert(valueStack.size() == 1);
|
|
return std::move(valueStack.top().value);
|
|
default:
|
|
//IMPOSSIBLE
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
#undef REDUCE_COUNT
#undef TABLE

#endif /* PARSODUS_PARSER_RegexParser_H */