From b2a62daed1b54b9027241cbf6bf06b22711e2b0f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Thomas=20Av=C3=A9?=
Date: Sat, 26 Nov 2016 19:26:16 +0100
Subject: [PATCH] First version of input parser

---
 include/InputLexer.h           |  77 +++++++++++
 include/Parsodus/config.h      |  19 +++
 include/Parsodus/grammar.h     |   9 ++
 include/Parsodus/inputparser.h |  34 +++++
 parser.example.pds             |  12 +-
 src/CMakeLists.txt             |   2 +
 src/InputLexer.cpp             | 246 +++++++++++++++++++++++++++++++++
 src/inputparser.cpp            | 129 +++++++++++++++++
 src/inputparser.lxs            |   3 +-
 src/main.cpp                   |  47 ++++++-
 10 files changed, 566 insertions(+), 12 deletions(-)
 create mode 100644 include/InputLexer.h
 create mode 100644 include/Parsodus/config.h
 create mode 100644 include/Parsodus/inputparser.h
 create mode 100644 src/InputLexer.cpp
 create mode 100644 src/inputparser.cpp

diff --git a/include/InputLexer.h b/include/InputLexer.h
new file mode 100644
index 0000000..f8f21c5
--- /dev/null
+++ b/include/InputLexer.h
@@ -0,0 +1,77 @@
+#pragma once
+#ifndef LEXER_InputLexer_H
+#define LEXER_InputLexer_H
+
+#include <exception>
+#include <istream>
+#include <string>
+/// Table driven lexer that splits an input stream in tokens; the tables live in InputLexer.cpp
+class InputLexer {
+    public:
+        class NoMoreTokens : public std::exception {};
+        class NoMatch : public std::exception {};
+
+        enum TokenType {
+            nonmatching, ///< no token; has to stay 0 because nextToken tests token types as booleans
+            ARROW,
+            COLON,
+            GRAMMAR,
+            LEFT,
+            LEXESIS,
+            LEXESISNAME,
+            NONASSOC,
+            NUM,
+            PARSER,
+            PARSERTYPE,
+            PIPE,
+            PRECEDENCE,
+            RIGHT,
+            SEMICOLON,
+            START,
+            TERMINAL,
+            TERMINALS,
+            VARIABLE,
+            ignore, ///< matched input that nextToken drops instead of returning it
+        };
+
+        struct Token {
+            TokenType type; ///< the kind of token that was matched
+            std::string content; ///< the matched text
+        };
+
+        InputLexer(std::istream& in);
+        ~InputLexer();
+
+        /**
+         * Get the next token
+         *
+         * @throws NoMoreTokens if no more tokens are available
+         * @throws NoMatch if no match was found
+         */
+        Token nextToken();
+
+        /**
+         * Skip the following `n` bytes.
+         *
+         * @param n The number of bytes to skip
+         */
+        void skip(std::size_t n);
+
+        /**
+         * Peek at the current head of the input stream, useful in error reporting when a character mismatches for example
+         *
+         * @throws NoMoreTokens if the input stream is at an end
+         */
+        char peek();
+
+        /**
+         * Get the current byte offset
+         */
+        std::size_t getByteOffset();
+
+    private:
+        std::size_t m_offset; ///< number of bytes consumed from m_input so far
+        std::istream& m_input; ///< the stream that is tokenized, not owned
+};
+
+#endif //LEXER_InputLexer_H
diff --git a/include/Parsodus/config.h b/include/Parsodus/config.h
new file mode 100644
index 0000000..927219f
--- /dev/null
+++ b/include/Parsodus/config.h
@@ -0,0 +1,19 @@
+#pragma once
+#ifndef PARSODUS_CONFIG_H
+#define PARSODUS_CONFIG_H
+
+#include "Parsodus/grammar.h"
+
+namespace pds {
+
+    struct Config {
+        enum class ParserType {LALR_1};
+
+        ParserType parserType; ///< the type named in the `parser:` section
+        std::string lexesisFile; ///< the file named in the `lexesis:` section
+        Grammar grammar; ///< start variable, terminals, variables and rules that were read
+    };
+
+}
+
+#endif //PARSODUS_CONFIG_H
diff --git a/include/Parsodus/grammar.h b/include/Parsodus/grammar.h
index 7c47eb2..c4b194f 100644
--- a/include/Parsodus/grammar.h
+++ b/include/Parsodus/grammar.h
@@ -14,6 +14,14 @@ namespace pds {
     struct Rule {
         std::string head; ///< The replaced variable
         std::vector<std::string> tail; ///< The replacement rule
+        /// Orders rules by head and then by tail, so that they can be kept in a std::set
+        bool operator<(const Rule& other) const {
+            if(head != other.head){
+                return head < other.head;
+            } else {
+                return tail < other.tail;
+            }
+        }
     };
 
     /**
@@ -21,6 +29,7 @@ namespace pds {
      * Keeps track of variables, terminals and replacement rules
      */
     struct Grammar {
+        std::string start; ///< the starting variable
         std::set<std::string> variables; ///< the variables
         std::set<std::string> terminals; ///< the terminals
         std::map<std::string, std::set<Rule> > rules; ///< the replacement rules
diff --git a/include/Parsodus/inputparser.h b/include/Parsodus/inputparser.h
new file mode 100644
index 0000000..6d81ea4
--- /dev/null
+++ b/include/Parsodus/inputparser.h
@@ -0,0 +1,34 @@
+#pragma once
+#ifndef PARSODUS_INPUT_PARSER_H
+#define PARSODUS_INPUT_PARSER_H
+
+#include <iostream>
+#include "Parsodus/config.h"
+#include "InputLexer.h"
+
+namespace pds {
+    /// Reads a Parsodus input file into a Config
+    class InputParser {
+        public:
+            /// Parse the description read from `is`, throws InputParserException when it is not valid
+            static Config parseInput(std::istream& is);
+
+        private:
+            /// Read the colon after a section keyword and, if `nextoken` is set, the token after it
+            static void lexColon(InputLexer& lex, InputLexer::Token &token, bool nextoken = true);
+    };
+
+    /**
+     * Used to throw errors when the inputfile was not valid
+     */
+    class InputParserException: public std::exception {
+        public:
+            explicit InputParserException(std::string what);
+            virtual const char* what() const throw();
+
+        private:
+            std::string m_what; ///< the message handed out by what()
+    };
+}
+
+#endif // PARSODUS_INPUT_PARSER_H
diff --git a/parser.example.pds b/parser.example.pds index df0e819..0e5b582 100644 --- a/parser.example.pds +++ b/parser.example.pds @@ -1,16 +1,10 @@ -parser: lalr(1) -precedence: #optional - left 5 PLUS - right 6 TIMES - nonassoc 2 LT +parser: lalr(1) lexesis: lexer.lxs -# of terminals: TERMINAL - ... -start: start +start: s grammar: - start -> a + s -> a s | b ;
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fe4662a..de992e5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -11,6 +11,8 @@ add_library(Parsodus-backends add_library(pds driver.cpp + inputparser.cpp + InputLexer.cpp ) add_executable(Parsodus
diff --git a/src/InputLexer.cpp b/src/InputLexer.cpp
new file mode 100644
index 0000000..6c65f3d
--- /dev/null
+++ b/src/InputLexer.cpp
@@ -0,0 +1,246 @@
+#include "InputLexer.h"
+
+#include <sstream>
+#include <iostream>
+
+namespace { //The automaton data
+    typedef std::size_t State;
+    // The state that stops a match: TABLE has no row for it
+    State REJECT = 79;
+    // Maps every input byte to its column in TABLE
+    unsigned char TRANS_IDX[256] = { (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)1, (unsigned char)1, (unsigned char)0, (unsigned char)0, (unsigned char)1, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, 
(unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)1, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)2, (unsigned char)3, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)4, (unsigned char)5, (unsigned char)0, (unsigned char)6, (unsigned char)7, (unsigned char)7, (unsigned char)7, (unsigned char)7, (unsigned char)7, (unsigned char)7, (unsigned char)7, (unsigned char)7, (unsigned char)7, (unsigned char)8, (unsigned char)9, (unsigned char)0, (unsigned char)0, (unsigned char)10, (unsigned char)0, (unsigned char)0, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)11, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)12, (unsigned char)0, (unsigned char)13, (unsigned char)14, (unsigned char)15, (unsigned char)16, (unsigned char)17, (unsigned char)18, (unsigned char)19, (unsigned char)20, (unsigned char)21, (unsigned char)14, (unsigned char)14, (unsigned char)22, (unsigned char)23, (unsigned char)24, (unsigned char)25, (unsigned char)26, (unsigned char)14, (unsigned char)27, (unsigned char)28, (unsigned char)29, (unsigned char)14, (unsigned char)14, (unsigned char)14, (unsigned char)30, (unsigned char)14, (unsigned char)14, (unsigned char)0, (unsigned char)31, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)32, (unsigned char)0, (unsigned char)0, 
(unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)33, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)34, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, 
(unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, (unsigned char)0, }; + + State TABLE[80 - 1][35] = { + { 79, 78, 79, 79, 71, 79, 79, 66, 76, 75, 79, 46, 47, 48, 48, 48, 48, 48, 48, 37, 48, 48, 16, 48, 45, 48, 1, 44, 32, 23, 48, 77, 79, 79, 73, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 2, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 7, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 3, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 4, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 5, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 6, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 8, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 9, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 10, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 11, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 12, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 
79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 13, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 14, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 15, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 17, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 49, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 18, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 19, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 20, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 21, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 22, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 24, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 25, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 26, 
48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 27, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 28, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 29, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 30, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 31, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 33, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 34, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 35, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 36, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 38, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 39, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 
63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 40, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 41, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 42, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 43, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 51, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 55, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 46, 46, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 46, 47, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 50, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 52, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 
48, 48, 48, 53, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 54, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 56, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 57, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 58, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 59, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 60, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 61, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 79, 79, 79, 79, }, + + { 79, 79, 63, 79, 79, 67, 79, 79, 79, 79, 79, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 62, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 64, 64, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, }, + + { 79, 79, 79, 65, 79, 79, 64, 64, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 
79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 66, 66, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 68, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 69, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 70, 79, 79, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 72, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 74, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 72, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, }, + + { 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, 79, }, + }; + + InputLexer::TokenType TOKENS[80] = { 
InputLexer::nonmatching, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::PARSER, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::PRECEDENCE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::LEXESIS, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::TERMINALS, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::START, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::GRAMMAR, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::TERMINAL, InputLexer::TERMINAL, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::LEFT, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::RIGHT, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::VARIABLE, InputLexer::NONASSOC, InputLexer::nonmatching, InputLexer::nonmatching, InputLexer::nonmatching, InputLexer::PARSERTYPE, InputLexer::NUM, InputLexer::nonmatching, InputLexer::nonmatching, InputLexer::nonmatching, InputLexer::LEXESISNAME, InputLexer::nonmatching, InputLexer::ARROW, InputLexer::nonmatching, InputLexer::nonmatching, InputLexer::SEMICOLON, InputLexer::COLON, InputLexer::PIPE, InputLexer::ignore, InputLexer::nonmatching, }; +} + +InputLexer::InputLexer(std::istream& in) : m_offset(0), m_input(in) { + +} + +InputLexer::~InputLexer() { +} + +InputLexer::Token InputLexer::nextToken() { + TokenType type = ignore; + std::string token; + + while (type 
== ignore) { + State state = 0; + std::size_t match_length = 0; + token = ""; + + while (!m_input.eof() && state != REJECT) { + char c = m_input.peek(); + if (m_input.eof()) + break; + + token += c; + + state = TABLE[state][TRANS_IDX[(unsigned char)c]]; + if (TOKENS[state]) + { + match_length = token.length(); + type = TOKENS[state]; + } + m_input.get(); + ++m_offset; + } + + std::size_t sdiff = token.length() - match_length; + for (std::size_t i = 0; i < sdiff; i++) + { + m_input.putback(token[token.length() - i - 1]); + } + m_offset -= sdiff; + + if (!type || !match_length) { + if (m_input.eof()) + throw NoMoreTokens(); + throw NoMatch(); + } + + token = token.substr(0, match_length); + } + + Token t; + t.type = type; + t.content = token; + return t; +} + +void InputLexer::skip(std::size_t n) { + for (size_t i = 0; i < n; i++) { + m_input.get(); + ++m_offset; + } +} + +char InputLexer::peek() { + if (m_input.eof()) + throw NoMoreTokens(); + return m_input.peek(); +} + +std::size_t InputLexer::getByteOffset() { + return m_offset; +} diff --git a/src/inputparser.cpp b/src/inputparser.cpp new file mode 100644 index 0000000..29d6253 --- /dev/null +++ b/src/inputparser.cpp @@ -0,0 +1,129 @@ +#include "InputLexer.h" +#include "Parsodus/inputparser.h" + +namespace pds { + + InputParserException::InputParserException(std::string what): m_what(what) {} + const char* InputParserException::what() const throw() { + return m_what.c_str(); + } + + Config InputParser::parseInput(std::istream& is) { + InputLexer lex(is); + Config config; + + + bool readingTerminals = false; + bool readingGrammar = false; + + try { + while(true) { + InputLexer::Token token = lex.nextToken(); + if(token.type != InputLexer::TERMINAL) + readingTerminals = false; + if(token.type != InputLexer::TERMINAL + && token.type != InputLexer::ARROW + && token.type != InputLexer::VARIABLE + && token.type != InputLexer::SEMICOLON + && token.type != InputLexer::PIPE) + readingGrammar = false; + switch(token.type) 
{ + case InputLexer::PARSER: + lexColon(lex, token); + if(token.type == InputLexer::PARSERTYPE) { + if(token.content == "lalr(1)") { + config.parserType = Config::ParserType::LALR_1; + } else + throw InputParserException("Unkown parser type"); + } else { + throw InputParserException("inputfile malformed, no parser type found in parser section"); + } + break; + case InputLexer::LEXESIS: + lexColon(lex, token); + if(token.type != InputLexer::LEXESISNAME) + throw InputParserException("No valid input file found after 'lexesis', found '" + token.content + "' instead."); + config.lexesisFile = token.content; + break; + case InputLexer::TERMINALS: + lexColon(lex, token, false); + readingTerminals = true; + break; + case InputLexer::GRAMMAR: + lexColon(lex, token, false); + readingGrammar = true; + break; + case InputLexer::TERMINAL: + if(readingTerminals) { + config.grammar.terminals.insert(token.content); + } else { + throw InputParserException("Found a terminal outside a grammar or terminals section: " + token.content); + } + break; + case InputLexer::START: + lexColon(lex, token); + if(token.type != InputLexer::VARIABLE) + throw InputParserException("Expected to find a start variable, but found: " + token.content); + config.grammar.start = token.content; + break; + case InputLexer::VARIABLE: + if(readingGrammar) { + if(config.grammar.variables.find(token.content) == config.grammar.variables.end()) + config.grammar.variables.insert(token.content); + + std::string current_head = token.content; + std::set current_rules; + if(config.grammar.rules.count(current_head)) { + current_rules = config.grammar.rules[current_head]; + } + // Parsing rule + token = lex.nextToken(); + if(token.type != InputLexer::ARROW) + throw InputParserException("No arrow found after '"+ current_head+"', but found '" + token.content + "' instead"); + Rule rule; + rule.head = current_head; + bool parsing_head = true; + while(parsing_head) { + token = lex.nextToken(); + switch(token.type) { + case 
InputLexer::VARIABLE: + rule.tail.push_back(token.content); + break; + case InputLexer::TERMINAL: + rule.tail.push_back(token.content); + break; + case InputLexer::SEMICOLON: + parsing_head = false; + case InputLexer::PIPE: + rule.tail.shrink_to_fit(); + current_rules.insert(rule); + rule.tail.clear(); + break; + default: + throw InputParserException("Expecting to find a variable, terminal, pipe or a semicolon, but found '" + token.content + "' instead"); + } + } + config.grammar.rules[current_head] = current_rules; + } else + throw InputParserException("Found a variable outside a grammar section: " + token.content); + break; + default: + break; + + } + } + } catch(InputLexer::NoMoreTokens& err) { + + } + return config; + } + void InputParser::lexColon(InputLexer& lex, InputLexer::Token &token, bool nextoken) { + token = lex.nextToken(); + if(token.type != InputLexer::COLON) + throw InputParserException("No colon found before '" + token.content + "'"); + if(nextoken) + token = lex.nextToken(); + + } + +} diff --git a/src/inputparser.lxs b/src/inputparser.lxs index 00ad6bb..fb5e9c7 100644 --- a/src/inputparser.lxs +++ b/src/inputparser.lxs @@ -4,7 +4,7 @@ LEXESIS = lexesis TERMINALS = terminals START = start GRAMMAR = grammar -PARSERTYPE = [_a-zA-Z]+(\([1-9][0-9]*\))? 
+PARSERTYPE = [_a-zA-Z]+(\([0-9][0-9]*\)) LEFT = left RIGHT = right NONASSOC = nonassoc @@ -16,3 +16,4 @@ ARROW = ->|→ SEMICOLON = ; COLON = : PIPE = \| +ignore = \t| |\n|\r
diff --git a/src/main.cpp b/src/main.cpp
index 6fe6fe4..b0f5958 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,5 +1,60 @@
 #include <iostream>
+#include <exception>
+#include <fstream>
+#include <string>
+#include <vector>
+#include "optparse.h"
+#include "Parsodus/inputparser.h"
+
+/**
+ * Parse the command line, read the given input file and print what the input parser found.
+ *
+ * @return 0 on success, 1 if the arguments are wrong or the file cannot be opened or parsed
+ */
+int main(int argc, char** argv) {
+    optparse::OptionParser parser = optparse::OptionParser().description("Parsodus").usage("Parsodus [-d <outputdir>] [-l <language>] [-n <parsername>] <inputfile.pds>");
+    parser.add_help_option(true);
+    parser.version("%prog 1.0");
+    parser.add_option("-d", "--outputdir").dest("outputdir").help("Output the generated files to this directory\n[default: .]").metavar("<directory>").set_default(".");
+    parser.add_option("-l", "--lang", "--language").dest("language").help("The programming language to generate source files for\n[default: c++]").metavar("<languagename>").set_default("c++");
+    // NOTE(review): the dest is called "lexername" although the option names the parser - confirm before renaming
+    parser.add_option("-n", "--name").dest("lexername").help("Use this name for the generated parser, the default is based on the input file name").metavar("<parsername>");
+    optparse::Values options = parser.parse_args(argc, argv);
+    std::vector<std::string> args = parser.args();
+
+    if (args.size() != 1) {
+        parser.print_usage(std::cerr);
+        return 1;
+    }
+
+    std::ifstream infile(args[0]);
+    if (!infile.good()) {
+        std::cerr << "Could not open file '" << args[0] << "' for reading" << std::endl;
+        return 1;
+    }
+
+    pds::Config config;
+    try {
+        config = pds::InputParser::parseInput(infile);
+    } catch (const std::exception& err) {
+        // InputParserException and the lexer's NoMatch both derive from std::exception
+        std::cerr << "Could not parse '" << args[0] << "': " << err.what() << std::endl;
+        return 1;
+    }
+
+    // Reporting what the inputparser found, to be removed...
+    std::cout << "Start: " << config.grammar.start << '\n';
+    for (const auto& terminal : config.grammar.terminals)
+        std::cout << "Terminal: " << terminal << '\n';
+    for (const auto& variable : config.grammar.variables)
+        std::cout << "Variable: " << variable << '\n';
+    for (const auto& entry : config.grammar.rules) {
+        std::cout << "Starting rule with head: " << entry.first << '\n';
+        for (const auto& rule : entry.second) {
+            std::cout << "\tRule with head: " << rule.head << '\n';
+            for (const auto& symbol : rule.tail)
+                std::cout << "\t\tFound replacement rule: " << symbol << '\n';
+        }
+    }
 
-int main() {
-    std::cout << "Parsodus to the rescue" << std::endl;
 }