diff --git a/templates/c++/lexer.cpp b/templates/c++/lexer.cpp
new file mode 100644
index 0000000..9ae9a41
--- /dev/null
+++ b/templates/c++/lexer.cpp
@@ -0,0 +1,108 @@
+#include "{{name}}.h"
+
+#include <iostream>
+#include <sstream>
+
+namespace { //The automaton data
+    typedef unsigned long long State;
+
+    State REJECT = {{reject_state}};
+
+    char TRANS_IDX[256] = { {{#trans_idx}}{{trans}}, {{/trans_idx}} };
+
+    State TABLE[{{num_states}}][{{num_transitions_per_state}}] = { {{#table}}
+        { {{#row}} {{state}}, {{/row}} },
+        {{/table}} };
+
+    {{name}}::TokenType TOKENS[{{num_states}}] = { {{#tokens}} {{name}}::{{token}}, {{/tokens}} };
+}
+
+{{name}}::{{name}}(std::istream& in) : m_offset(0), m_input(in) {
+
+}
+
+{{name}}::~{{name}}() {
+}
+
+// Scan and return the next token by running the generated DFA (longest match wins).
+{{name}}::Token {{name}}::nextToken() {
+    // TRANS_IDX['a'] = 1; // FIXME(review): leftover debug override of the generated table - remove
+    // TRANS_IDX['b'] = 2; // FIXME(review): leftover debug override of the generated table - remove
+    TokenType type = ignore; // NOTE(review): assumes the generator emits an `ignore` token type - confirm
+    std::string token;
+
+    while (type == ignore) {
+        State state = 0;
+        std::size_t match_length = 0;
+        token = "";
+
+        while (!m_input.eof() && state != REJECT) {
+            char c = m_input.peek();
+            if (m_input.eof())
+                break;
+
+            token += c;
+            // advance the DFA; the cast avoids a negative index for bytes > 0x7F on signed-char targets
+            state = TABLE[state][TRANS_IDX[(unsigned char)c]];
+            if (state != REJECT && TOKENS[state]) // guard keeps TOKENS[] in bounds even if REJECT == num_states
+            {
+                match_length = token.length();
+                type = TOKENS[state];
+            }
+            m_input.get();
+            ++m_offset;
+        }
+
+        // put back everything consumed beyond the last accepted match
+        std::size_t sdiff = token.length() - match_length;
+        for (std::size_t i = 0; i < sdiff; i++)
+        {
+            m_input.putback(token[token.length() - i - 1]);
+        }
+        m_offset -= sdiff;
+
+        // no accepting state was ever reached from this position
+        if (!type || !match_length) {
+            if (m_input.eof())
+                throw NoMoreTokens();
+            throw NoMatch();
+        }
+
+        token = token.substr(0, match_length);
+    }
+
+    Token t;
+    t.type = type;
+    t.content = token;
+    return t;
+}
+
+void {{name}}::skip(std::size_t n) {
+    for (size_t i = 0; i < n; i++) {
+        m_input.get();
+        ++m_offset;
+    }
+}
+
+char {{name}}::peek() {
+    if (m_input.eof())
+        throw NoMoreTokens();
+    return m_input.peek();
+}
+
+std::size_t {{name}}::getByteOffset() {
+    return m_offset;
+}
+
+//Temporary main
+int main(int argc, char** argv) {
+    std::istringstream in(argv[1]);
+    {{name}} lex(in);
+    try {
+        while (true)
+            std::cout << "Match: " << lex.nextToken().content << std::endl;
+    }
+    catch (const decltype(lex)::NoMoreTokens&) {
+        std::cout << "DONE, read " << lex.getByteOffset() << " bytes." << std::endl;
+    }
+    catch (const decltype(lex)::NoMatch&) {
+        std::cout << "No match, " << lex.getByteOffset() << std::endl;
+    }
+}
diff --git a/templates/c++/lexer.h b/templates/c++/lexer.h
new file mode 100644
index 0000000..4c84556
--- /dev/null
+++ b/templates/c++/lexer.h
@@ -0,0 +1,61 @@
+#pragma once
+#ifndef LEXER_{{name}}_H
+#define LEXER_{{name}}_H
+
+#include <exception>
+#include <istream>
+#include <string>
+
+class {{name}} {
+    public:
+        class NoMoreTokens : public std::exception {};
+        class NoMatch : public std::exception {};
+
+        enum TokenType {
+            nonmatching, // = 0, doubles as the "no token here" sentinel in TOKENS[]
+            {{#token_types}}
+            {{type}},
+            {{/token_types}}
+        };
+
+        struct Token {
+            TokenType type;
+            std::string content;
+        };
+
+        explicit {{name}}(std::istream& in);
+        ~{{name}}();
+
+        /**
+         * Get the next token
+         *
+         * @throws NoMoreTokens if no more tokens are available
+         * @throws NoMatch if no match was found
+         */
+        Token nextToken();
+
+        /**
+         * Skip the following `n` bytes.
+         *
+         * @param n The number of bytes to skip
+         */
+        void skip(std::size_t n);
+
+        /**
+         * Peek at the current head of the input stream, useful in error reporting when a character mismatches for example
+         *
+         * @throws NoMoreTokens if the input stream is at an end
+         */
+        char peek();
+
+        /**
+         * Get the current byte offset
+         */
+        std::size_t getByteOffset();
+
+    private:
+        std::size_t m_offset;
+        std::istream& m_input;
+};
+
+#endif //LEXER_{{name}}_H