From 3d59a970f835f95baada211294167b3273badc0b Mon Sep 17 00:00:00 2001
From: Robin Jadoul
Date: Wed, 25 May 2016 18:49:15 +0200
Subject: [PATCH] First implementation for c++ backend, waiting for template and main integration

---
 include/Lexesis/backend.h      |   5 +-
 include/Lexesis/backends/cpp.h |  30 ++++++++++
 src/backends/cpp.cpp           | 101 +++++++++++++++++++++++++++++++++
 templates/c++/lexer.cpp        |  10 ++--
 4 files changed, 140 insertions(+), 6 deletions(-)
 create mode 100644 include/Lexesis/backends/cpp.h
 create mode 100644 src/backends/cpp.cpp

diff --git a/include/Lexesis/backend.h b/include/Lexesis/backend.h
index cecc5fe..d30ffa7 100644
--- a/include/Lexesis/backend.h
+++ b/include/Lexesis/backend.h
@@ -5,7 +5,10 @@
 #include "Lexesis/automata.h"
 #include "Lexesis/template.h"
 
+#include "mstch/mstch.hpp"
+
 #include <functional>
+#include <memory>
 #include <string>
 
 namespace lxs {
@@ -17,7 +20,7 @@
         virtual std::string getName() = 0;
 
         virtual bool canProcessLang(std::string lang);
-        virtual void generateLexer(std::function<std::ostream&(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa) = 0;
+        virtual void generateLexer(std::function<std::unique_ptr<std::ostream>(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa) = 0;
 
     protected:
         void doTemplate(std::ostream& out, std::string templateName, templ::TemplateContext context);
diff --git a/include/Lexesis/backends/cpp.h b/include/Lexesis/backends/cpp.h
new file mode 100644
index 0000000..f6ddc63
--- /dev/null
+++ b/include/Lexesis/backends/cpp.h
@@ -0,0 +1,30 @@
+#pragma once
+#ifndef LEXESIS_BACKENDS_CPP_H
+#define LEXESIS_BACKENDS_CPP_H
+
+#include "Lexesis/backend.h"
+
+namespace lxs
+{
+namespace backends
+{
+    class CppBackend : public Backend {
+        public:
+            CppBackend();
+            virtual ~CppBackend();
+
+            virtual bool canProcessLang(std::string lang);
+
+            virtual void generateLexer(std::function<std::unique_ptr<std::ostream>(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa);
+
+        private:
+            templ::TemplateContext buildTable(const DFA& dfa, const std::vector<unsigned char>& transition_idx, int num_transitions_per_state) const;
+            templ::TemplateContext buildTokenList(const DFA& dfa) const;
+            std::pair<std::vector<unsigned char>, int> buildTransitionIndices(const DFA& dfa) const;
+            templ::TemplateContext transformTransitionIndices(std::vector<unsigned char> transition_indices) const;
+    };
+
+}
+}
+
+#endif //LEXESIS_BACKENDS_CPP_H
diff --git a/src/backends/cpp.cpp b/src/backends/cpp.cpp
new file mode 100644
index 0000000..a149280
--- /dev/null
+++ b/src/backends/cpp.cpp
@@ -0,0 +1,101 @@
+#include "Lexesis/backends/cpp.h"
+
+#include <cassert>
+#include <cctype>
+#include <set>
+
+namespace lxs { namespace backends {
+
+    CppBackend::CppBackend() : Backend()
+    {}
+
+    CppBackend::~CppBackend()
+    {}
+
+    bool CppBackend::canProcessLang(std::string lang) {
+        for (char& c : lang)
+            c = std::tolower(c);
+        return lang == "c++" || lang == "cpp" || lang == "cxx";
+    }
+
+    void CppBackend::generateLexer(
+            std::function<std::unique_ptr<std::ostream>(std::string)> getOstreamForFileName,
+            std::string lexerName,
+            const DFA& dfa)
+    {
+        assert(lexerName.length());
+
+        std::unique_ptr<std::ostream> headerStream = getOstreamForFileName(lexerName + ".h");
+        std::unique_ptr<std::ostream> implementationStream = getOstreamForFileName(lexerName + ".cpp");
+
+        std::map<std::string, templ::TemplateContext> topLevel;
+
+        lexerName[0] = std::toupper(lexerName[0]);
+        topLevel["name"] = templ::make_string(lexerName);
+
+        //The DEADSTATE gets a brand new state: dfa.numStates
+        topLevel["reject_state"] = templ::make_string(std::to_string(dfa.numStates));
+        topLevel["num_states"] = templ::make_string(std::to_string(dfa.numStates + 1));
+
+        auto transition_indices = buildTransitionIndices(dfa);
+        topLevel["trans_idx"] = transformTransitionIndices(transition_indices.first);
+        topLevel["num_transitions_per_state"] = templ::make_string(std::to_string(transition_indices.second));
+
+        topLevel["table"] = buildTable(dfa, transition_indices.first, transition_indices.second);
+
+        topLevel["token_types"] = buildTokenList(dfa);
+
+        templ::TemplateContext topLevelMap = templ::make_map(topLevel);
+
+        doTemplate(*headerStream, "c++/lexer.h", topLevelMap);
+        doTemplate(*implementationStream, "c++/lexer.cpp", topLevelMap);
+    }
+
+    templ::TemplateContext CppBackend::buildTable(const DFA& dfa, const std::vector<unsigned char>& transition_idx, int num_transitions_per_state) const {
+        std::map<unsigned char, int> reverse_trans;
+        for (int i = 0; i < 256; i++) {
+            reverse_trans[transition_idx[i]] = i;
+        }
+
+        std::vector<templ::TemplateContext> table;
+
+        for (State s = 0; s < dfa.numStates; s++) {
+            std::vector<templ::TemplateContext> row;
+            for (int i = 0; i < num_transitions_per_state; i++) {
+                State to = dfa.delta.find(s)->second.find(reverse_trans[i])->second;
+                row.push_back(templ::make_map({{"state", templ::make_string(std::to_string(to))}}));
+            }
+            table.push_back(templ::make_map({{"row", templ::make_array(row)}})); //expose the row under the {{#row}} key
+        }
+
+        return templ::make_array(table);
+    }
+
+    templ::TemplateContext CppBackend::buildTokenList(const DFA& dfa) const {
+        std::set<std::string> tokens;
+        for (const auto& pr : dfa.acceptingToken) {
+            tokens.insert(pr.second);
+        }
+        tokens.insert("ignore");
+
+        std::vector<templ::TemplateContext> tokenList;
+        for (const std::string& s : tokens) {
+            tokenList.push_back(templ::make_map({{"type", templ::make_string(s)}}));
+        }
+        return templ::make_array(tokenList);
+    }
+
+    std::pair<std::vector<unsigned char>, int> CppBackend::buildTransitionIndices(const DFA& /* dfa */) const {
+        //FIXME: this is not really optimal ;-)
+        std::vector<unsigned char> transition_idx;
+        for (int i = 0; i < 256; i++)
+            transition_idx.push_back(i);
+        return std::make_pair(transition_idx, 256);
+    }
+
+    templ::TemplateContext CppBackend::transformTransitionIndices(std::vector<unsigned char> transition_indices) const {
+        std::vector<templ::TemplateContext> new_trans;
+        for (auto& i : transition_indices) {
+            new_trans.push_back(templ::make_map({{"trans", templ::make_string("(char)" + std::to_string(i))}}));
+        }
+        return templ::make_array(new_trans);
+    }
+
+} } //namespace lxs::backends
diff --git a/templates/c++/lexer.cpp b/templates/c++/lexer.cpp
index 9ae9a41..975a959 100644
--- a/templates/c++/lexer.cpp
+++ b/templates/c++/lexer.cpp
@@ -4,11 +4,11 @@
 #include <string>
 
 namespace { //The automaton data
-    typedef unsigned long long State;
+    typedef std::size_t State;
 
     State REJECT = {{reject_state}};
 
-    char TRANS_IDX[256] = { {{#trans_idx}}{{trans}}, {{/trans_idx}} };
+    unsigned char TRANS_IDX[256] = { {{#trans_idx}}{{trans}}, {{/trans_idx}} };
     State TABLE[{{num_states}}][{{num_transitions_per_state}}] = {
         {{#table}}
         { {{#row}} {{state}}, {{/row}} },
@@ -42,7 +42,7 @@
 
         token += c;
 
-        state = TABLE[state][TRANS_IDX[c]];
+        state = TABLE[state][TRANS_IDX[(unsigned char)c]];
 
         if (TOKENS[state]) {
             match_length = token.length();
@@ -99,10 +99,10 @@ int main(int argc, char** argv) {
         while (true)
             std::cout << "Match: " << lex.nextToken().content << std::endl;
     }
-    catch (decltype(lex)::NoMoreTokens& err) {
+    catch ({{name}}::NoMoreTokens& err) {
         std::cout << "DONE, read " << lex.getByteOffset() << " bytes." << std::endl;
     }
-    catch (decltype(lex)::NoMatch& err) {
+    catch ({{name}}::NoMatch& err) {
         std::cout << "No match, " << lex.getByteOffset() << std::endl;
     }
 }
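
A note on the FIXME in buildTransitionIndices: with the identity mapping, TRANS_IDX is a no-op and TABLE keeps 256 columns per state. One possible follow-up, sketched below, is to collapse bytes into equivalence classes so that two bytes share a column whenever every state sends them to the same target. This sketch is not part of the patch: the function name compressTransitionIndices is made up for illustration, the State typedef and the dead-state argument are local stand-ins, and it assumes DFA::delta has the nested-map shape (state -> char -> state) that buildTable above already relies on.

// Standalone sketch only; see the assumptions stated above.
#include <cstddef>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

typedef std::size_t State;

// Returns the per-byte column index plus the number of distinct columns,
// mirroring the std::pair<std::vector<unsigned char>, int> shape used by
// CppBackend::buildTransitionIndices.
std::pair<std::vector<unsigned char>, int>
compressTransitionIndices(const std::map<State, std::map<char, State>>& delta,
                          State numStates, State deadState) {
    std::vector<unsigned char> transition_idx(256, 0);
    std::map<std::vector<State>, unsigned char> seen; // column contents -> assigned index
    int next_index = 0;

    for (int c = 0; c < 256; ++c) {
        // Build the full column for this byte: column[s] = target reached from state s.
        std::vector<State> column(numStates, deadState);
        for (State s = 0; s < numStates; ++s) {
            auto state_it = delta.find(s);
            if (state_it == delta.end()) continue;
            auto char_it = state_it->second.find(static_cast<char>(c));
            if (char_it != state_it->second.end())
                column[s] = char_it->second;
        }
        // Bytes with identical columns share one TRANS_IDX entry.
        auto it = seen.find(column);
        if (it == seen.end())
            it = seen.emplace(column, static_cast<unsigned char>(next_index++)).first;
        transition_idx[c] = it->second;
    }
    return std::make_pair(transition_idx, next_index);
}

int main() {
    // Toy DFA: from either state, 'a' and 'b' go to state 1; everything else is dead (state 2).
    std::map<State, std::map<char, State>> delta;
    delta[0]['a'] = 1;
    delta[0]['b'] = 1;
    delta[1]['a'] = 1;
    delta[1]['b'] = 1;

    auto result = compressTransitionIndices(delta, 2, 2);
    std::cout << "columns needed: " << result.second << "\n"; // prints 2 instead of 256
}

Since the generated lexer already takes the column count through {{num_transitions_per_state}} and sizes TABLE with it, only the backend side would need to change; TRANS_IDX entries would then hold class indices instead of raw byte values.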