From 782c7a8649324e9bb4007022ae7cad905399e062 Mon Sep 17 00:00:00 2001 From: Robin Jadoul Date: Fri, 27 May 2016 19:08:36 +0200 Subject: [PATCH] Documentation --- include/Lexesis/automata.h | 59 +++++++++++++++++++++++++++++++- include/Lexesis/backend.h | 42 +++++++++++++++++++++++ include/Lexesis/backendmanager.h | 18 +++++++++- include/Lexesis/backends/cpp.h | 23 +++++++++++++ include/Lexesis/driver.h | 21 ++++++++++++ include/Lexesis/inputparser.h | 13 +++++++ include/Lexesis/re.h | 56 ++++++++++++++++++++++++++++++ include/Lexesis/template.h | 32 +++++++++++++++++ src/automata.cpp | 6 ++++ src/backends/cpp.cpp | 2 ++ src/driver.cpp | 14 ++++---- src/re.cpp | 18 ++++++++++ 12 files changed, 295 insertions(+), 9 deletions(-) diff --git a/include/Lexesis/automata.h b/include/Lexesis/automata.h index 6874e35..355cd71 100644 --- a/include/Lexesis/automata.h +++ b/include/Lexesis/automata.h @@ -1,3 +1,8 @@ +/** + * Lexesis/automata.h + * + * A file describing basic automata (DFA, NFA and e-NFA), and some operations on them. + */ #pragma once #ifndef AUTOMATA_H #define AUTOMATA_H @@ -14,6 +19,18 @@ namespace lxs { typedef unsigned long long Priority; const State deadState = ULLONG_MAX; + /** + * A basic automaton, the basis for DFA's, NFA's and epsilon-NFA's + * The states are implicit, from 0 to numStates - 1 + * The starting state can be specified, though some methods probably assume it is 0 + * + * The priority and acceptingToken are associations with accepting states + * The lower the priority, the more important. 
+ * + * No transitions are specified yet, since that is the main point of difference between different FA's + * + * The alphabet is always considered every char from 0 to 255 + */ struct Automaton { State numStates = 0; std::set accepting; @@ -22,28 +39,68 @@ namespace lxs { State starting; }; + /** + * A Deterministic finite automaton + * An automaton which should have exactly one transition per state per char + */ struct DFA : public Automaton { std::map > delta; }; + /** + * A nondeterministic FA + * Has an arbitrary amount of transitions per state per char + */ struct NFA : public Automaton { std::map > > delta; + + /** + * compute the epsilon closure for a state + * Returns {s} for a normal NFA, since it has no epsilon transitions + */ virtual std::set eClose(State) const; }; + /** + * An epsilon NFA + * In addition to a normal NFA, can have 'free'/epsilon transitions which do not require a char + */ struct ENFA : public NFA { std::map > epsilonTransitions; - virtual std::set eClose(State) const; virtual std::set eClose(State) const; }; + /** + * Convert a DFA to graphviz dot format, can be useful when debugging + */ std::string toDot(const DFA& d); + + /** + * Convert a NFA to graphviz dot format, can be useful when debugging + */ std::string toDot(const NFA& n); + + /** + * Convert a ENFA to graphviz dot format, can be useful when debugging + */ std::string toDot(const ENFA& e); + /** + * Merge a collection of ENFA's by adding a new starting state in front and connecting it to the old starting states with an epsilon transition + */ ENFA merge(const std::vector& enfas); + + /** + * Modified subset construction: convert an (E)NFA to a DFA + * takes priorities and acceptingTokens into consideration + */ DFA mssc(const NFA& e); + + /** + * Minimize a DFA + * takes priorities and acceptingTokens into consideration to never merge two accepting states with a different acceptingToken + */ DFA minimize(const DFA& d); } //namespace lxs diff --git 
a/include/Lexesis/backend.h b/include/Lexesis/backend.h index d30ffa7..e260f1b 100644 --- a/include/Lexesis/backend.h +++ b/include/Lexesis/backend.h @@ -12,20 +12,62 @@ #include namespace lxs { + /** + * A general interface for a Lexesis backend + */ class Backend { public: + /** + * Constructor + */ Backend(); + + /** + * Destructor + */ virtual ~Backend(); + /** + * Report a name for the backend + * used in resolving template paths + * + * @return std::string A name for the backend + */ virtual std::string getName() = 0; + + /** + * Can this backend process the language with given description? + * + * @param lang A description for a language (eg. "c++", "cxx", "cpp") + * @return Can this backend process it + */ virtual bool canProcessLang(std::string lang); + /** + * The function that gets called to generate the actual lexer + * + * @param getOstreamForFileName A function that takes a filename and returns a std::ostream that the backend can write to for that filename + * @param lexerName The name that should be given to the lexer + * @param dfa The automaton that should be used in generating the lexer + */ virtual void generateLexer(std::function(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa) = 0; protected: + /** + * Render a template (with given (file)name) to the given ostream with the information provided + * + * @param out The ostream to write the rendered template to + * @param templateName An identifier for the template, is combined with `getName()` to construct the actual path + * @param context The information that should be provided to the template when rendering + */ void doTemplate(std::ostream& out, std::string templateName, templ::TemplateContext context); private: + /** + * Find the template with given name + * + * @param templateName the template name, gets combined with `getName()` + */ std::string findTemplate(std::string templateName); }; } diff --git a/include/Lexesis/backendmanager.h 
b/include/Lexesis/backendmanager.h index 73c993d..e734ec5 100644 --- a/include/Lexesis/backendmanager.h +++ b/include/Lexesis/backendmanager.h @@ -9,14 +9,30 @@ #include "Lexesis/backend.h" namespace lxs { + /** + * A manager for backends + * Aggregates and allows searching for backends that can process a specific language + */ class BackendManager { public: + /** + * Add a backend to the list of registered backends + * + * @param backend The backend to register + */ void registerBackend(std::unique_ptr backend); + /** + * Get a backend that can process the given language + * The manager retains ownership of the returned pointer + * + * @param lang The language the backend should be able to process + * @returns A pointer to a Backend if it can find one, nullptr otherwise + */ Backend* findBackendForLang(std::string lang); private: - std::vector > m_backends; + std::vector > m_backends; ///< The list of registered backends }; } diff --git a/include/Lexesis/backends/cpp.h b/include/Lexesis/backends/cpp.h index b7a15c1..0e86b1f 100644 --- a/include/Lexesis/backends/cpp.h +++ b/include/Lexesis/backends/cpp.h @@ -8,6 +8,9 @@ namespace lxs { namespace backends { + /** + * A backend that emits c++ code + */ class CppBackend : public Backend { public: CppBackend(); @@ -19,9 +22,29 @@ namespace backends virtual void generateLexer(std::function(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa); private: + /** + * Build a TemplateContext that represents the transition table + * + * @param transition_idx \see buildTransitionIndices + */ templ::TemplateContext buildTable(const DFA& dfa, const std::vector& transition_idx, int num_transitions_per_state) const; + + /** + * Build a TemplateContext that represents the list of associated tokens with each state + */ templ::TemplateContext buildTokenList(const DFA& dfa) const; + + /** + * For compression of the table, build a list that maps each char to an index + * This way, whenever multiple chars always 
represent the same transition, they can get the same index, and the table is smaller + * + * @return a pair with the list and the number of distinct indices + */ std::pair, int> buildTransitionIndices(const DFA& dfa) const; + + /** + * Transform the given indices (\see buildTransitionIndices) to a usable TemplateContext + */ templ::TemplateContext transformTransitionIndices(const std::vector& transition_indices) const; }; diff --git a/include/Lexesis/driver.h b/include/Lexesis/driver.h index c248432..6593801 100644 --- a/include/Lexesis/driver.h +++ b/include/Lexesis/driver.h @@ -8,11 +8,32 @@ #include "Lexesis/backendmanager.h" namespace lxs { + /** + * The main driver for Lexesis + */ class Driver { public: + /** + * Constructor + * + * @param backends The backendmanager, prepared with all needed supported backends + * @param inputfile An istream which should be read to be used as token rules specifications + * @param outputdir A string representing the directory where generated files should be placed + * @param language The language to generate output for (backends is queried for this language) + * @param lexername The name to give to the generated lexer, this gets cleaned to only contain alphanumeric chars or underscore and start with a non-digit (AKA a valid identifier) + */ Driver(std::unique_ptr backends, std::istream& inputfile, std::string outputdir, std::string language, std::string lexername); + + /** + * Destructor + */ ~Driver(); + /** + * Run this driver, all the preparation should happen when calling the constructor + * + * @return The status code this would return if it were a main function + */ int run(); private: diff --git a/include/Lexesis/inputparser.h b/include/Lexesis/inputparser.h index 4a50670..f9dbfda 100644 --- a/include/Lexesis/inputparser.h +++ b/include/Lexesis/inputparser.h @@ -10,11 +10,24 @@ namespace lxs { struct DFA; struct ENFA; + /** + * Used for parsing token rules + */ class InputParser { public: + /** + * parse the token 
rules read from `is` and return the minimized constructed dfa from those rules + */ static DFA parseInput(std::istream& is); private: + /** + * parse the lines and return pairs of (Token type, regex) + */ static std::vector > parseLines(std::istream &is); + + /** + * Convert the lines from `parseLines` to ENFA's + */ static std::vector linesToEnfa(std::vector > &input); }; } diff --git a/include/Lexesis/re.h b/include/Lexesis/re.h index 3d30bf3..2890e9f 100644 --- a/include/Lexesis/re.h +++ b/include/Lexesis/re.h @@ -8,14 +8,24 @@ #include namespace lxs { + /** + * An abstract regular expression + */ class RE { public: virtual ~RE() {} + /** + * Convert this regex to an ENFA + * This extends the given enfa, and attaches itself to the given `attach` state + */ virtual State toENFA(ENFA& enfa, State attach) = 0; virtual std::string toRe() = 0; }; + /** + * A regex for the empty language + */ class EmptyRE : public RE { public: @@ -25,6 +35,9 @@ namespace lxs { virtual std::string toRe(); }; + /** + * A regex for the language containing only the empty string + */ class EpsilonRE : public RE { public: @@ -34,9 +47,15 @@ namespace lxs { virtual std::string toRe(); }; + /** + * A regex for the language containing a single character + */ class SingleRE : public RE { public: + /** + * @param c The character of the language + */ SingleRE(char c) : c(c) {} ~SingleRE() {} virtual State toENFA(ENFA& enfa, State attach); @@ -45,9 +64,15 @@ namespace lxs { char c; }; + /** + * A regex for the language containing multiple single-symbol strings + */ class MultiRE : public RE { public: + /** + * @param chars The list of symbols contained in the language + */ MultiRE(std::vector chars) : chars(std::move(chars)) {} ~MultiRE() {} virtual State toENFA(ENFA& enfa, State attach); @@ -56,9 +81,16 @@ namespace lxs { std::vector chars; }; + /** + * A regex for the concatenation of two languages + */ class ConcatRE : public RE { public: + /** + * @param e The first language + * @param f The 
second language + */ ConcatRE(std::shared_ptr e, std::shared_ptr f) : e(e), f(f) {} ~ConcatRE() {} virtual State toENFA(ENFA& enfa, State attach); @@ -67,9 +99,15 @@ namespace lxs { std::shared_ptr e, f; }; + /** + * The regex for the kleene star of a language + */ class StarRE : public RE { public: + /** + * @param e The language to apply the star to + */ StarRE(std::shared_ptr e) : e(e) {} ~StarRE() {} virtual State toENFA(ENFA& enfa, State attach); @@ -78,9 +116,16 @@ namespace lxs { std::shared_ptr e; }; + /** + * A regex for the sum/disjunction of two languages + */ class PlusRE : public RE { public: + /** + * @param e The first language + * @param f The second language + */ PlusRE(std::shared_ptr e, std::shared_ptr f) : e(e), f(f) {} ~PlusRE() {} virtual State toENFA(ENFA& enfa, State attach); @@ -89,8 +134,19 @@ namespace lxs { std::shared_ptr e, f; }; + /** + * Parse the given regular expression and return the associated Regex + * + * @param input The regular expression to parse + * @returns An abstract representation of `input` + * + * @throws SyntaxError if the regex is invalid, the `what()` method contains some information on the problem. 
+ */ std::shared_ptr parseRE(const std::string& input); + /** + * An exception to represent a syntax error in a regular expression + */ class SyntaxError : public std::runtime_error { public: diff --git a/include/Lexesis/template.h b/include/Lexesis/template.h index 1831e34..3ed9451 100644 --- a/include/Lexesis/template.h +++ b/include/Lexesis/template.h @@ -9,17 +9,49 @@ namespace lxs { namespace templ { + /** + * A changeable information structure for templates + */ using TemplateContext = mstch::node; + /** + * Make a TemplateContext string + */ TemplateContext make_string(std::string); + + /** + * Make a TemplateContext map/dictionary + */ TemplateContext make_map(std::map); + + /** + * Make a TemplateContext array/vector + */ TemplateContext make_array(std::vector); + /** + * A generic wrapper around whichever templating system gets used + */ class Template { public: + /** + * Construct a Template from given filename + * + * @param filename The name of the file which contains the template rules + */ Template(std::string filename); + + /** + * Destructor + */ ~Template(); + /** + * Render this template to `out` using the information in `context` + * + * @param out The ostream to render to + * @param context The information to provide the template rules while rendering + */ void render(std::ostream& out, TemplateContext& context); private: std::string m_filename; diff --git a/src/automata.cpp b/src/automata.cpp index ee53f01..88fd8c2 100644 --- a/src/automata.cpp +++ b/src/automata.cpp @@ -148,6 +148,9 @@ namespace lxs { return rev; } + /** + * Put all reachable states in `d`, starting from `s` into `reachable` + */ void markReachable(const DFA& d, State s, std::set& reachable) { if (reachable.count(s) > 0) return; @@ -298,6 +301,9 @@ namespace lxs { namespace { // Utility function for mssc + /** + * Get the next state, by taking the union of all next states in the NFA by following any character in `oldstate` + */ std::set getNextState(const std::set& oldstate, 
char symbol, const NFA& e) { std::set states; for(const auto &state: oldstate) { diff --git a/src/backends/cpp.cpp b/src/backends/cpp.cpp index 8d194ac..96d5972 100644 --- a/src/backends/cpp.cpp +++ b/src/backends/cpp.cpp @@ -4,6 +4,8 @@ #include namespace { + //Some shortcut utility functions for creating a TemplateContext + lxs::templ::TemplateContext make_map_elem(std::string key, std::string value) { return lxs::templ::make_map({{key, lxs::templ::make_string(value)}}); } diff --git a/src/driver.cpp b/src/driver.cpp index 23d1ef7..08b2d68 100644 --- a/src/driver.cpp +++ b/src/driver.cpp @@ -11,7 +11,7 @@ namespace { std::string clean(std::string in) { std::string s; for (char c : in) { - if ((s.length() && std::isalnum(c)) || std::isalpha(c)) + if ((s.length() && std::isalnum(c)) || std::isalpha(c) || c == '_') s += c; } return s; @@ -25,17 +25,17 @@ namespace lxs { m_outputdir(outputdir), m_language(language), m_lexername(clean(lexername)) - { - if (!m_lexername.length()) { - std::cerr << "No valid lexer name possible" << std::endl; - exit(1); - } - } + {} Driver::~Driver() {} int Driver::run() { + if (!m_lexername.length()) { + std::cerr << "No valid lexer name possible" << std::endl; + return 1; + } + Backend* back = m_backends->findBackendForLang(m_language); if (!back) { std::cerr << "Could not find a valid backend for language " << m_language << std::endl; diff --git a/src/re.cpp b/src/re.cpp index 0c88afd..581f85b 100644 --- a/src/re.cpp +++ b/src/re.cpp @@ -115,6 +115,9 @@ namespace lxs { } namespace { + /** + * Take the two top elements from `stk` and combine them with a ConcatRE + */ void compress(stack>& stk) { std::shared_ptr a = stk.top(); @@ -124,6 +127,9 @@ namespace lxs { stk.push(std::make_shared(b, a)); //Attention: reversed order because of stack } + /** + * Apply compress until only one RE remains on the stack + */ void compactStack(stack >& stk) { if (stk.empty()) return; @@ -136,6 +142,9 @@ namespace lxs { stk.push(tp); } + /** + * Get the 
actual char that should be used when c is placed after a backslash + */ char parseEscapeChar(char c) { switch (c) { @@ -175,6 +184,9 @@ namespace lxs { return c; } + /** + * Parse a character class + */ std::shared_ptr parseCharacterClass(const string& input, size_t& idx) { if (idx >= input.size()) throw SyntaxError("Unclosed character class"); @@ -257,6 +269,9 @@ namespace lxs { return std::make_shared(chars); } + /** + * Return the RE for the `.` pattern: everything except a newline + */ std::shared_ptr dotChar() { std::vector any; for (int i = 0; i < 256; i++) @@ -265,6 +280,9 @@ namespace lxs { return std::make_shared(any); } + /** + * Parse the actual regex + */ std::shared_ptr parseRE(const string& input, size_t& idx) { stack > stk;