Documentation

This commit is contained in:
Robin Jadoul 2016-05-27 19:08:36 +02:00
parent 23437e8f23
commit 782c7a8649
12 changed files with 295 additions and 9 deletions

View File

@ -1,3 +1,8 @@
/**
* Lexesis/automata.h
*
* A file describing basic automata (DFA, NFA and e-NFA), and some operations on them.
*/
#pragma once
#ifndef AUTOMATA_H
#define AUTOMATA_H
@ -14,6 +19,18 @@ namespace lxs {
typedef unsigned long long Priority;
const State deadState = ULLONG_MAX;
/**
* A basic automaton, the basis for DFAs, NFAs and epsilon-NFAs
* The states are implicit, numbered from 0 to numStates - 1
* The starting state can be specified, though some methods probably assume it is 0
*
* The priority and acceptingToken are associated with accepting states
* The lower the priority value, the more important the token.
*
* No transitions are specified yet, since those are the main point of difference between the different FAs
*
* The alphabet is always considered to be every char from 0 to 255
*/
struct Automaton {
State numStates = 0;
std::set<State> accepting;
@ -22,28 +39,68 @@ namespace lxs {
State starting;
};
/**
* A Deterministic finite automaton
* An automaton which should have exactly one transition per state per char
*/
struct DFA : public Automaton {
std::map<State, std::map<char, State> > delta;
};
/**
* A nondeterministic FA
* Has an arbitrary number of transitions per state per char
*/
struct NFA : public Automaton {
std::map<State, std::map<char, std::set<State> > > delta;
/**
* Compute the epsilon closure for a state
* Returns {s} for a normal NFA, since it has no epsilon transitions
*/
virtual std::set<State> eClose(State) const;
};
/**
* An epsilon NFA
* In addition to a normal NFA, can have 'free'/epsilon transitions which do not require a char
*/
struct ENFA : public NFA {
std::map<State, std::set<State> > epsilonTransitions;
virtual std::set<State> eClose(State) const;
};
/**
* Convert a DFA to Graphviz dot format; can be useful when debugging
*/
std::string toDot(const DFA& d);
/**
* Convert an NFA to Graphviz dot format; can be useful when debugging
*/
std::string toDot(const NFA& n);
/**
* Convert an ENFA to Graphviz dot format; can be useful when debugging
*/
std::string toDot(const ENFA& e);
/**
* Merge a collection of ENFAs by adding a new starting state in front and connecting it to the old starting states with epsilon transitions
*/
ENFA merge(const std::vector<ENFA>& enfas);
/**
* Modified subset construction: convert an (E)NFA to a DFA
* Takes priorities and acceptingTokens into consideration
*/
DFA mssc(const NFA& e);
/**
* Minimize a DFA
* Takes priorities and acceptingTokens into consideration, so that two accepting states with different acceptingTokens are never merged
*/
DFA minimize(const DFA& d);
} //namespace lxs
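
As a rough sketch of how this header can be used (not code from the repository; it only relies on the members and functions declared above, and leaves the priority/acceptingToken bookkeeping out):

```cpp
#include "Lexesis/automata.h"
#include <iostream>

int main() {
    // Hand-built DFA over {a, b} that accepts every string ending in 'b':
    // state 0 --a--> 0, 0 --b--> 1, 1 --a--> 0, 1 --b--> 1
    lxs::DFA dfa;
    dfa.numStates = 2;
    dfa.starting  = 0;
    dfa.accepting = {1};
    dfa.delta[0]['a'] = 0;
    dfa.delta[0]['b'] = 1;
    dfa.delta[1]['a'] = 0;
    dfa.delta[1]['b'] = 1;

    // Dump the automaton in Graphviz dot format for inspection.
    std::cout << lxs::toDot(dfa) << std::endl;
    return 0;
}
```

In the real pipeline, the ENFAs built from the token rules are instead combined with merge(), determinized with mssc() and reduced with minimize() before being handed to a backend.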

View File

@ -12,20 +12,62 @@
#include <string>
namespace lxs {
/**
* A general interface for a Lexesis backend
*/
class Backend {
public:
/**
* Constructor
*/
Backend();
/**
* Destructor
*/
virtual ~Backend();
/**
* Report a name for the backend,
* used when resolving template paths
*
* @return std::string A name for the backend
*/
virtual std::string getName() = 0;
/**
* Can this backend process the language with the given description?
*
* @param lang A description of a language (e.g. "c++", "cxx", "cpp")
* @return Whether this backend can process it
*/
virtual bool canProcessLang(std::string lang);
/**
* The function that gets called to generate the actual lexer
*
* @param getOstreamForFileName A function that takes a filename and returns a std::ostream that the backend can write to for that filename
* @param lexerName The name that should be given to the lexer
* @param dfa The automaton that should be used in generating the lexer
*/
virtual void generateLexer(std::function<std::unique_ptr<std::ostream>(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa) = 0;
protected:
/**
* Render a template (with given (file)name) to the given ostream with the information provided
*
* @param out The ostream to write the rendered template to
* @param templateName An identifier for the template, which is combined with `getName()` to construct the actual path
* @param context The information that should be provided to the template when rendering
*/
void doTemplate(std::ostream& out, std::string templateName, templ::TemplateContext context);
private:
/**
* Find the template with given name
*
* @param templateName The template name; gets combined with `getName()`
*/
std::string findTemplate(std::string templateName);
};
}
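
To illustrate the interface, a minimal (hypothetical) backend could look roughly like this; the class name and the emitted file are invented for the example:

```cpp
#include "Lexesis/automata.h"
#include "Lexesis/backend.h"

#include <functional>
#include <memory>
#include <ostream>
#include <string>

class DummyBackend : public lxs::Backend {
public:
    std::string getName() override { return "dummy"; }

    bool canProcessLang(std::string lang) override { return lang == "dummy"; }

    void generateLexer(std::function<std::unique_ptr<std::ostream>(std::string)> getOstreamForFileName,
                       std::string lexerName, const lxs::DFA& dfa) override {
        // Ask the driver for an output stream and write a stub into it.
        std::unique_ptr<std::ostream> out = getOstreamForFileName(lexerName + ".txt");
        *out << "Lexer '" << lexerName << "' with " << dfa.numStates << " states\n";
    }
};
```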

View File

@ -9,14 +9,30 @@
#include "Lexesis/backend.h"
namespace lxs {
/**
* A manager for backends
* Aggregates backends and allows searching for one that can process a specific language
*/
class BackendManager {
public:
/**
* Add a backend to the list of registered backends
*
* @param backend The backend to register
*/
void registerBackend(std::unique_ptr<Backend> backend);
/**
* Get a backend that can process the given language
* The manager retains ownership of the returned pointer
*
* @param lang The language the backend should be able to process
* @returns A pointer to a Backend if it can find one, nullptr otherwise
*/
Backend* findBackendForLang(std::string lang);
private:
std::vector<std::unique_ptr<Backend> > m_backends; ///< The list of registered backends
};
}
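
A short usage sketch; the cpp.h header path is an assumption, and the "c++" description is taken from the examples in backend.h:

```cpp
#include "Lexesis/backendmanager.h"
#include "Lexesis/backends/cpp.h" // assumed header location for CppBackend

#include <memory>

int main() {
    lxs::BackendManager manager;
    manager.registerBackend(std::unique_ptr<lxs::Backend>(new lxs::backends::CppBackend()));

    // The manager keeps ownership of the backend; the returned pointer
    // is non-owning and may be nullptr if no backend matches.
    lxs::Backend* backend = manager.findBackendForLang("c++");
    return backend ? 0 : 1;
}
```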

View File

@ -8,6 +8,9 @@ namespace lxs
{
namespace backends
{
/**
* A backend that emits C++ code
*/
class CppBackend : public Backend {
public:
CppBackend();
@ -19,9 +22,29 @@ namespace backends
virtual void generateLexer(std::function<std::unique_ptr<std::ostream>(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa);
private:
/**
* Build a TemplateContext that represents the transition table
*
* @param transition_idx \see buildTransitionIndices
*/
templ::TemplateContext buildTable(const DFA& dfa, const std::vector<unsigned char>& transition_idx, int num_transitions_per_state) const;
/**
* Build a TemplateContext that represents the list of tokens associated with each state
*/
templ::TemplateContext buildTokenList(const DFA& dfa) const;
/**
* For compression of the table, build a list that maps each char to an index
* This way, whenever multiple chars always represent the same transitions, they can get the same index, and the table is smaller
*
* @return a pair with the list and the number of distinct indices
*/
std::pair<std::vector<unsigned char>, int> buildTransitionIndices(const DFA& dfa) const;
/**
* Transform the given indices (\see buildTransitionIndices) to a usable TemplateContext
*/
templ::TemplateContext transformTransitionIndices(const std::vector<unsigned char>& transition_indices) const;
};
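
To make the compression idea concrete, here is a standalone sketch of the char-to-index mapping (a hypothetical buildColumns helper, not the actual buildTransitionIndices implementation); it only assumes the DFA layout from automata.h:

```cpp
#include "Lexesis/automata.h"

#include <map>
#include <utility>
#include <vector>

// Assign every char a column index so that two chars share an index exactly
// when every state maps them to the same successor (deadState if delta has
// no entry for that state/char pair).
std::pair<std::vector<unsigned char>, int> buildColumns(const lxs::DFA& dfa) {
    std::vector<unsigned char> column(256, 0);
    std::map<std::vector<lxs::State>, unsigned char> seen; // successor column -> index
    for (int c = 0; c < 256; c++) {
        std::vector<lxs::State> successors;
        for (lxs::State s = 0; s < dfa.numStates; s++) {
            lxs::State next = lxs::deadState;
            auto row = dfa.delta.find(s);
            if (row != dfa.delta.end()) {
                auto tr = row->second.find((char) c);
                if (tr != row->second.end())
                    next = tr->second;
            }
            successors.push_back(next);
        }
        // Reuse the index of an identical column, or hand out a new one.
        auto ins = seen.insert(std::make_pair(successors, (unsigned char) seen.size()));
        column[(size_t) c] = ins.first->second;
    }
    return std::make_pair(column, (int) seen.size());
}
```

Since there are only 256 chars there can never be more than 256 distinct columns, so an unsigned char index is always sufficient.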

View File

@ -8,11 +8,32 @@
#include "Lexesis/backendmanager.h"
namespace lxs {
/**
* The main driver for Lexesis
*/
class Driver {
public:
/**
* Constructor
*
* @param backends The BackendManager, prepared with all supported backends
* @param inputfile An istream from which the token rule specification is read
* @param outputdir A string representing the directory where generated files should be placed
* @param language The language to generate output for (backends is queried for this language)
* @param lexername The name to give to the generated lexer; this gets cleaned so that it only contains alphanumeric chars or underscores and starts with a non-digit (i.e. a valid identifier)
*/
Driver(std::unique_ptr<BackendManager> backends, std::istream& inputfile, std::string outputdir, std::string language, std::string lexername);
/**
* Destructor
*/
~Driver();
/**
* Run this driver; all preparation should already have happened in the constructor
*
* @return The status code this would return if it were a main function
*/
int run();
private:

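Putting the pieces together, a main() along these lines could drive the whole pipeline; the driver.h and cpp.h header paths, the rules filename and the lexer name are placeholders:

```cpp
#include "Lexesis/backendmanager.h"
#include "Lexesis/backends/cpp.h" // assumed header location for CppBackend
#include "Lexesis/driver.h"       // assumed header location for Driver

#include <fstream>
#include <memory>
#include <utility>

int main() {
    std::unique_ptr<lxs::BackendManager> backends(new lxs::BackendManager());
    backends->registerBackend(std::unique_ptr<lxs::Backend>(new lxs::backends::CppBackend()));

    std::ifstream rules("tokens.lxs"); // placeholder token rules file
    lxs::Driver driver(std::move(backends), rules, "generated/", "c++", "MyLexer");
    return driver.run();
}
```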
View File

@ -10,11 +10,24 @@ namespace lxs {
struct DFA;
struct ENFA;
/**
* Used for parsing token rules
*/
class InputParser {
public:
/**
* Parse the token rules read from `is` and return the minimized DFA constructed from those rules
*/
static DFA parseInput(std::istream& is);
private:
/**
* Parse the lines and return pairs of (token type, regex)
*/
static std::vector<std::pair<std::string,std::string> > parseLines(std::istream &is);
/**
* Convert the lines from `parseLines` to ENFAs
*/
static std::vector<ENFA> linesToEnfa(std::vector<std::pair<std::string,std::string> > &input);
};
}
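
For illustration only; the header name is assumed, and the literal rule set is a placeholder since the exact syntax accepted by parseLines is defined in the implementation (roughly one token-name/regex pair per line):

```cpp
#include "Lexesis/automata.h"
#include "Lexesis/inputparser.h" // assumed header location for InputParser

#include <sstream>

int main() {
    // Placeholder rules: a token name followed by its regular expression.
    std::istringstream rules(
        "IDENT  [a-zA-Z_][a-zA-Z0-9_]*\n"
        "NUMBER [0-9]+\n");

    lxs::DFA dfa = lxs::InputParser::parseInput(rules);
    return dfa.numStates > 0 ? 0 : 1;
}
```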

View File

@ -8,14 +8,24 @@
#include <stdexcept>
namespace lxs {
/**
* An abstract regular expression
*/
class RE
{
public:
virtual ~RE() {}
/**
* Convert this regex to an ENFA
* This extends the given ENFA and attaches itself to the given `attach` state
*/
virtual State toENFA(ENFA& enfa, State attach) = 0;
virtual std::string toRe() = 0;
};
/**
* A regex for the empty language
*/
class EmptyRE : public RE
{
public:
@ -25,6 +35,9 @@ namespace lxs {
virtual std::string toRe();
};
/**
* A regex for the language containing only the empty string
*/
class EpsilonRE : public RE
{
public:
@ -34,9 +47,15 @@ namespace lxs {
virtual std::string toRe();
};
/**
* A regex for the language containing a single character
*/
class SingleRE : public RE
{
public:
/**
* @param c The character of the language
*/
SingleRE(char c) : c(c) {}
~SingleRE() {}
virtual State toENFA(ENFA& enfa, State attach);
@ -45,9 +64,15 @@ namespace lxs {
char c;
};
/**
* A regex for the language containing multiple single-symbol strings
*/
class MultiRE : public RE
{
public:
/**
* @param chars The list of symbols contained in the language
*/
MultiRE(std::vector<char> chars) : chars(std::move(chars)) {}
~MultiRE() {}
virtual State toENFA(ENFA& enfa, State attach);
@ -56,9 +81,16 @@ namespace lxs {
std::vector<char> chars;
};
/**
* A regex for the concatenation of two languages
*/
class ConcatRE : public RE
{
public:
/**
* @param e The first language
* @param f The second language
*/
ConcatRE(std::shared_ptr<RE> e, std::shared_ptr<RE> f) : e(e), f(f) {}
~ConcatRE() {}
virtual State toENFA(ENFA& enfa, State attach);
@ -67,9 +99,15 @@ namespace lxs {
std::shared_ptr<RE> e, f;
};
/**
* The regex for the Kleene star of a language
*/
class StarRE : public RE
{
public:
/**
* @param e The language to apply the star to
*/
StarRE(std::shared_ptr<RE> e) : e(e) {}
~StarRE() {}
virtual State toENFA(ENFA& enfa, State attach);
@ -78,9 +116,16 @@ namespace lxs {
std::shared_ptr<RE> e;
};
/**
* A regex for the sum/disjunction of two languages
*/
class PlusRE : public RE
{
public:
/**
* @param e The first language
* @param f The second language
*/
PlusRE(std::shared_ptr<RE> e, std::shared_ptr<RE> f) : e(e), f(f) {}
~PlusRE() {}
virtual State toENFA(ENFA& enfa, State attach);
@ -89,8 +134,19 @@ namespace lxs {
std::shared_ptr<RE> e, f;
};
/**
* Parse the given regular expression and return the associated Regex
*
* @param input The regular expression to parse
* @returns An abstract representation of `input`
*
* @throws SyntaxError if the regex is invalid; the `what()` method contains some information about the problem.
*/
std::shared_ptr<RE> parseRE(const std::string& input);
/**
* An exception to represent a syntax error in a regular expression
*/
class SyntaxError : public std::runtime_error
{
public:

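A quick sketch of exercising the regex layer on its own; the header name is assumed, the regex literal is arbitrary, and the sketch assumes the State returned by toENFA is the end state of the construction:

```cpp
#include "Lexesis/automata.h"
#include "Lexesis/regex.h" // assumed header location for the RE classes and parseRE

#include <iostream>
#include <memory>

int main() {
    try {
        // Parse a regex into an RE tree, print it back, and lower it onto a
        // fresh ENFA attached to its single starting state.
        std::shared_ptr<lxs::RE> re = lxs::parseRE("ab*");
        std::cout << re->toRe() << std::endl;

        lxs::ENFA enfa;
        enfa.numStates = 1;
        enfa.starting = 0;
        lxs::State end = re->toENFA(enfa, 0); // assumed to return the end state
        enfa.accepting.insert(end);
    } catch (const lxs::SyntaxError& err) {
        std::cerr << err.what() << std::endl;
        return 1;
    }
    return 0;
}
```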
View File

@ -9,17 +9,49 @@
namespace lxs {
namespace templ {
/**
* A generic, modifiable data structure holding the information passed to templates
*/
using TemplateContext = mstch::node;
/**
* Make a TemplateContext string
*/
TemplateContext make_string(std::string);
/**
* Make a TemplateContext map/dictionary
*/
TemplateContext make_map(std::map<const std::string, TemplateContext>);
/**
* Make a TemplateContext array/vector
*/
TemplateContext make_array(std::vector<TemplateContext>);
/**
* A generic wrapper around whichever templating system gets used
*/
class Template {
public:
/**
* Construct a Template from given filename
*
* @param filename The name of the file which contains the template rules
*/
Template(std::string filename);
/**
* Destructor
*/
~Template();
/**
* Render this template to `out` using the information in `context`
*
* @param out The ostream to render to
* @param context The information to provide the template rules while rendering
*/
void render(std::ostream& out, TemplateContext& context);
private:
std::string m_filename;

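A short sketch of how the helpers compose; the template.h header name and the template path and keys are placeholders:

```cpp
#include "Lexesis/template.h" // assumed header location for templ::Template

#include <iostream>

int main() {
    using namespace lxs::templ;

    // Build a nested context: { "lexername": "MyLexer", "states": ["0", "1"] }
    TemplateContext context = make_map({
        {"lexername", make_string("MyLexer")},
        {"states", make_array({make_string("0"), make_string("1")})}
    });

    Template tmpl("templates/lexer.tpl"); // placeholder template file
    tmpl.render(std::cout, context);
    return 0;
}
```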
View File

@ -148,6 +148,9 @@ namespace lxs {
return rev;
}
/**
* Put all states of `d` that are reachable from `s` into `reachable`
*/
void markReachable(const DFA& d, State s, std::set<State>& reachable) {
if (reachable.count(s) > 0)
return;
@ -298,6 +301,9 @@ namespace lxs {
namespace { // Utility function for mssc
/**
* Get the next state set by taking the union of all states reachable in the NFA by following `symbol` from any state in `oldstate`
*/
std::set<State> getNextState(const std::set<State>& oldstate, char symbol, const NFA& e) {
std::set<State> states;
for(const auto &state: oldstate) {

View File

@ -4,6 +4,8 @@
#include <iostream>
namespace {
//Some shortcut utility functions for creating a TemplateContext
lxs::templ::TemplateContext make_map_elem(std::string key, std::string value) {
return lxs::templ::make_map({{key, lxs::templ::make_string(value)}});
}

View File

@ -11,7 +11,7 @@ namespace {
std::string clean(std::string in) {
std::string s;
for (char c : in) {
if ((s.length() && std::isalnum(c)) || std::isalpha(c) || c == '_')
s += c;
}
return s;
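
With the underscore now allowed, the cleaning step behaves roughly as follows (illustrative traces, not tests from the repository):

```cpp
// clean("my lexer-1") -> "mylexer1"  (spaces and '-' are dropped; digits are kept once a letter has been seen)
// clean("1st_lexer")  -> "st_lexer"  (the leading digit is dropped so the result starts with a non-digit)
// clean("***")        -> ""          (nothing usable remains; run() below then reports the error)
```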
@ -25,17 +25,17 @@ namespace lxs {
m_outputdir(outputdir),
m_language(language),
m_lexername(clean(lexername))
{}
Driver::~Driver()
{}
int Driver::run() {
if (!m_lexername.length()) {
std::cerr << "No valid lexer name possible" << std::endl;
return 1;
}
Backend* back = m_backends->findBackendForLang(m_language);
if (!back) {
std::cerr << "Could not find a valid backend for language " << m_language << std::endl;

View File

@ -115,6 +115,9 @@ namespace lxs {
}
namespace {
/**
* Take the top two elements from `stk` and combine them with a ConcatRE
*/
void compress(stack<std::shared_ptr<RE>>& stk)
{
std::shared_ptr<RE> a = stk.top();
@ -124,6 +127,9 @@ namespace lxs {
stk.push(std::make_shared<ConcatRE>(b, a)); //Attention: reversed order because of stack
}
/**
* Apply `compress` until only one RE remains on the stack
*/
void compactStack(stack<std::shared_ptr<RE> >& stk)
{
if (stk.empty()) return;
@ -136,6 +142,9 @@ namespace lxs {
stk.push(tp);
}
/**
* Get the actual char that should be used when `c` is placed after a backslash
*/
char parseEscapeChar(char c) {
switch (c)
{
@ -175,6 +184,9 @@ namespace lxs {
return c;
}
/**
* Parse a character class
*/
std::shared_ptr<RE> parseCharacterClass(const string& input, size_t& idx) {
if (idx >= input.size())
throw SyntaxError("Unclosed character class");
@ -257,6 +269,9 @@ namespace lxs {
return std::make_shared<MultiRE>(chars);
}
/**
* Return the RE for the `.` pattern: everything except a newline
*/
std::shared_ptr<RE> dotChar() {
std::vector<char> any;
for (int i = 0; i < 256; i++)
@ -265,6 +280,9 @@ namespace lxs {
return std::make_shared<MultiRE>(any);
}
/**
* Parse the actual regex
*/
std::shared_ptr<RE> parseRE(const string& input, size_t& idx)
{
stack<std::shared_ptr<RE> > stk;