From 782c7a8649324e9bb4007022ae7cad905399e062 Mon Sep 17 00:00:00 2001 From: Robin Jadoul Date: Fri, 27 May 2016 19:08:36 +0200 Subject: [PATCH] Documentation --- include/Lexesis/automata.h | 59 +++++++++++++++++++++++++++++++- include/Lexesis/backend.h | 42 +++++++++++++++++++++++ include/Lexesis/backendmanager.h | 18 +++++++++- include/Lexesis/backends/cpp.h | 23 +++++++++++++ include/Lexesis/driver.h | 21 ++++++++++++ include/Lexesis/inputparser.h | 13 +++++++ include/Lexesis/re.h | 56 ++++++++++++++++++++++++++++++ include/Lexesis/template.h | 32 +++++++++++++++++ src/automata.cpp | 6 ++++ src/backends/cpp.cpp | 2 ++ src/driver.cpp | 14 ++++---- src/re.cpp | 18 ++++++++++ 12 files changed, 295 insertions(+), 9 deletions(-) diff --git a/include/Lexesis/automata.h b/include/Lexesis/automata.h index 6874e35..355cd71 100644 --- a/include/Lexesis/automata.h +++ b/include/Lexesis/automata.h @@ -1,3 +1,8 @@ +/** + * Lexesis/automata.h + * + * A file describing basic automata (DFA, NFA and e-NFA), and some operations on them. + */ #pragma once #ifndef AUTOMATA_H #define AUTOMATA_H @@ -14,6 +19,18 @@ namespace lxs { typedef unsigned long long Priority; const State deadState = ULLONG_MAX; + /** + * A basic automaton, the basis for DFA's, NFA's and epsilon-NFA's + * The states are implicit, from 0 to numStates - 1 + * The starting state can be specified, though some methods probably assume it is 0 + * + * The priority and acceptingToken are associations with accepting states + * The lower the priority, the more important. 
+ * + * No transitions are specified yet, since that is the main point of difference between different FA's + * + * The alphabet is always considered every char from 0 to 255 + */ struct Automaton { State numStates = 0; std::set accepting; @@ -22,28 +39,68 @@ namespace lxs { State starting; }; + /** + * A Deterministic finite automaton + * An automaton which should have exactly one transition per state per char + */ struct DFA : public Automaton { std::map > delta; }; + /** + * A nondeterministic FA + * Has an arbitrary amount of transitions per state per char + */ struct NFA : public Automaton { std::map > > delta; + + /** + * compute the epsilon closure for a state + * Returns {s} for a normal NFA, since it has no epsilon transitions + */ virtual std::set eClose(State) const; }; + /** + * An epsilon NFA + * In addition to a normal NFA, can have 'free'/epsilon transitions which do not require a char + */ struct ENFA : public NFA { std::map > epsilonTransitions; - virtual std::set eClose(State) const; virtual std::set eClose(State) const; }; + /** + * Convert a DFA to graphviz dot format, can be useful when debugging + */ std::string toDot(const DFA& d); + + /** + * Convert a NFA to graphviz dot format, can be useful when debugging + */ std::string toDot(const NFA& n); + + /** + * Convert a ENFA to graphviz dot format, can be useful when debugging + */ std::string toDot(const ENFA& e); + /** + * Merge a collection of ENFA's by adding a new starting state in front and connecting it to the old starting states with an epsilon transition + */ ENFA merge(const std::vector& enfas); + + /** + * Modified subset construction: convert an (E)NFA to a DFA + * takes priorities and acceptingTokens into consideration + */ DFA mssc(const NFA& e); + + /** + * Minimize a DFA + * takes priorities and acceptingTokens into consideration to never merge two accepting states with a different acceptingToken + */ DFA minimize(const DFA& d); } //namespace lxs diff --git 
a/include/Lexesis/backend.h b/include/Lexesis/backend.h index d30ffa7..e260f1b 100644 --- a/include/Lexesis/backend.h +++ b/include/Lexesis/backend.h @@ -12,20 +12,62 @@ #include namespace lxs { + /** + * A general interface for a Lexesis backend + */ class Backend { public: + /** + * Constructor + */ Backend(); + + /** + * Destructor + */ virtual ~Backend(); + /** + * Report a name for the backend + * used in resolving template paths + * + * @return std::string A name for the backend + */ virtual std::string getName() = 0; + + /** + * Can this backend process the language with given description? + * + * @param lang A description for a language (eg. "c++", "cxx", "cpp") + * @return Can this backend process it + */ virtual bool canProcessLang(std::string lang); + /** + * The function that gets called to generate the actual lexer + * + * @param getOstreamForFileName A function that takes a filename and returns a std::ostream that the backend can write to for that filename + * @param lexerName The name that should be given to the lexer + * @param dfa The automaton that should be used in generating the lexer + */ virtual void generateLexer(std::function(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa) = 0; protected: + /** + * Render a template (with given (file)name) to the given ostream with the information provided + * + * @param out The ostream to write the rendered template to + * @param templateName An identifier for the template, is combined with `getName()` to construct the actual path + * @param context The information that should be provided to the template when rendering + */ void doTemplate(std::ostream& out, std::string templateName, templ::TemplateContext context); private: + /** + * Find the template with given name + * + * @param templateName the template name, gets combined with `getName()` + */ std::string findTemplate(std::string templateName); }; } diff --git a/include/Lexesis/backendmanager.h 
b/include/Lexesis/backendmanager.h index 73c993d..e734ec5 100644 --- a/include/Lexesis/backendmanager.h +++ b/include/Lexesis/backendmanager.h @@ -9,14 +9,30 @@ #include "Lexesis/backend.h" namespace lxs { + /** + * A manager for backends + * Aggregates and allows searching for backends that can process a specific language + */ class BackendManager { public: + /** + * Add a backend to the list of registered backends + * + * @param backend The backend to register + */ void registerBackend(std::unique_ptr backend); + /** + * Get a backend that can process the given language + * The manager retains ownership of the returned pointer + * + * @param lang The language the backend should be able to process + * @returns A pointer to a Backend if it can find one, nullptr otherwise + */ Backend* findBackendForLang(std::string lang); private: - std::vector > m_backends; + std::vector > m_backends; ///< The list of registered backends }; } diff --git a/include/Lexesis/backends/cpp.h b/include/Lexesis/backends/cpp.h index b7a15c1..0e86b1f 100644 --- a/include/Lexesis/backends/cpp.h +++ b/include/Lexesis/backends/cpp.h @@ -8,6 +8,9 @@ namespace lxs { namespace backends { + /** + * A backend that emits c++ code + */ class CppBackend : public Backend { public: CppBackend(); @@ -19,9 +22,29 @@ namespace backends virtual void generateLexer(std::function(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa); private: + /** + * Build a TemplateContext that represents the transition table + * + * @param transition_idx \see buildTransitionIndices + */ templ::TemplateContext buildTable(const DFA& dfa, const std::vector& transition_idx, int num_transitions_per_state) const; + + /** + * Build a TemplateContext that represents the list of associated tokens with each state + */ templ::TemplateContext buildTokenList(const DFA& dfa) const; + + /** + * For compression of the table, build a list that maps each char to an index + * This way, whenever multiple chars always 
represent the same transition, they can get the same index, and the table is smaller + * + * @return a pair with the list and the number of distinct indices + */ std::pair, int> buildTransitionIndices(const DFA& dfa) const; + + /** + * Transform the given indices (\see buildTransitionIndices) to a usable TemplateContext + */ templ::TemplateContext transformTransitionIndices(const std::vector& transition_indices) const; }; diff --git a/include/Lexesis/driver.h b/include/Lexesis/driver.h index c248432..6593801 100644 --- a/include/Lexesis/driver.h +++ b/include/Lexesis/driver.h @@ -8,11 +8,32 @@ #include "Lexesis/backendmanager.h" namespace lxs { + /** + * The main driver for Lexesis + */ class Driver { public: + /** + * Constructor + * + * @param backends The backendmanager, prepared with all needed supported backends + * @param inputfile An istream which should be read to be used as token rules specifications + * @param outputdir A string representing the directory where generated files should be placed + * @param language The language to generate output for (backends is queried for this language) + * @param lexername The name to give to the generated lexer, this gets cleaned to only contain alphanumeric chars or underscore and start with a non-digit (AKA a valid identifier) + */ Driver(std::unique_ptr backends, std::istream& inputfile, std::string outputdir, std::string language, std::string lexername); + + /** + * Destructor + */ ~Driver(); + /** + * Run this driver, all the preparation should happen when calling the constructor + * + * @return The status code this would return if it were a main function + */ int run(); private: diff --git a/include/Lexesis/inputparser.h b/include/Lexesis/inputparser.h index 4a50670..f9dbfda 100644 --- a/include/Lexesis/inputparser.h +++ b/include/Lexesis/inputparser.h @@ -10,11 +10,24 @@ namespace lxs { struct DFA; struct ENFA; + /** + * Used for parsing token rules + */ class InputParser { public: + /** + * parse the token 
rules read from `is` and return the minimized constructed dfa from those rules + */ static DFA parseInput(std::istream& is); private: + /** + * parse the lines and return pairs of (Token type, regex) + */ static std::vector > parseLines(std::istream &is); + + /** + * Convert the lines from `parseLines` to ENFA's + */ static std::vector linesToEnfa(std::vector > &input); }; } diff --git a/include/Lexesis/re.h b/include/Lexesis/re.h index 3d30bf3..2890e9f 100644 --- a/include/Lexesis/re.h +++ b/include/Lexesis/re.h @@ -8,14 +8,24 @@ #include namespace lxs { + /** + * An abstract regular expression + */ class RE { public: virtual ~RE() {} + /** + * Convert this regex to an ENFA + * This extends the given enfa, and attaches itself to the given `attach` state + */ virtual State toENFA(ENFA& enfa, State attach) = 0; virtual std::string toRe() = 0; }; + /** + * A regex for the empty language + */ class EmptyRE : public RE { public: @@ -25,6 +35,9 @@ namespace lxs { virtual std::string toRe(); }; + /** + * A regex for the language containing only the empty string + */ class EpsilonRE : public RE { public: @@ -34,9 +47,15 @@ namespace lxs { virtual std::string toRe(); }; + /** + * A regex for the language containing a single character + */ class SingleRE : public RE { public: + /** + * @param c The character of the language + */ SingleRE(char c) : c(c) {} ~SingleRE() {} virtual State toENFA(ENFA& enfa, State attach); @@ -45,9 +64,15 @@ namespace lxs { char c; }; + /** + * A regex for the language containing multiple single-symbol strings + */ class MultiRE : public RE { public: + /** + * @param chars The list of symbols contained in the language + */ MultiRE(std::vector chars) : chars(std::move(chars)) {} ~MultiRE() {} virtual State toENFA(ENFA& enfa, State attach); @@ -56,9 +81,16 @@ namespace lxs { std::vector chars; }; + /** + * A regex for the concatenation of two languages + */ class ConcatRE : public RE { public: + /** + * @param e The first language + * @param f The 
second language + */ ConcatRE(std::shared_ptr e, std::shared_ptr f) : e(e), f(f) {} ~ConcatRE() {} virtual State toENFA(ENFA& enfa, State attach); @@ -67,9 +99,15 @@ namespace lxs { std::shared_ptr e, f; }; + /** + * The regex for the kleene star of a language + */ class StarRE : public RE { public: + /** + * @param e The language to apply the star to + */ StarRE(std::shared_ptr e) : e(e) {} ~StarRE() {} virtual State toENFA(ENFA& enfa, State attach); @@ -78,9 +116,16 @@ namespace lxs { std::shared_ptr e; }; + /** + * A regex for the sum/disjunction of two languages + */ class PlusRE : public RE { public: + /** + * @param e The first language + * @param f The second language + */ PlusRE(std::shared_ptr e, std::shared_ptr f) : e(e), f(f) {} ~PlusRE() {} virtual State toENFA(ENFA& enfa, State attach); @@ -89,8 +134,19 @@ namespace lxs { std::shared_ptr e, f; }; + /** + * Parse the given regular expression and return the associated Regex + * + * @param input The regular expression to parse + * @returns An abstract representation of `input` + * + * @throws SyntaxError if the regex is invalid, the `what()` method contains some information on the problem. 
+ */ std::shared_ptr parseRE(const std::string& input); + /** + * An exception to represent a syntax error in a regular expression + */ class SyntaxError : public std::runtime_error { public: diff --git a/include/Lexesis/template.h b/include/Lexesis/template.h index 1831e34..3ed9451 100644 --- a/include/Lexesis/template.h +++ b/include/Lexesis/template.h @@ -9,17 +9,49 @@ namespace lxs { namespace templ { + /** + * A changeable information structure for templates + */ using TemplateContext = mstch::node; + /** + * Make a TemplateContext string + */ TemplateContext make_string(std::string); + + /** + * Make a TemplateContext map/dictionary + */ TemplateContext make_map(std::map); + + /** + * Make a TemplateContext array/vector + */ TemplateContext make_array(std::vector); + /** + * A generic wrapper around whichever templating system gets used + */ class Template { public: + /** + * Construct a Template from given filename + * + * @param filename The name of the file which contains the template rules + */ Template(std::string filename); + + /** + * Destructor + */ ~Template(); + /** + * Render this template to `out` using the information in `context` + * + * @param out The ostream to render to + * @param context The information to provide the template rules while rendering + */ void render(std::ostream& out, TemplateContext& context); private: std::string m_filename; diff --git a/src/automata.cpp b/src/automata.cpp index ee53f01..88fd8c2 100644 --- a/src/automata.cpp +++ b/src/automata.cpp @@ -148,6 +148,9 @@ namespace lxs { return rev; } + /** + * Put all reachable states in `d`, starting from `s` into `reachable` + */ void markReachable(const DFA& d, State s, std::set& reachable) { if (reachable.count(s) > 0) return; @@ -298,6 +301,9 @@ namespace lxs { namespace { // Utility function for mssc + /** + * Get the next state, by taking the union of all next states in the NFA by following any character in `oldstate` + */ std::set getNextState(const std::set& oldstate, 
char symbol, const NFA& e) { std::set states; for(const auto &state: oldstate) { diff --git a/src/backends/cpp.cpp b/src/backends/cpp.cpp index 8d194ac..96d5972 100644 --- a/src/backends/cpp.cpp +++ b/src/backends/cpp.cpp @@ -4,6 +4,8 @@ #include namespace { + //Some shortcut utility functions for creating a TemplateContext + lxs::templ::TemplateContext make_map_elem(std::string key, std::string value) { return lxs::templ::make_map({{key, lxs::templ::make_string(value)}}); } diff --git a/src/driver.cpp b/src/driver.cpp index 23d1ef7..08b2d68 100644 --- a/src/driver.cpp +++ b/src/driver.cpp @@ -11,7 +11,7 @@ namespace { std::string clean(std::string in) { std::string s; for (char c : in) { - if ((s.length() && std::isalnum(c)) || std::isalpha(c)) + if ((s.length() && std::isalnum(c)) || std::isalpha(c) || c == '_') s += c; } return s; @@ -25,17 +25,17 @@ namespace lxs { m_outputdir(outputdir), m_language(language), m_lexername(clean(lexername)) - { - if (!m_lexername.length()) { - std::cerr << "No valid lexer name possible" << std::endl; - exit(1); - } - } + {} Driver::~Driver() {} int Driver::run() { + if (!m_lexername.length()) { + std::cerr << "No valid lexer name possible" << std::endl; + return 1; + } + Backend* back = m_backends->findBackendForLang(m_language); if (!back) { std::cerr << "Could not find a valid backend for language " << m_language << std::endl; diff --git a/src/re.cpp b/src/re.cpp index 0c88afd..581f85b 100644 --- a/src/re.cpp +++ b/src/re.cpp @@ -115,6 +115,9 @@ namespace lxs { } namespace { + /** + * Take the two top elements from `stk` and combine them with a ConcatRE + */ void compress(stack>& stk) { std::shared_ptr a = stk.top(); @@ -124,6 +127,9 @@ namespace lxs { stk.push(std::make_shared(b, a)); //Attention: reversed order because of stack } + /** + * Apply compress until only one RE remains on the stack + */ void compactStack(stack >& stk) { if (stk.empty()) return; @@ -136,6 +142,9 @@ namespace lxs { stk.push(tp); } + /** + * Get the 
actual char that should be used when c is placed after a backslash + */ char parseEscapeChar(char c) { switch (c) { @@ -175,6 +184,9 @@ namespace lxs { return c; } + /** + * Parse a character class + */ std::shared_ptr parseCharacterClass(const string& input, size_t& idx) { if (idx >= input.size()) throw SyntaxError("Unclosed character class"); @@ -257,6 +269,9 @@ namespace lxs { return std::make_shared(chars); } + /** + * Return the RE for the `.` pattern: everything except a newline + */ std::shared_ptr dotChar() { std::vector any; for (int i = 0; i < 256; i++) @@ -265,6 +280,9 @@ namespace lxs { return std::make_shared(any); } + /** + * Parse the actual regex + */ std::shared_ptr parseRE(const string& input, size_t& idx) { stack > stk;