Documentation

This commit is contained in:
Robin Jadoul 2016-05-27 19:08:36 +02:00
parent 23437e8f23
commit 782c7a8649
12 changed files with 295 additions and 9 deletions

View File

@ -1,3 +1,8 @@
/**
* Lexesis/automata.h
*
* A file describing basic automata (DFA, NFA and e-NFA), and some operations on them.
*/
#pragma once
#ifndef AUTOMATA_H
#define AUTOMATA_H
@ -14,6 +19,18 @@ namespace lxs {
typedef unsigned long long Priority;
const State deadState = ULLONG_MAX;
/**
* A basic automaton, the basis for DFAs, NFAs and epsilon-NFAs
* The states are implicit, numbered from 0 to numStates - 1
* The starting state can be specified, though some methods probably assume it is 0
*
* The priority and acceptingToken are associated with accepting states
* The lower the priority value, the more important the token.
*
* No transitions are specified yet, since those are the main point of difference between the different FAs
*
* The alphabet is always considered to be every char from 0 to 255
*/
struct Automaton {
State numStates = 0;
std::set<State> accepting;
@ -22,28 +39,68 @@ namespace lxs {
State starting;
};
/**
* A Deterministic finite automaton
* An automaton which should have exactly one transition per state per char
*/
struct DFA : public Automaton {
std::map<State, std::map<char, State> > delta;
};
/**
* A nondeterministic FA
* Has an arbitrary number of transitions per state per char
*/
struct NFA : public Automaton {
std::map<State, std::map<char, std::set<State> > > delta;
/**
* Compute the epsilon closure for a state
* Returns {s} for a normal NFA, since it has no epsilon transitions
*/
virtual std::set<State> eClose(State) const;
};
/**
* An epsilon NFA
* In addition to a normal NFA, can have 'free'/epsilon transitions which do not require a char
*/
struct ENFA : public NFA {
std::map<State, std::set<State> > epsilonTransitions;
virtual std::set<State> eClose(State) const;
};
/**
* Convert a DFA to Graphviz dot format; can be useful when debugging
*/
std::string toDot(const DFA& d);
/**
* Convert an NFA to Graphviz dot format; can be useful when debugging
*/
std::string toDot(const NFA& n);
/**
* Convert an ENFA to Graphviz dot format; can be useful when debugging
*/
std::string toDot(const ENFA& e);
/**
* Merge a collection of ENFAs by adding a new starting state in front and connecting it to the old starting states with epsilon transitions
*/
ENFA merge(const std::vector<ENFA>& enfas);
/**
* Modified subset construction: convert an (E)NFA to a DFA
* Takes priorities and acceptingTokens into consideration
*/
DFA mssc(const NFA& e);
/**
* Minimize a DFA
* Takes priorities and acceptingTokens into consideration, so that two accepting states with different acceptingTokens are never merged
*/
DFA minimize(const DFA& d);
} //namespace lxs
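
As a rough sketch of how this header can be used (not code from the repository; it only relies on the members and functions declared above, and leaves the priority/acceptingToken bookkeeping out):

```cpp
#include "Lexesis/automata.h"
#include <iostream>

int main() {
    // Hand-built DFA over {a, b} that accepts every string ending in 'b':
    // state 0 --a--> 0, 0 --b--> 1, 1 --a--> 0, 1 --b--> 1
    lxs::DFA dfa;
    dfa.numStates = 2;
    dfa.starting  = 0;
    dfa.accepting = {1};
    dfa.delta[0]['a'] = 0;
    dfa.delta[0]['b'] = 1;
    dfa.delta[1]['a'] = 0;
    dfa.delta[1]['b'] = 1;

    // Dump the automaton in Graphviz dot format for inspection.
    std::cout << lxs::toDot(dfa) << std::endl;
    return 0;
}
```

In the real pipeline, the ENFAs built from the token rules are instead combined with merge(), determinized with mssc() and reduced with minimize() before being handed to a backend.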

View File

@ -12,20 +12,62 @@
#include <string>
namespace lxs {
/**
* A general interface for a Lexesis backend
*/
class Backend {
public:
/**
* Constructor
*/
Backend();
/**
* Destructor
*/
virtual ~Backend();
/**
* Report a name for the backend,
* used when resolving template paths
*
* @return std::string A name for the backend
*/
virtual std::string getName() = 0;
/**
* Can this backend process the language with the given description?
*
* @param lang A description of a language (e.g. "c++", "cxx", "cpp")
* @return Whether this backend can process it
*/
virtual bool canProcessLang(std::string lang);
/**
* The function that gets called to generate the actual lexer
*
* @param getOstreamForFileName A function that takes a filename and returns a std::ostream that the backend can write to for that filename
* @param lexerName The name that should be given to the lexer
* @param dfa The automaton that should be used in generating the lexer
*/
virtual void generateLexer(std::function<std::unique_ptr<std::ostream>(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa) = 0;
protected:
/**
* Render a template (with given (file)name) to the given ostream with the information provided
*
* @param out The ostream to write the rendered template to
* @param templateName An identifier for the template, which is combined with `getName()` to construct the actual path
* @param context The information that should be provided to the template when rendering
*/
void doTemplate(std::ostream& out, std::string templateName, templ::TemplateContext context);
private:
/**
* Find the template with given name
*
* @param templateName The template name; gets combined with `getName()`
*/
std::string findTemplate(std::string templateName);
};
}
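
To illustrate the interface, a minimal (hypothetical) backend could look roughly like this; the class name and the emitted file are invented for the example:

```cpp
#include "Lexesis/automata.h"
#include "Lexesis/backend.h"

#include <functional>
#include <memory>
#include <ostream>
#include <string>

class DummyBackend : public lxs::Backend {
public:
    std::string getName() override { return "dummy"; }

    bool canProcessLang(std::string lang) override { return lang == "dummy"; }

    void generateLexer(std::function<std::unique_ptr<std::ostream>(std::string)> getOstreamForFileName,
                       std::string lexerName, const lxs::DFA& dfa) override {
        // Ask the driver for an output stream and write a stub into it.
        std::unique_ptr<std::ostream> out = getOstreamForFileName(lexerName + ".txt");
        *out << "Lexer '" << lexerName << "' with " << dfa.numStates << " states\n";
    }
};
```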

View File

@ -9,14 +9,30 @@
#include "Lexesis/backend.h"
namespace lxs {
/**
* A manager for backends
* Aggregates backends and allows searching for one that can process a specific language
*/
class BackendManager {
public:
/**
* Add a backend to the list of registered backends
*
* @param backend The backend to register
*/
void registerBackend(std::unique_ptr<Backend> backend);
/**
* Get a backend that can process the given language
* The manager retains ownership of the returned pointer
*
* @param lang The language the backend should be able to process
* @returns A pointer to a Backend if it can find one, nullptr otherwise
*/
Backend* findBackendForLang(std::string lang);
private:
std::vector<std::unique_ptr<Backend> > m_backends; ///< The list of registered backends
};
}
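
A short usage sketch; the cpp.h header path is an assumption, and the "c++" description is taken from the examples in backend.h:

```cpp
#include "Lexesis/backendmanager.h"
#include "Lexesis/backends/cpp.h" // assumed header location for CppBackend

#include <memory>

int main() {
    lxs::BackendManager manager;
    manager.registerBackend(std::unique_ptr<lxs::Backend>(new lxs::backends::CppBackend()));

    // The manager keeps ownership of the backend; the returned pointer
    // is non-owning and may be nullptr if no backend matches.
    lxs::Backend* backend = manager.findBackendForLang("c++");
    return backend ? 0 : 1;
}
```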

View File

@ -8,6 +8,9 @@ namespace lxs
{
namespace backends
{
/**
* A backend that emits C++ code
*/
class CppBackend : public Backend {
public:
CppBackend();
@ -19,9 +22,29 @@ namespace backends
virtual void generateLexer(std::function<std::unique_ptr<std::ostream>(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa);
private:
/**
* Build a TemplateContext that represents the transition table
*
* @param transition_idx \see buildTransitionIndices
*/
templ::TemplateContext buildTable(const DFA& dfa, const std::vector<unsigned char>& transition_idx, int num_transitions_per_state) const;
/**
* Build a TemplateContext that represents the list of tokens associated with each state
*/
templ::TemplateContext buildTokenList(const DFA& dfa) const;
/**
* For compression of the table, build a list that maps each char to an index
* This way, whenever multiple chars always represent the same transitions, they can get the same index, and the table is smaller
*
* @return a pair with the list and the number of distinct indices
*/
std::pair<std::vector<unsigned char>, int> buildTransitionIndices(const DFA& dfa) const;
/**
* Transform the given indices (\see buildTransitionIndices) to a usable TemplateContext
*/
templ::TemplateContext transformTransitionIndices(const std::vector<unsigned char>& transition_indices) const;
};
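
To make the compression idea concrete, here is a standalone sketch of the char-to-index mapping (a hypothetical buildColumns helper, not the actual buildTransitionIndices implementation); it only assumes the DFA layout from automata.h:

```cpp
#include "Lexesis/automata.h"

#include <map>
#include <utility>
#include <vector>

// Assign every char a column index so that two chars share an index exactly
// when every state maps them to the same successor (deadState if delta has
// no entry for that state/char pair).
std::pair<std::vector<unsigned char>, int> buildColumns(const lxs::DFA& dfa) {
    std::vector<unsigned char> column(256, 0);
    std::map<std::vector<lxs::State>, unsigned char> seen; // successor column -> index
    for (int c = 0; c < 256; c++) {
        std::vector<lxs::State> successors;
        for (lxs::State s = 0; s < dfa.numStates; s++) {
            lxs::State next = lxs::deadState;
            auto row = dfa.delta.find(s);
            if (row != dfa.delta.end()) {
                auto tr = row->second.find((char) c);
                if (tr != row->second.end())
                    next = tr->second;
            }
            successors.push_back(next);
        }
        // Reuse the index of an identical column, or hand out a new one.
        auto ins = seen.insert(std::make_pair(successors, (unsigned char) seen.size()));
        column[(size_t) c] = ins.first->second;
    }
    return std::make_pair(column, (int) seen.size());
}
```

Since there are only 256 chars there can never be more than 256 distinct columns, so an unsigned char index is always sufficient.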

View File

@ -8,11 +8,32 @@
#include "Lexesis/backendmanager.h"
namespace lxs {
/**
* The main driver for Lexesis
*/
class Driver {
public:
/**
* Constructor
*
* @param backends The BackendManager, prepared with all supported backends
* @param inputfile An istream from which the token rule specification is read
* @param outputdir A string representing the directory where generated files should be placed
* @param language The language to generate output for (backends is queried for this language)
* @param lexername The name to give to the generated lexer; this gets cleaned so that it only contains alphanumeric chars or underscores and starts with a non-digit (i.e. a valid identifier)
*/
Driver(std::unique_ptr<BackendManager> backends, std::istream& inputfile, std::string outputdir, std::string language, std::string lexername);
/**
* Destructor
*/
~Driver();
/**
* Run this driver; all preparation should already have happened in the constructor
*
* @return The status code this would return if it were a main function
*/
int run();
private:

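Putting the pieces together, a main() along these lines could drive the whole pipeline; the driver.h and cpp.h header paths, the rules filename and the lexer name are placeholders:

```cpp
#include "Lexesis/backendmanager.h"
#include "Lexesis/backends/cpp.h" // assumed header location for CppBackend
#include "Lexesis/driver.h"       // assumed header location for Driver

#include <fstream>
#include <memory>
#include <utility>

int main() {
    std::unique_ptr<lxs::BackendManager> backends(new lxs::BackendManager());
    backends->registerBackend(std::unique_ptr<lxs::Backend>(new lxs::backends::CppBackend()));

    std::ifstream rules("tokens.lxs"); // placeholder token rules file
    lxs::Driver driver(std::move(backends), rules, "generated/", "c++", "MyLexer");
    return driver.run();
}
```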
View File

@ -10,11 +10,24 @@ namespace lxs {
struct DFA;
struct ENFA;
/**
* Used for parsing token rules
*/
class InputParser {
public:
/**
* Parse the token rules read from `is` and return the minimized DFA constructed from those rules
*/
static DFA parseInput(std::istream& is);
private:
/**
* Parse the lines and return pairs of (token type, regex)
*/
static std::vector<std::pair<std::string,std::string> > parseLines(std::istream &is);
/**
* Convert the lines from `parseLines` to ENFAs
*/
static std::vector<ENFA> linesToEnfa(std::vector<std::pair<std::string,std::string> > &input);
};
}
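
For illustration only; the header name is assumed, and the literal rule set is a placeholder since the exact syntax accepted by parseLines is defined in the implementation (roughly one token-name/regex pair per line):

```cpp
#include "Lexesis/automata.h"
#include "Lexesis/inputparser.h" // assumed header location for InputParser

#include <sstream>

int main() {
    // Placeholder rules: a token name followed by its regular expression.
    std::istringstream rules(
        "IDENT  [a-zA-Z_][a-zA-Z0-9_]*\n"
        "NUMBER [0-9]+\n");

    lxs::DFA dfa = lxs::InputParser::parseInput(rules);
    return dfa.numStates > 0 ? 0 : 1;
}
```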

View File

@ -8,14 +8,24 @@
#include <stdexcept>
namespace lxs {
/**
* An abstract regular expression
*/
class RE
{
public:
virtual ~RE() {}
/**
* Convert this regex to an ENFA
* This extends the given ENFA and attaches itself to the given `attach` state
*/
virtual State toENFA(ENFA& enfa, State attach) = 0;
virtual std::string toRe() = 0;
};
/**
* A regex for the empty language
*/
class EmptyRE : public RE
{
public:
@ -25,6 +35,9 @@ namespace lxs {
virtual std::string toRe();
};
/**
* A regex for the language containing only the empty string
*/
class EpsilonRE : public RE
{
public:
@ -34,9 +47,15 @@ namespace lxs {
virtual std::string toRe();
};
/**
* A regex for the language containing a single character
*/
class SingleRE : public RE
{
public:
/**
* @param c The character of the language
*/
SingleRE(char c) : c(c) {}
~SingleRE() {}
virtual State toENFA(ENFA& enfa, State attach);
@ -45,9 +64,15 @@ namespace lxs {
char c;
};
/**
* A regex for the language containing multiple single-symbol strings
*/
class MultiRE : public RE
{
public:
/**
* @param chars The list of symbols contained in the language
*/
MultiRE(std::vector<char> chars) : chars(std::move(chars)) {}
~MultiRE() {}
virtual State toENFA(ENFA& enfa, State attach);
@ -56,9 +81,16 @@ namespace lxs {
std::vector<char> chars;
};
/**
* A regex for the concatenation of two languages
*/
class ConcatRE : public RE
{
public:
/**
* @param e The first language
* @param f The second language
*/
ConcatRE(std::shared_ptr<RE> e, std::shared_ptr<RE> f) : e(e), f(f) {}
~ConcatRE() {}
virtual State toENFA(ENFA& enfa, State attach);
@ -67,9 +99,15 @@ namespace lxs {
std::shared_ptr<RE> e, f;
};
/**
* The regex for the Kleene star of a language
*/
class StarRE : public RE
{
public:
/**
* @param e The language to apply the star to
*/
StarRE(std::shared_ptr<RE> e) : e(e) {}
~StarRE() {}
virtual State toENFA(ENFA& enfa, State attach);
@ -78,9 +116,16 @@ namespace lxs {
std::shared_ptr<RE> e;
};
/**
* A regex for the sum/disjunction of two languages
*/
class PlusRE : public RE
{
public:
/**
* @param e The first language
* @param f The second language
*/
PlusRE(std::shared_ptr<RE> e, std::shared_ptr<RE> f) : e(e), f(f) {}
~PlusRE() {}
virtual State toENFA(ENFA& enfa, State attach);
@ -89,8 +134,19 @@ namespace lxs {
std::shared_ptr<RE> e, f;
};
/**
* Parse the given regular expression and return the associated Regex
*
* @param input The regular expression to parse
* @returns An abstract representation of `input`
*
* @throws SyntaxError if the regex is invalid; the `what()` method contains some information about the problem.
*/
std::shared_ptr<RE> parseRE(const std::string& input);
/**
* An exception to represent a syntax error in a regular expression
*/
class SyntaxError : public std::runtime_error
{
public:

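A quick sketch of exercising the regex layer on its own; the header name is assumed, the regex literal is arbitrary, and the sketch assumes the State returned by toENFA is the end state of the construction:

```cpp
#include "Lexesis/automata.h"
#include "Lexesis/regex.h" // assumed header location for the RE classes and parseRE

#include <iostream>
#include <memory>

int main() {
    try {
        // Parse a regex into an RE tree, print it back, and lower it onto a
        // fresh ENFA attached to its single starting state.
        std::shared_ptr<lxs::RE> re = lxs::parseRE("ab*");
        std::cout << re->toRe() << std::endl;

        lxs::ENFA enfa;
        enfa.numStates = 1;
        enfa.starting = 0;
        lxs::State end = re->toENFA(enfa, 0); // assumed to return the end state
        enfa.accepting.insert(end);
    } catch (const lxs::SyntaxError& err) {
        std::cerr << err.what() << std::endl;
        return 1;
    }
    return 0;
}
```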
View File

@ -9,17 +9,49 @@
namespace lxs {
namespace templ {
/**
* A generic, modifiable data structure holding the information passed to templates
*/
using TemplateContext = mstch::node;
/**
* Make a TemplateContext string
*/
TemplateContext make_string(std::string);
/**
* Make a TemplateContext map/dictionary
*/
TemplateContext make_map(std::map<const std::string, TemplateContext>);
/**
* Make a TemplateContext array/vector
*/
TemplateContext make_array(std::vector<TemplateContext>);
/**
* A generic wrapper around whichever templating system gets used
*/
class Template {
public:
/**
* Construct a Template from given filename
*
* @param filename The name of the file which contains the template rules
*/
Template(std::string filename);
/**
* Destructor
*/
~Template();
/**
* Render this template to `out` using the information in `context`
*
* @param out The ostream to render to
* @param context The information to provide the template rules while rendering
*/
void render(std::ostream& out, TemplateContext& context);
private:
std::string m_filename;

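A short sketch of how the helpers compose; the template.h header name and the template path and keys are placeholders:

```cpp
#include "Lexesis/template.h" // assumed header location for templ::Template

#include <iostream>

int main() {
    using namespace lxs::templ;

    // Build a nested context: { "lexername": "MyLexer", "states": ["0", "1"] }
    TemplateContext context = make_map({
        {"lexername", make_string("MyLexer")},
        {"states", make_array({make_string("0"), make_string("1")})}
    });

    Template tmpl("templates/lexer.tpl"); // placeholder template file
    tmpl.render(std::cout, context);
    return 0;
}
```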
View File

@ -148,6 +148,9 @@ namespace lxs {
return rev;
}
/**
* Put all states of `d` that are reachable from `s` into `reachable`
*/
void markReachable(const DFA& d, State s, std::set<State>& reachable) {
if (reachable.count(s) > 0)
return;
@ -298,6 +301,9 @@ namespace lxs {
namespace { // Utility function for mssc
/**
* Get the next state set by taking the union of all states reachable in the NFA by following `symbol` from any state in `oldstate`
*/
std::set<State> getNextState(const std::set<State>& oldstate, char symbol, const NFA& e) {
std::set<State> states;
for(const auto &state: oldstate) {

View File

@ -4,6 +4,8 @@
#include <iostream>
namespace {
//Some shortcut utility functions for creating a TemplateContext
lxs::templ::TemplateContext make_map_elem(std::string key, std::string value) {
return lxs::templ::make_map({{key, lxs::templ::make_string(value)}});
}

View File

@ -11,7 +11,7 @@ namespace {
std::string clean(std::string in) {
std::string s;
for (char c : in) {
if ((s.length() && std::isalnum(c)) || std::isalpha(c) || c == '_')
s += c;
}
return s;
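
With the underscore now allowed, the cleaning step behaves roughly as follows (illustrative traces, not tests from the repository):

```cpp
// clean("my lexer-1") -> "mylexer1"  (spaces and '-' are dropped; digits are kept once a letter has been seen)
// clean("1st_lexer")  -> "st_lexer"  (the leading digit is dropped so the result starts with a non-digit)
// clean("***")        -> ""          (nothing usable remains; run() below then reports the error)
```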
@ -25,17 +25,17 @@ namespace lxs {
m_outputdir(outputdir),
m_language(language),
m_lexername(clean(lexername))
{}
Driver::~Driver()
{}
int Driver::run() {
if (!m_lexername.length()) {
std::cerr << "No valid lexer name possible" << std::endl;
return 1;
}
Backend* back = m_backends->findBackendForLang(m_language);
if (!back) {
std::cerr << "Could not find a valid backend for language " << m_language << std::endl;

View File

@ -115,6 +115,9 @@ namespace lxs {
}
namespace {
/**
* Take the top two elements from `stk` and combine them with a ConcatRE
*/
void compress(stack<std::shared_ptr<RE>>& stk)
{
std::shared_ptr<RE> a = stk.top();
@ -124,6 +127,9 @@ namespace lxs {
stk.push(std::make_shared<ConcatRE>(b, a)); //Attention: reversed order because of stack
}
/**
* Apply `compress` until only one RE remains on the stack
*/
void compactStack(stack<std::shared_ptr<RE> >& stk)
{
if (stk.empty()) return;
@ -136,6 +142,9 @@ namespace lxs {
stk.push(tp);
}
/**
* Get the actual char that should be used when `c` is placed after a backslash
*/
char parseEscapeChar(char c) {
switch (c)
{
@ -175,6 +184,9 @@ namespace lxs {
return c;
}
/**
* Parse a character class
*/
std::shared_ptr<RE> parseCharacterClass(const string& input, size_t& idx) {
if (idx >= input.size())
throw SyntaxError("Unclosed character class");
@ -257,6 +269,9 @@ namespace lxs {
return std::make_shared<MultiRE>(chars);
}
/**
* Return the RE for the `.` pattern: everything except a newline
*/
std::shared_ptr<RE> dotChar() {
std::vector<char> any;
for (int i = 0; i < 256; i++)
@ -265,6 +280,9 @@ namespace lxs {
return std::make_shared<MultiRE>(any);
}
/**
* Parse the actual regex
*/
std::shared_ptr<RE> parseRE(const string& input, size_t& idx)
{
stack<std::shared_ptr<RE> > stk;