Documentation
This commit is contained in:
parent
23437e8f23
commit
782c7a8649
|
@ -1,3 +1,8 @@
|
|||
/**
|
||||
* Lexesis/automata.h
|
||||
*
|
||||
* A file describing basic automata (DFA, NFA and e-NFA), and some operations on them.
|
||||
*/
|
||||
#pragma once
|
||||
#ifndef AUTOMATA_H
|
||||
#define AUTOMATA_H
|
||||
|
@ -14,6 +19,18 @@ namespace lxs {
|
|||
typedef unsigned long long Priority;
|
||||
const State deadState = ULLONG_MAX;
|
||||
|
||||
/**
|
||||
* A basic automaton, the basis for DFA's, NFA's and epsilon-NFA's
|
||||
* The states are implicit, from 0 to numStates - 1
|
||||
* The starting state can be specified, though some methods probably assume it is 0
|
||||
*
|
||||
* The priority and acceptingToken are associations with accepting states
|
||||
* The lower the priority, the more important.
|
||||
*
|
||||
* No transitions are specified yet, since that is the main point of difference between different FA's
|
||||
*
|
||||
* The alphabet is always considered every char from 0 to 255
|
||||
*/
|
||||
struct Automaton {
|
||||
State numStates = 0;
|
||||
std::set<State> accepting;
|
||||
|
@ -22,28 +39,68 @@ namespace lxs {
|
|||
State starting;
|
||||
};
|
||||
|
||||
/**
|
||||
* A Deterministic finite automaton
|
||||
* An automaton which should have exactly one transition per state per char
|
||||
*/
|
||||
struct DFA : public Automaton {
|
||||
std::map<State, std::map<char, State> > delta;
|
||||
};
|
||||
|
||||
/**
|
||||
* A nondeterministic FA
|
||||
* Has an arbitrary number of transitions per state per char
|
||||
*/
|
||||
struct NFA : public Automaton {
|
||||
std::map<State, std::map<char, std::set<State> > > delta;
|
||||
|
||||
/**
|
||||
* compute the epsilon closure for a state
|
||||
* Returns {s} for a normal NFA, since it has no epsilon transitions
|
||||
*/
|
||||
virtual std::set<State> eClose(State) const;
|
||||
};
|
||||
|
||||
/**
|
||||
* An epsilon NFA
|
||||
* In addition to a normal NFA, can have 'free'/epsilon transitions which do not require a char
|
||||
*/
|
||||
struct ENFA : public NFA {
|
||||
std::map<State, std::set<State> > epsilonTransitions;
|
||||
virtual std::set<State> eClose(State) const;
|
||||
|
||||
virtual std::set<State> eClose(State) const;
|
||||
};
|
||||
|
||||
/**
|
||||
* Convert a DFA to graphviz dot format, can be useful when debugging
|
||||
*/
|
||||
std::string toDot(const DFA& d);
|
||||
|
||||
/**
|
||||
* Convert an NFA to graphviz dot format, can be useful when debugging
|
||||
*/
|
||||
std::string toDot(const NFA& n);
|
||||
|
||||
/**
|
||||
* Convert an ENFA to graphviz dot format, can be useful when debugging
|
||||
*/
|
||||
std::string toDot(const ENFA& e);
|
||||
|
||||
/**
|
||||
* Merge a collection of ENFA's by adding a new starting state in front and connecting it to the old starting states with an epsilon transition
|
||||
*/
|
||||
ENFA merge(const std::vector<ENFA>& enfas);
|
||||
|
||||
/**
|
||||
* Modified subset construction: convert an (E)NFA to a DFA
|
||||
* takes priorities and acceptingTokens into consideration
|
||||
*/
|
||||
DFA mssc(const NFA& e);
|
||||
|
||||
/**
|
||||
* Minimize a DFA
|
||||
* takes priorities and acceptingTokens into consideration to never merge two accepting states with a different acceptingToken
|
||||
*/
|
||||
DFA minimize(const DFA& d);
|
||||
} //namespace lxs
|
||||
|
||||
|
|
|
@ -12,20 +12,62 @@
|
|||
#include <string>
|
||||
|
||||
namespace lxs {
|
||||
/**
|
||||
* A general interface for a Lexesis backend
|
||||
*/
|
||||
class Backend {
|
||||
public:
|
||||
/**
|
||||
* Constructor
|
||||
*/
|
||||
Backend();
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
virtual ~Backend();
|
||||
|
||||
/**
|
||||
* Report a name for the backend
|
||||
* used in resolving template paths
|
||||
*
|
||||
* @return std::string A name for the backend
|
||||
*/
|
||||
virtual std::string getName() = 0;
|
||||
|
||||
/**
|
||||
* Can this backend process the language with given description?
|
||||
*
|
||||
* @param lang A description for a language (eg. "c++", "cxx", "cpp")
|
||||
* @return Can this backend process it
|
||||
*/
|
||||
virtual bool canProcessLang(std::string lang);
|
||||
|
||||
/**
|
||||
* The function that gets called to generate the actual lexer
|
||||
*
|
||||
* @param getOstreamForFileName A function that takes a filename and returns a std::ostream that the backend can write to for that filename
|
||||
* @param lexerName The name that should be given to the lexer
|
||||
* @param dfa The automaton that should be used in generating the lexer
|
||||
*/
|
||||
virtual void generateLexer(std::function<std::unique_ptr<std::ostream>(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa) = 0;
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Render a template (with given (file)name) to the given ostream with the information provided
|
||||
*
|
||||
* @param out The ostream to write the rendered template to
|
||||
* @param templateName An identifier for the template, is combined with `getName()` to construct the actual path
|
||||
* @param context The information that should be provided to the template when rendering
|
||||
*/
|
||||
void doTemplate(std::ostream& out, std::string templateName, templ::TemplateContext context);
|
||||
|
||||
private:
|
||||
/**
|
||||
* Find the template with given name
|
||||
*
|
||||
* @param templateName the template name, gets combined with `getName()`
|
||||
*/
|
||||
std::string findTemplate(std::string templateName);
|
||||
};
|
||||
}
|
||||
|
|
|
@ -9,14 +9,30 @@
|
|||
#include "Lexesis/backend.h"
|
||||
|
||||
namespace lxs {
|
||||
/**
|
||||
* A manager for backends
|
||||
* Aggregates backends and allows searching for one that can process a specific language
|
||||
*/
|
||||
class BackendManager {
|
||||
public:
|
||||
/**
|
||||
* Add a backend to the list of registered backends
|
||||
*
|
||||
* @param backend The backend to register
|
||||
*/
|
||||
void registerBackend(std::unique_ptr<Backend> backend);
|
||||
|
||||
/**
|
||||
* Get a backend that can process the given language
|
||||
* The manager retains ownership of the returned pointer
|
||||
*
|
||||
* @param lang The language the backend should be able to process
|
||||
* @returns A pointer to a Backend if it can find one, nullptr otherwise
|
||||
*/
|
||||
Backend* findBackendForLang(std::string lang);
|
||||
|
||||
private:
|
||||
std::vector<std::unique_ptr<Backend> > m_backends;
|
||||
std::vector<std::unique_ptr<Backend> > m_backends; ///< The list of registered backends
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -8,6 +8,9 @@ namespace lxs
|
|||
{
|
||||
namespace backends
|
||||
{
|
||||
/**
|
||||
* A backend that emits c++ code
|
||||
*/
|
||||
class CppBackend : public Backend {
|
||||
public:
|
||||
CppBackend();
|
||||
|
@ -19,9 +22,29 @@ namespace backends
|
|||
virtual void generateLexer(std::function<std::unique_ptr<std::ostream>(std::string)> getOstreamForFileName, std::string lexerName, const DFA& dfa);
|
||||
|
||||
private:
|
||||
/**
|
||||
* Build a TemplateContext that represents the transition table
|
||||
*
|
||||
* @param transition_idx \see buildTransitionIndices
|
||||
*/
|
||||
templ::TemplateContext buildTable(const DFA& dfa, const std::vector<unsigned char>& transition_idx, int num_transitions_per_state) const;
|
||||
|
||||
/**
|
||||
* Build a TemplateContext that represents the list of associated tokens with each state
|
||||
*/
|
||||
templ::TemplateContext buildTokenList(const DFA& dfa) const;
|
||||
|
||||
/**
|
||||
* For compression of the table, build a list that maps each char to an index
|
||||
* This way, whenever multiple chars always represent the same transition, they can get the same index, and the table is smaller
|
||||
*
|
||||
* @return a pair with the list and the number of distinct indices
|
||||
*/
|
||||
std::pair<std::vector<unsigned char>, int> buildTransitionIndices(const DFA& dfa) const;
|
||||
|
||||
/**
|
||||
* Transform the given indices (\see buildTransitionIndices) to a usable TemplateContext
|
||||
*/
|
||||
templ::TemplateContext transformTransitionIndices(const std::vector<unsigned char>& transition_indices) const;
|
||||
};
|
||||
|
||||
|
|
|
@ -8,11 +8,32 @@
|
|||
#include "Lexesis/backendmanager.h"
|
||||
|
||||
namespace lxs {
|
||||
/**
|
||||
* The main driver for Lexesis
|
||||
*/
|
||||
class Driver {
|
||||
public:
|
||||
/**
|
||||
* Constructor
|
||||
*
|
||||
* @param backends The backendmanager, prepared with all needed supported backends
|
||||
* @param inputfile An istream from which the token rule specifications are read
|
||||
* @param outputdir A string representing the directory where generated files should be placed
|
||||
* @param language The language to generate output for (backends is queried for this language)
|
||||
* @param lexername The name to give to the generated lexer, this gets cleaned to only contain alphanumeric chars or underscores and to start with a non-digit (AKA a valid identifier)
|
||||
*/
|
||||
Driver(std::unique_ptr<BackendManager> backends, std::istream& inputfile, std::string outputdir, std::string language, std::string lexername);
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
~Driver();
|
||||
|
||||
/**
|
||||
* Run this driver, all the preparation should happen when calling the constructor
|
||||
*
|
||||
* @return The status code this would return if it were a main function
|
||||
*/
|
||||
int run();
|
||||
|
||||
private:
|
||||
|
|
|
@ -10,11 +10,24 @@ namespace lxs {
|
|||
struct DFA;
|
||||
struct ENFA;
|
||||
|
||||
/**
|
||||
* Used for parsing token rules
|
||||
*/
|
||||
class InputParser {
|
||||
public:
|
||||
/**
|
||||
* Parse the token rules read from `is` and return the minimized dfa constructed from those rules
|
||||
*/
|
||||
static DFA parseInput(std::istream& is);
|
||||
private:
|
||||
/**
|
||||
* parse the lines and return pairs of (Token type, regex)
|
||||
*/
|
||||
static std::vector<std::pair<std::string,std::string> > parseLines(std::istream &is);
|
||||
|
||||
/**
|
||||
* Convert the lines from `parseLines` to ENFA's
|
||||
*/
|
||||
static std::vector<ENFA> linesToEnfa(std::vector<std::pair<std::string,std::string> > &input);
|
||||
};
|
||||
}
|
||||
|
|
|
@ -8,14 +8,24 @@
|
|||
#include <stdexcept>
|
||||
|
||||
namespace lxs {
|
||||
/**
|
||||
* An abstract regular expression
|
||||
*/
|
||||
class RE
|
||||
{
|
||||
public:
|
||||
virtual ~RE() {}
|
||||
/**
|
||||
* Convert this regex to an ENFA
|
||||
* This extends the given enfa, and attaches itself to the given `attach` state
|
||||
*/
|
||||
virtual State toENFA(ENFA& enfa, State attach) = 0;
|
||||
virtual std::string toRe() = 0;
|
||||
};
|
||||
|
||||
/**
|
||||
* A regex for the empty language
|
||||
*/
|
||||
class EmptyRE : public RE
|
||||
{
|
||||
public:
|
||||
|
@ -25,6 +35,9 @@ namespace lxs {
|
|||
virtual std::string toRe();
|
||||
};
|
||||
|
||||
/**
|
||||
* A regex for the language containing only the empty string
|
||||
*/
|
||||
class EpsilonRE : public RE
|
||||
{
|
||||
public:
|
||||
|
@ -34,9 +47,15 @@ namespace lxs {
|
|||
virtual std::string toRe();
|
||||
};
|
||||
|
||||
/**
|
||||
* A regex for the language containing a single character
|
||||
*/
|
||||
class SingleRE : public RE
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* @param c The character of the language
|
||||
*/
|
||||
SingleRE(char c) : c(c) {}
|
||||
~SingleRE() {}
|
||||
virtual State toENFA(ENFA& enfa, State attach);
|
||||
|
@ -45,9 +64,15 @@ namespace lxs {
|
|||
char c;
|
||||
};
|
||||
|
||||
/**
|
||||
* A regex for the language containing multiple single-symbol strings
|
||||
*/
|
||||
class MultiRE : public RE
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* @param chars The list of symbols contained in the language
|
||||
*/
|
||||
MultiRE(std::vector<char> chars) : chars(std::move(chars)) {}
|
||||
~MultiRE() {}
|
||||
virtual State toENFA(ENFA& enfa, State attach);
|
||||
|
@ -56,9 +81,16 @@ namespace lxs {
|
|||
std::vector<char> chars;
|
||||
};
|
||||
|
||||
/**
|
||||
* A regex for the concatenation of two languages
|
||||
*/
|
||||
class ConcatRE : public RE
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* @param e The first language
|
||||
* @param f The second language
|
||||
*/
|
||||
ConcatRE(std::shared_ptr<RE> e, std::shared_ptr<RE> f) : e(e), f(f) {}
|
||||
~ConcatRE() {}
|
||||
virtual State toENFA(ENFA& enfa, State attach);
|
||||
|
@ -67,9 +99,15 @@ namespace lxs {
|
|||
std::shared_ptr<RE> e, f;
|
||||
};
|
||||
|
||||
/**
|
||||
* The regex for the kleene star of a language
|
||||
*/
|
||||
class StarRE : public RE
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* @param e The language to apply the star to
|
||||
*/
|
||||
StarRE(std::shared_ptr<RE> e) : e(e) {}
|
||||
~StarRE() {}
|
||||
virtual State toENFA(ENFA& enfa, State attach);
|
||||
|
@ -78,9 +116,16 @@ namespace lxs {
|
|||
std::shared_ptr<RE> e;
|
||||
};
|
||||
|
||||
/**
|
||||
* A regex for the sum/disjunction of two languages
|
||||
*/
|
||||
class PlusRE : public RE
|
||||
{
|
||||
public:
|
||||
/**
|
||||
* @param e The first language
|
||||
* @param f The second language
|
||||
*/
|
||||
PlusRE(std::shared_ptr<RE> e, std::shared_ptr<RE> f) : e(e), f(f) {}
|
||||
~PlusRE() {}
|
||||
virtual State toENFA(ENFA& enfa, State attach);
|
||||
|
@ -89,8 +134,19 @@ namespace lxs {
|
|||
std::shared_ptr<RE> e, f;
|
||||
};
|
||||
|
||||
/**
|
||||
* Parse the given regular expression and return the associated Regex
|
||||
*
|
||||
* @param input The regular expression to parse
|
||||
* @returns An abstract representation of `input`
|
||||
*
|
||||
* @throws SyntaxError if the regex is invalid, the `what()` method contains some information on the problem.
|
||||
*/
|
||||
std::shared_ptr<RE> parseRE(const std::string& input);
|
||||
|
||||
/**
|
||||
* An exception to represent a syntax error in a regular expression
|
||||
*/
|
||||
class SyntaxError : public std::runtime_error
|
||||
{
|
||||
public:
|
||||
|
|
|
@ -9,17 +9,49 @@
|
|||
|
||||
namespace lxs {
|
||||
namespace templ {
|
||||
/**
|
||||
* A changeable information structure for templates
|
||||
*/
|
||||
using TemplateContext = mstch::node;
|
||||
|
||||
/**
|
||||
* Make a TemplateContext string
|
||||
*/
|
||||
TemplateContext make_string(std::string);
|
||||
|
||||
/**
|
||||
* Make a TemplateContext map/dictionary
|
||||
*/
|
||||
TemplateContext make_map(std::map<const std::string, TemplateContext>);
|
||||
|
||||
/**
|
||||
* Make a TemplateContext array/vector
|
||||
*/
|
||||
TemplateContext make_array(std::vector<TemplateContext>);
|
||||
|
||||
/**
|
||||
* A generic wrapper around whichever templating system gets used
|
||||
*/
|
||||
class Template {
|
||||
public:
|
||||
/**
|
||||
* Construct a Template from given filename
|
||||
*
|
||||
* @param filename The name of the file which contains the template rules
|
||||
*/
|
||||
Template(std::string filename);
|
||||
|
||||
/**
|
||||
* Destructor
|
||||
*/
|
||||
~Template();
|
||||
|
||||
/**
|
||||
* Render this template to `out` using the information in `context`
|
||||
*
|
||||
* @param out The ostream to render to
|
||||
* @param context The information to provide the template rules while rendering
|
||||
*/
|
||||
void render(std::ostream& out, TemplateContext& context);
|
||||
private:
|
||||
std::string m_filename;
|
||||
|
|
|
@ -148,6 +148,9 @@ namespace lxs {
|
|||
return rev;
|
||||
}
|
||||
|
||||
/**
|
||||
* Put all states in `d` that are reachable from `s` into `reachable`
|
||||
*/
|
||||
void markReachable(const DFA& d, State s, std::set<State>& reachable) {
|
||||
if (reachable.count(s) > 0)
|
||||
return;
|
||||
|
@ -298,6 +301,9 @@ namespace lxs {
|
|||
|
||||
namespace { // Utility function for mssc
|
||||
|
||||
/**
|
||||
* Get the next state, by taking the union of all next states in the NFA reached by following `symbol` from any state in `oldstate`
|
||||
*/
|
||||
std::set<State> getNextState(const std::set<State>& oldstate, char symbol, const NFA& e) {
|
||||
std::set<State> states;
|
||||
for(const auto &state: oldstate) {
|
||||
|
|
|
@ -4,6 +4,8 @@
|
|||
#include <iostream>
|
||||
|
||||
namespace {
|
||||
//Some shortcut utility functions for creating a TemplateContext
|
||||
|
||||
lxs::templ::TemplateContext make_map_elem(std::string key, std::string value) {
|
||||
return lxs::templ::make_map({{key, lxs::templ::make_string(value)}});
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ namespace {
|
|||
std::string clean(std::string in) {
|
||||
std::string s;
|
||||
for (char c : in) {
|
||||
if ((s.length() && std::isalnum(c)) || std::isalpha(c))
|
||||
if ((s.length() && std::isalnum(c)) || std::isalpha(c) || c == '_')
|
||||
s += c;
|
||||
}
|
||||
return s;
|
||||
|
@ -25,17 +25,17 @@ namespace lxs {
|
|||
m_outputdir(outputdir),
|
||||
m_language(language),
|
||||
m_lexername(clean(lexername))
|
||||
{
|
||||
if (!m_lexername.length()) {
|
||||
std::cerr << "No valid lexer name possible" << std::endl;
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
{}
|
||||
|
||||
Driver::~Driver()
|
||||
{}
|
||||
|
||||
int Driver::run() {
|
||||
if (!m_lexername.length()) {
|
||||
std::cerr << "No valid lexer name possible" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
Backend* back = m_backends->findBackendForLang(m_language);
|
||||
if (!back) {
|
||||
std::cerr << "Could not find a valid backend for language " << m_language << std::endl;
|
||||
|
|
18
src/re.cpp
18
src/re.cpp
|
@ -115,6 +115,9 @@ namespace lxs {
|
|||
}
|
||||
|
||||
namespace {
|
||||
/**
|
||||
* Take the two top elements from `stk` and combine them with a ConcatRE
|
||||
*/
|
||||
void compress(stack<std::shared_ptr<RE>>& stk)
|
||||
{
|
||||
std::shared_ptr<RE> a = stk.top();
|
||||
|
@ -124,6 +127,9 @@ namespace lxs {
|
|||
stk.push(std::make_shared<ConcatRE>(b, a)); //Attention: reversed order because of stack
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply compress until only one RE remains on the stack
|
||||
*/
|
||||
void compactStack(stack<std::shared_ptr<RE> >& stk)
|
||||
{
|
||||
if (stk.empty()) return;
|
||||
|
@ -136,6 +142,9 @@ namespace lxs {
|
|||
stk.push(tp);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the actual char that should be used when c is placed after a backslash
|
||||
*/
|
||||
char parseEscapeChar(char c) {
|
||||
switch (c)
|
||||
{
|
||||
|
@ -175,6 +184,9 @@ namespace lxs {
|
|||
return c;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse a character class
|
||||
*/
|
||||
std::shared_ptr<RE> parseCharacterClass(const string& input, size_t& idx) {
|
||||
if (idx >= input.size())
|
||||
throw SyntaxError("Unclosed character class");
|
||||
|
@ -257,6 +269,9 @@ namespace lxs {
|
|||
return std::make_shared<MultiRE>(chars);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the RE for the `.` pattern: everything except a newline
|
||||
*/
|
||||
std::shared_ptr<RE> dotChar() {
|
||||
std::vector<char> any;
|
||||
for (int i = 0; i < 256; i++)
|
||||
|
@ -265,6 +280,9 @@ namespace lxs {
|
|||
return std::make_shared<MultiRE>(any);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse the actual regex
|
||||
*/
|
||||
std::shared_ptr<RE> parseRE(const string& input, size_t& idx)
|
||||
{
|
||||
stack<std::shared_ptr<RE> > stk;
|
||||
|
|
Loading…
Reference in New Issue