Lexesis/src/automata.cpp

464 lines
16 KiB
C++

#include "Lexesis/automata.h"

#include <algorithm>
#include <cassert>
#include <climits>
#include <iostream>
#include <limits>
#include <map>
#include <queue>
#include <set>
#include <string>
#include <utility>
#include <vector>
namespace lxs {
std::string toDot(const DFA& d)
{
std::string s = "digraph {\nrankdir=LR\nin [shape=point style=invis]\n";
for (State state = 0; state < d.numStates; state++)
{
std::string label = std::to_string(state);
s += std::to_string(state) + " [";
if (state == d.starting)
s += "color=yellow ";
if (d.accepting.count(state) > 0) {
s += "color=green shape=doublecircle ";
label += "\\np=" + std::to_string(d.priority.find(state)->second) + "\\nac=" + d.acceptingToken.find(state)->second;
}
s += "label=\"" + label + "\"]\n";
}
for (const auto& tmp : d.delta)
{
const auto& from = tmp.first;
for (const auto& trans : tmp.second)
{
s += std::to_string(from) + " -> " + std::to_string(trans.second) + " [label=\"" + trans.first + "\"]\n";
}
}
s += "in -> " + std::to_string(d.starting) + "\n}\n";
return s;
}
std::string toDot(const NFA& n)
{
std::string s = "digraph {\nrankdir=LR\nin [shape=point style=invis]\n";
for (State state = 0; state < n.numStates; state++)
{
std::string label = std::to_string(state);
s += std::to_string(state) + " [";
if (state == n.starting)
s += "color=yellow ";
if (n.accepting.count(state) > 0) {
s += "color=green shape=doublecircle ";
label += "\\np=" + std::to_string(n.priority.find(state)->second) + "\\nac=" + n.acceptingToken.find(state)->second;
}
s += "label=\"" + label + "\"]\n";
}
for (const auto& tmp : n.delta)
{
const auto& from = tmp.first;
for (const auto& trans : tmp.second)
{
for (const auto& to : trans.second)
s += std::to_string(from) + " -> " + std::to_string(to) + " [label=\"" + trans.first + "\"]\n";
}
}
s += "in -> " + std::to_string(n.starting) + "\n}\n";
return s;
}
std::string toDot(const ENFA& e)
{
std::string s = "digraph {\nrankdir=LR\nin [shape=point style=invis]\n";
for (State state = 0; state < e.numStates; state++)
{
std::string label = std::to_string(state);
s += std::to_string(state) + " [";
if (state == e.starting)
s += "color=yellow ";
if (e.accepting.count(state) > 0) {
s += "color=green shape=doublecircle ";
label += "\\np=" + std::to_string(e.priority.find(state)->second) + "\\nac=" + e.acceptingToken.find(state)->second;
}
s += "label=\"" + label + "\"]\n";
}
for (const auto& tmp : e.delta)
{
const auto& from = tmp.first;
for (const auto& trans : tmp.second)
{
for (const auto& to : trans.second)
s += std::to_string(from) + " -> " + std::to_string(to) + " [label=\"" + trans.first + "\"]\n";
}
}
for (const auto& etrans : e.epsilonTransitions)
{
for (const auto& dest : etrans.second)
s += std::to_string(etrans.first) + " -> " + std::to_string(dest) + " [label=\"ε\"]\n";
}
s += "in -> " + std::to_string(e.starting) + "\n}\n";
return s;
}
/**
 * Compute the epsilon closure of `s`: `s` itself plus every state that can be
 * reached from it by following epsilon transitions only.
 */
std::set<State> ENFA::eClose(State s) const {
    std::set<State> closure;
    closure.insert(s);

    // Breadth-first walk over the epsilon edges; `closure` doubles as the visited set
    std::queue<State> pending;
    pending.push(s);
    while (!pending.empty()) {
        const State current = pending.front();
        pending.pop();

        const auto outgoing = epsilonTransitions.find(current);
        if (outgoing != epsilonTransitions.end()) {
            for (const auto& target : outgoing->second) {
                // insert() reports whether the state was new, so each state is queued once
                if (closure.insert(target).second)
                    pending.push(target);
            }
        }
    }
    return closure;
}
/**
 * Epsilon closure for an automaton without epsilon transitions:
 * the closure of a state is just that state.
 */
std::set<State> NFA::eClose(State s) const {
    std::set<State> closure;
    closure.insert(s);
    return closure;
}
namespace { // Utility functions for minimisation
using Distinguishables = std::map<State, std::set<State> >;
/**
 * Build the reversal of a DFA: every transition `from --c--> to` becomes `to --c--> from`
 * The number of states, the accepting states and their tokens are copied unchanged.
 * The starting state of the reversal is simply set to 0, it carries no meaning.
 * Priorities are not copied.
 */
NFA reverse(const DFA& d) {
    NFA reversed;
    reversed.starting = 0;
    reversed.numStates = d.numStates;
    reversed.acceptingToken = d.acceptingToken;
    reversed.accepting = d.accepting;

    for (auto source = d.delta.begin(); source != d.delta.end(); ++source) {
        for (auto edge = source->second.begin(); edge != source->second.end(); ++edge) {
            const auto& symbol = edge->first;
            const auto& target = edge->second;
            reversed.delta[target][symbol].insert(source->first);
        }
    }
    return reversed;
}
/**
* Put all reachables states in `d`, starting from `s` into `reachable`
*/
void markReachable(const DFA& d, State s, std::set<State>& reachable) {
if (reachable.count(s) > 0)
return;
reachable.insert(s);
for (const auto& charStatePair : d.delta.find(s)->second)
markReachable(d, charStatePair.second, reachable);
}
/**
* Remove unreachable nodes from the reversal of d
* return a set with the reachable states
*/
std::set<State> removeUnreachable(const DFA& d, NFA& reversed) {
std::set<State> reachable;
markReachable(d, d.starting, reachable);
std::vector<State> statesToRemove;
for (State i = 0; i < d.numStates; i++) {
if (reachable.count(i) == 0) {
statesToRemove.push_back(i);
}
}
if (reachable.count(deadState) == 0)
statesToRemove.push_back(deadState);
for (State s : statesToRemove) {
reversed.accepting.erase(s);
reversed.delta.erase(s);
}
return reachable;
}
/**
 * Compute distinguishable pairs, using the reversal of a DFA
 *
 * Two states start out distinguishable when exactly one of them is accepting, or when
 * both are accepting but for a different token. Distinguishability is then propagated
 * backwards: if (p, q) is distinguishable, so is every pair of predecessors that reach
 * p and q on the same character.
 *
 * `dist` is filled symmetrically (b in dist[a] <=> a in dist[b]).
 * `rev` is only read; lookups never insert entries into it.
 */
void computeDistinguishable(NFA& rev, Distinguishables& dist) {
    const std::string noToken;
    // Token of a state, a state without a recorded token compares equal to the empty token
    const auto tokenOf = [&rev, &noToken](State s) -> const std::string& {
        const auto it = rev.acceptingToken.find(s);
        return it == rev.acceptingToken.end() ? noToken : it->second;
    };

    std::queue<std::pair<State, State> > q;
    const bool deadAccepting = rev.accepting.count(deadState) > 0;
    for (State a = 0; a < rev.numStates; a++) {
        const bool aAccepting = rev.accepting.count(a) > 0;
        for (State b = a + 1; b < rev.numStates; b++) {
            const bool bAccepting = rev.accepting.count(b) > 0;
            if (aAccepting != bAccepting) {
                q.push(std::make_pair(a, b));
            } else if (aAccepting && tokenOf(a) != tokenOf(b)) {
                //Do not merge accepting states that define different tokens, guaranteed problems...
                q.push(std::make_pair(a, b));
            }
        }
        if (aAccepting != deadAccepting) {
            q.push(std::make_pair(a, deadState));
        }
    }

    while (!q.empty()) {
        const std::pair<State, State> p = q.front();
        q.pop();

        if (dist[p.first].count(p.second) > 0) continue;

        dist[p.first].insert(p.second);
        dist[p.second].insert(p.first);

        // Only characters on which both states have predecessors can yield new pairs
        const auto predsA = rev.delta.find(p.first);
        const auto predsB = rev.delta.find(p.second);
        if (predsA == rev.delta.end() || predsB == rev.delta.end()) continue;

        for (const auto& onSymbol : predsA->second) {
            const auto matching = predsB->second.find(onSymbol.first);
            if (matching == predsB->second.end()) continue;
            for (State nextA : onSymbol.second) {
                for (State nextB : matching->second) {
                    q.push(std::make_pair(nextA, nextB));
                }
            }
        }
    }
}
/**
 * Do the actual minimisation, using precomputed distinguishable pairs
 *
 * Reachable states are visited in increasing order, followed by the dead state.
 * The first state of every equivalence class becomes its representative and receives
 * the next free index, every later state that is not distinguishable from it is mapped
 * to the same index. The dead state always keeps the index `deadState` and never counts
 * towards `numStates`.
 *
 * Accepting information (priority, token) is taken from the representative.
 * The starting state is translated to the new numbering.
 * Unreachable states do not contribute any transitions to the result, and no
 * transitions are emitted for the dead state itself.
 */
DFA compress(const DFA& d, std::set<State>& reachables, Distinguishables& dist) {
    DFA min;
    min.numStates = 0;

    // All states that take part in the minimisation, in visiting order (dead state last)
    std::vector<State> candidates;
    for (State s = 0; s < d.numStates; ++s) {
        if (reachables.count(s) > 0)
            candidates.push_back(s);
    }
    if (reachables.count(deadState) > 0)
        candidates.push_back(deadState);

    std::map<State, State> newStates;
    State cur = 0;
    for (std::vector<State>::size_type i = 0; i < candidates.size(); ++i) {
        const State a = candidates[i];
        if (newStates.count(a) > 0)
            continue; // already merged into an earlier class

        const State id = (a == deadState) ? deadState : cur;
        newStates[a] = id;

        if (a != deadState) {
            const std::set<State>& distinctFromA = dist[a];
            for (std::vector<State>::size_type j = i + 1; j < candidates.size(); ++j) {
                const State b = candidates[j];
                if (distinctFromA.count(b) == 0)
                    newStates[b] = (b == deadState) ? deadState : cur;
            }
        }

        if (d.accepting.count(a) > 0) {
            //Since different accepting tokens should never be merged, there is no need to check the priorities
            min.accepting.insert(id);
            const auto prio = d.priority.find(a);
            if (prio != d.priority.end())
                min.priority[id] = prio->second;
            const auto token = d.acceptingToken.find(a);
            if (token != d.acceptingToken.end())
                min.acceptingToken[id] = token->second;
        }

        if (a != deadState) {
            ++min.numStates;
            ++cur;
        }
    }

    // The starting state has been renumbered like every other state
    const auto start = newStates.find(d.starting);
    min.starting = (start != newStates.end()) ? start->second : d.starting;

    //Fill the delta function of the minimized DFA
    std::set<State> filled;
    for (State s = 0; s < d.numStates; s++) {
        const auto mapped = newStates.find(s);
        if (mapped == newStates.end()) continue; // unreachable, not part of the result
        if (filled.count(mapped->second) > 0) continue;

        const auto outgoing = d.delta.find(s);
        if (outgoing == d.delta.end()) continue;
        filled.insert(mapped->second);

        for (const auto& p : outgoing->second) {
            const auto target = newStates.find(p.second);
            if (target != newStates.end())
                min.delta[mapped->second][p.first] = target->second;
        }
    }
    return min;
}
} //namespace
/**
 * Minimize a DFA: reverse it, drop the unreachable states from the reversal,
 * find all distinguishable state pairs and merge whatever is left indistinguishable.
 */
DFA minimize(const DFA& d) {
    Distinguishables distinguishable;

    NFA backwards = reverse(d);
    std::set<State> live = removeUnreachable(d, backwards);
    computeDistinguishable(backwards, distinguishable);

    return compress(d, live, distinguishable);
}
namespace { // Utility function for mssc
/**
* Get the next state, by taking the union of all next states in the NFA by following any character in `oldstate`
*/
std::set<State> getNextState(const std::set<State>& oldstate, char symbol, const NFA& e) {
std::set<State> states;
for(const auto &state: oldstate) {
auto a = e.delta.find(state);
if(a != e.delta.end()) {
auto newStates = a->second.find(symbol);
if(newStates != a->second.end()) {
for(auto &newstate:newStates->second) {
auto eclosestates = e.eClose(newstate);
for(auto &eclosestate:eclosestates) {
states.insert(eclosestate);
}
}
}
}
}
return states;
}
} //namespace
/**
 * Modified subset construction: turn an NFA (possibly with epsilon transitions,
 * through its eClose) into a DFA.
 *
 * Every DFA state corresponds to a set of NFA states. The empty set is the dead state,
 * it keeps the index `deadState` and loops to itself on every character.
 * All other sets are numbered sequentially, in the order of the set comparison.
 *
 * A DFA state is accepting as soon as its set contains an accepting NFA state. It gets
 * the priority and token of the member with the lowest priority value, the first such
 * member in set order wins ties. A member without a recorded priority counts as the
 * maximal priority value, a member without a recorded token yields the empty token.
 */
DFA mssc(const NFA& e) {
    using StateSet = std::set<State>;
    using SetTransitions = std::map<char, StateSet>;

    //Temporary, improper dfa structure
    std::map<StateSet, SetTransitions> dfa;

    //The dead state: the empty set, going to itself on everything
    SetTransitions& deadTransitions = dfa[StateSet()];
    for (int c = 0; c < 256; c++) {
        deadTransitions[static_cast<char>(c)] = StateSet();
    }

    //Lazy evaluation, on a still implicit DFA
    const StateSet startSet = e.eClose(e.starting);
    dfa[startSet]; //Register the start set up front, so a loop back to it is not queued again
    std::queue<StateSet> tocheck;
    tocheck.push(startSet);
    while (!tocheck.empty()) {
        const StateSet state = std::move(tocheck.front());
        tocheck.pop();

        SetTransitions transitions;
        for (int c = 0; c < 256; c++) {
            const char symbol = static_cast<char>(c);
            StateSet nextstate = getNextState(state, symbol, e);
            if (dfa.find(nextstate) == dfa.end()) {
                dfa[nextstate] = SetTransitions();
                tocheck.push(nextstate);
            }
            transitions[symbol] = std::move(nextstate);
        }
        dfa[state] = std::move(transitions);
    }

    //Assign sequential indices for the actual DFA
    std::map<StateSet, State> lookup;
    State numStates = 0;
    for (const auto& entry : dfa) {
        if (entry.first.empty())
            lookup[entry.first] = deadState;
        else
            lookup[entry.first] = numStates++;
    }

    //Setup the actual DFA
    DFA result;
    result.numStates = numStates;
    result.starting = lookup[startSet];

    //Setup the transitions
    //Merge priorities and acceptingTokens
    for (const auto& entry : dfa) {
        const State newstate = lookup[entry.first];

        //An explicit flag, so a state whose best priority is the maximal value still accepts
        bool accepting = false;
        Priority best = std::numeric_limits<Priority>::max();
        std::string acTok;
        for (const auto& item : entry.first) {
            if (e.accepting.count(item) == 0)
                continue;
            const auto prio = e.priority.find(item);
            const Priority candidate = (prio != e.priority.end()) ? prio->second : std::numeric_limits<Priority>::max();
            if (!accepting || candidate < best) {
                accepting = true;
                best = candidate;
                const auto token = e.acceptingToken.find(item);
                if (token != e.acceptingToken.end())
                    acTok = token->second;
                else
                    acTok.clear();
            }
        }

        //Insert the found priority
        if (accepting) {
            result.accepting.insert(newstate);
            result.priority[newstate] = best;
            result.acceptingToken[newstate] = acTok;
        }

        //Do transition
        std::map<char, State> newtransitions;
        for (const auto& transition : entry.second) {
            newtransitions[transition.first] = lookup[transition.second];
        }
        result.delta[newstate] = std::move(newtransitions);
    }
    return result;
}
/**
 * Combine several ENFAs into a single one.
 *
 * State 0 of the result is a fresh starting state. The states of every input are
 * renumbered by a running offset (1 for the first input, then increased by the
 * number of states of each input handled so far), and the new starting state gets
 * an epsilon transition to the shifted starting state of every input.
 */
ENFA merge(const std::vector<ENFA>& enfas) {
    ENFA result;
    result.numStates = 1;
    result.starting = 0;

    unsigned int offset = 1;
    for (auto part = enfas.begin(); part != enfas.end(); ++part) {
        // Renumber a whole set of states of the current input
        const auto shifted = [offset](const std::set<State>& original) -> std::set<State> {
            std::set<State> moved;
            for (auto state : original)
                moved.insert(state + offset);
            return moved;
        };

        result.numStates += part->numStates;

        for (const auto accepting : part->accepting)
            result.accepting.insert(accepting + offset);

        for (const auto& priority : part->priority)
            result.priority[priority.first + offset] = priority.second;

        for (const auto& acceptingToken : part->acceptingToken)
            result.acceptingToken[acceptingToken.first + offset] = acceptingToken.second;

        for (const auto& perState : part->delta) {
            std::map<char, std::set<State> > renumbered;
            for (const auto& perSymbol : perState.second)
                renumbered[perSymbol.first] = shifted(perSymbol.second);
            result.delta[perState.first + offset] = renumbered;
        }

        for (const auto& perState : part->epsilonTransitions)
            result.epsilonTransitions[perState.first + offset] = shifted(perState.second);

        // Hook the input up to the shared starting state
        result.epsilonTransitions[0].insert(part->starting + offset);
        offset += part->numStates;
    }
    return result;
}
} //namespace lxs