// Lexesis/src/automata.cpp

#include "Lexesis/automata.h"

#include <algorithm>
#include <cassert>
#include <climits>
#include <iostream>
#include <limits>
#include <map>
#include <queue>
#include <set>
#include <string>
#include <utility>
#include <vector>

namespace lxs {
    std::string toDot(const DFA& d)
    {
        std::string s = "digraph {\nrankdir=LR\nin [shape=point style=invis]\n";

        for (State state = 0; state < d.numStates; state++)
        {
            std::string label = std::to_string(state);
            s += std::to_string(state) + " [";
            if (state == d.starting)
                s += "color=yellow ";
            if (d.accepting.count(state) > 0) {
                s += "color=green shape=doublecircle ";
                label += "\\np=" + std::to_string(d.priority.find(state)->second) + "\\nac=" + d.acceptingToken.find(state)->second;
            }
            s += "label=\"" + label + "\"]\n";
        }

        for (const auto& tmp : d.delta)
        {
            const auto& from = tmp.first;
            for (const auto& trans : tmp.second)
            {
                s += std::to_string(from) + " -> " + std::to_string(trans.second) + " [label=\"" + trans.first + "\"]\n";
            }
        }

        s += "in -> " + std::to_string(d.starting) + "\n}\n";
        return s;
    }

    std::string toDot(const NFA& n)
    {
        std::string s = "digraph {\nrankdir=LR\nin [shape=point style=invis]\n";

        for (State state = 0; state < n.numStates; state++)
        {
            std::string label = std::to_string(state);
            s += std::to_string(state) + " [";
            if (state == n.starting)
                s += "color=yellow ";
            if (n.accepting.count(state) > 0) {
                s += "color=green shape=doublecircle ";
                label += "\\np=" + std::to_string(n.priority.find(state)->second) + "\\nac=" + n.acceptingToken.find(state)->second;
            }
            s += "label=\"" + label + "\"]\n";
        }

        for (const auto& tmp : n.delta)
        {
            const auto& from = tmp.first;
            for (const auto& trans : tmp.second)
            {
                for (const auto& to : trans.second)
                    s += std::to_string(from) + " -> " + std::to_string(to) + " [label=\"" + trans.first + "\"]\n";
            }
        }

        s += "in -> " + std::to_string(n.starting) + "\n}\n";
        return s;
    }

    std::string toDot(const ENFA& e)
    {
        std::string s = "digraph {\nrankdir=LR\nin [shape=point style=invis]\n";

        for (State state = 0; state < e.numStates; state++)
        {
            std::string label = std::to_string(state);
            s += std::to_string(state) + " [";
            if (state == e.starting)
                s += "color=yellow ";
            if (e.accepting.count(state) > 0) {
                s += "color=green shape=doublecircle ";
                label += "\\np=" + std::to_string(e.priority.find(state)->second) + "\\nac=" + e.acceptingToken.find(state)->second;
            }
            s += "label=\"" + label + "\"]\n";
        }

        for (const auto& tmp : e.delta)
        {
            const auto& from = tmp.first;
            for (const auto& trans : tmp.second)
            {
                for (const auto& to : trans.second)
                    s += std::to_string(from) + " -> " + std::to_string(to) + " [label=\"" + trans.first + "\"]\n";
            }
        }

        for (const auto& etrans : e.epsilonTransitions)
        {
            for (const auto& dest : etrans.second)
                s += std::to_string(etrans.first) + " -> " + std::to_string(dest) + " [label=\"ε\"]\n";
        }

        s += "in -> " + std::to_string(e.starting) + "\n}\n";
        return s;
    }

    /**
     * Compute the epsilon closure of `s`: `s` itself plus every state that
     * can be reached from it by following only epsilon transitions.
     */
    std::set<State> ENFA::eClose(State s) const {
        std::set<State> closure;
        closure.insert(s);

        std::queue<State> frontier;
        frontier.push(s);

        // Breadth-first walk; a state is queued only the first time it is seen
        for (; !frontier.empty(); frontier.pop()) {
            const auto outgoing = epsilonTransitions.find(frontier.front());
            if (outgoing == epsilonTransitions.end())
                continue;
            for (const auto& target : outgoing->second) {
                if (closure.insert(target).second)
                    frontier.push(target);
            }
        }
        return closure;
    }

    /**
     * Closure of `s` for this automaton type: always just the state itself.
     */
    std::set<State> NFA::eClose(State s) const {
        std::set<State> closure;
        closure.insert(s);
        return closure;
    }

    namespace { // Utility functions for minimisation

        using Distinguishables = std::map<State, std::set<State> >;

        /**
         * Build the reversal of a given DFA: every transition
         * `from --c--> to` becomes `to --c--> from`.
         * The accepting states and their associated tokens are copied over
         * unchanged; priorities are not copied.
         * The starting state of the result is simply set to 0 and carries no
         * meaning.
         */
        NFA reverse(const DFA& d) {
            NFA rev;

            rev.starting = 0;
            rev.numStates = d.numStates;
            rev.accepting = d.accepting;
            rev.acceptingToken = d.acceptingToken;

            for (const auto& outgoing : d.delta) {
                const auto& source = outgoing.first;
                for (const auto& edge : outgoing.second) {
                    const auto& symbol = edge.first;
                    const auto& target = edge.second;
                    // Flip the edge: target now points back at source
                    rev.delta[target][symbol].insert(source);
                }
            }

            return rev;
        }

        /**
         * Put all reachables states in `d`, starting from `s` into `reachable`
         */
        void markReachable(const DFA& d, State s, std::set<State>& reachable) {
            if (reachable.count(s) > 0)
                return;
            reachable.insert(s);
            for (const auto& charStatePair : d.delta.find(s)->second)
                markReachable(d, charStatePair.second, reachable);
        }

        /**
         * Remove unreachable nodes from the reversal of d
         * return a set with the reachable states
         */
        std::set<State> removeUnreachable(const DFA& d, NFA& reversed) {
            std::set<State> reachable;
            markReachable(d, d.starting, reachable);

            std::vector<State> statesToRemove;

            for (State i = 0; i < d.numStates; i++) {
                if (reachable.count(i) == 0) {
                    statesToRemove.push_back(i);
                }
            }
            if (reachable.count(deadState) == 0)
                statesToRemove.push_back(deadState);

            for (State s : statesToRemove) {
                reversed.accepting.erase(s);
                reversed.delta.erase(s);
            }

            return reachable;
        }

        /**
         * Compute distinguishable pairs, using the reversal of a DFA.
         *
         * Seeds the work queue with every pair of states that differ in
         * acceptance, or that are both accepting but for different tokens,
         * then propagates backwards: if (x, y) is distinguishable, so is any
         * pair of states that reach x and y on the same symbol.
         *
         * `dist` is filled symmetrically (a in dist[b] iff b in dist[a]).
         * `rev` is only read; lookups use find() so that no empty entries
         * are inserted into its transition table or token map.
         */
        void computeDistinguishable(NFA& rev, Distinguishables& dist) {
            std::queue<std::pair<State, State> > q;

            // Token of a state, or the empty string when none is recorded
            const auto tokenOf = [&rev](State s) -> std::string {
                const auto it = rev.acceptingToken.find(s);
                return it == rev.acceptingToken.end() ? std::string() : std::string(it->second);
            };

            for (State a = 0; a < rev.numStates; a++) {
                for (State b = a + 1; b < rev.numStates; b++) {
                    if (rev.accepting.count(a) != rev.accepting.count(b)) {
                        q.push(std::make_pair(a, b));
                    } else if (rev.accepting.count(a) && tokenOf(a) != tokenOf(b)) {
                        //Do not merge accepting states that define different tokens, guaranteed problems...
                        q.push(std::make_pair(a, b));
                    }
                }
                if (rev.accepting.count(a) != rev.accepting.count(deadState)) {
                    q.push(std::make_pair(a, deadState));
                }
            }

            while (!q.empty()) {
                const std::pair<State, State> p = q.front();
                q.pop();
                if (dist[p.first].count(p.second) > 0) continue;
                dist[p.first].insert(p.second);
                dist[p.second].insert(p.first);

                // Only symbols on which BOTH states have predecessors can
                // yield new pairs, so walk the existing entries of one side
                // and look each symbol up in the other.
                const auto predsA = rev.delta.find(p.first);
                const auto predsB = rev.delta.find(p.second);
                if (predsA == rev.delta.end() || predsB == rev.delta.end())
                    continue;

                for (const auto& onSymbol : predsA->second) {
                    const auto matching = predsB->second.find(onSymbol.first);
                    if (matching == predsB->second.end())
                        continue;
                    for (State nextA : onSymbol.second) {
                        for (State nextB : matching->second) {
                            q.push(std::make_pair(nextA, nextB));
                        }
                    }
                }
            }
        }

        /**
         * Do the actual minimisation, using precomputed distinguishable pairs.
         *
         * Reachable states that are not marked distinguishable in `dist` are
         * merged into one state of the result. New indices are handed out in
         * ascending order of the smallest original state of each group; the
         * dead state always keeps the index `deadState` and is not counted in
         * `numStates`.
         *
         * @param d          the DFA to minimise
         * @param reachables states of `d` reachable from its starting state
         * @param dist       symmetric distinguishability relation
         * @return the minimised DFA; its starting state is the new index of
         *         `d.starting`
         */
        DFA compress(const DFA& d, std::set<State>& reachables, Distinguishables& dist) {
            DFA min;

            std::map<State, State> newStates;
            std::set<State> done;

            // Both loops below visit 0 .. numStates-1 and then deadState: the
            // index d.numStates is temporarily swapped for deadState inside
            // the body and swapped back before the loop increment.
            State cur = 0;
            for (State a = 0; a <= d.numStates; a++) {
                if (a == d.numStates)
                    a = deadState;

                if (reachables.count(a) == 0 || done.count(a) > 0) {
                    if (a == deadState)
                        break;
                    continue;
                }

                newStates[a] = cur;
                if (a == deadState)
                    newStates[a] = deadState;

                done.insert(a);
                if (a != deadState) {
                    for (State b = a + 1; b <= d.numStates; b++) {
                        if (b == d.numStates)
                            b = deadState;

                        if (reachables.count(b) > 0 && dist[a].count(b) == 0) {
                            done.insert(b);
                            newStates[b] = cur;
                            if (b == deadState)
                                newStates[b] = deadState;
                        }

                        if (b == deadState)
                            b = d.numStates;
                    }
                }

                if (d.accepting.count(a) > 0) {
                    //Since different accepting tokens should never be merged, there is no need to check the priorities
                    // (assumes every accepting state has a priority and a token entry)
                    min.accepting.insert(cur);
                    min.priority[cur] = d.priority.find(a)->second;
                    min.acceptingToken[cur] = d.acceptingToken.find(a)->second;
                }

                if (a != deadState) {
                    ++min.numStates;
                    ++cur;
                }

                if (a == deadState)
                    a = d.numStates;
            }

            // The starting state must be expressed in the NEW numbering; the
            // old index is only kept when it was never assigned a new one.
            const auto startIt = newStates.find(d.starting);
            min.starting = (startIt != newStates.end()) ? startIt->second : d.starting;

            done.clear();
            //Fill the delta function of the minimized DFA
            for (State s = 0; s < d.numStates; s++) {
                // Unreachable states have no new index and must not
                // contribute transitions to any merged state.
                const auto mapped = newStates.find(s);
                if (mapped == newStates.end()) continue;

                // Only the first (smallest) member of each group defines the
                // transitions of the merged state.
                if (!done.insert(mapped->second).second) continue;

                const auto outgoing = d.delta.find(s);
                if (outgoing == d.delta.end()) continue;

                for (const auto& p : outgoing->second)
                    min.delta[mapped->second][p.first] = newStates[p.second];
            }

            return min;
        }

    } //namespace

    /**
     * Minimise a DFA.
     * Steps: reverse the automaton, determine which states are reachable
     * (removing the others from the reversal), compute the distinguishable
     * state pairs on the reversal, and finally merge the states that were
     * not found distinguishable.
     */
    DFA minimize(const DFA& d) {
        NFA reversal = reverse(d);
        std::set<State> reachableStates = removeUnreachable(d, reversal);

        Distinguishables distinguishable;
        computeDistinguishable(reversal, distinguishable);

        return compress(d, reachableStates, distinguishable);
    }

    namespace { // Utility function for mssc

        /**
         * Get the next state, by taking the union of all next states in the NFA by following any character in `oldstate`
         */
        std::set<State> getNextState(const std::set<State>& oldstate, char symbol, const NFA& e) {
            std::set<State> states;
            for(const auto &state: oldstate) {
                 auto a = e.delta.find(state);
                 if(a != e.delta.end()) {
                     auto newStates = a->second.find(symbol);
                     if(newStates != a->second.end()) {
                         for(auto &newstate:newStates->second) {
                            auto eclosestates = e.eClose(newstate);
                            for(auto &eclosestate:eclosestates) {
                                states.insert(eclosestate);
                            }
                         }

                     }
                 }
            }
            return states;
        }

    } //namespace

    /**
     * Convert an NFA to a DFA by subset construction.
     *
     * Every non-empty set of NFA states that is reachable from the closure
     * of the starting state becomes one DFA state, numbered sequentially.
     * The empty set becomes `deadState`, which loops to itself on all 256
     * symbols and is not counted in `numStates`.
     *
     * A DFA state is accepting when its subset contains at least one
     * accepting NFA state; it takes the priority and token of the member
     * with the numerically lowest priority (the first such member wins on
     * ties).
     */
    DFA mssc(const NFA& e) {
        typedef std::set<State> Subset;
        typedef std::map<char, Subset> SubsetTransitions;

        //Temporary, improper dfa structure
        std::map<Subset, SubsetTransitions> dfa;

        //The dead state (empty subset) goes to itself on every symbol
        SubsetTransitions deadTransitions;
        for (int c = 0; c < 256; c++) {
            deadTransitions[static_cast<char>(c)] = Subset();
        }
        dfa[Subset()] = std::move(deadTransitions);

        //Lazy evaluation, on a still implicit DFA
        const Subset start = e.eClose(e.starting);
        std::queue<Subset> tocheck;
        dfa[start]; //Register the start subset up front so it is never queued a second time
        tocheck.push(start);
        while(!tocheck.empty()) {
            Subset state = std::move(tocheck.front());
            tocheck.pop();
            SubsetTransitions trans;
            for (int c = 0; c < 256; c++) {
                const char symbol = static_cast<char>(c);
                Subset nextstate = getNextState(state, symbol, e);
                if(dfa.find(nextstate) == dfa.end()) {
                    dfa[nextstate] = SubsetTransitions();
                    tocheck.push(nextstate);
                }
                trans[symbol] = std::move(nextstate);
            }
            dfa[state] = std::move(trans);
        }

        //Assign sequential indices for the actual DFA
        std::map<Subset, State> lookup;
        State numStates = 0;
        for(const auto &state : dfa) {
            if (state.first.size())
                lookup[state.first] = numStates++;
            else
                lookup[state.first] = deadState;
        }

        //Setup the actual DFA
        DFA result;
        result.numStates = numStates;
        result.starting = lookup.find(start)->second;

        //Setup the transitions
        //Merge priorities and acceptingTokens
        for(const auto &state : dfa) {
            //Acceptance is tracked with an explicit flag, so that no priority
            //value has to double as a "not accepting" marker
            bool isAccepting = false;
            Priority priority = std::numeric_limits<Priority>::max();
            std::string acTok = "";

            const State newstate = lookup.find(state.first)->second;
            for(const auto &item : state.first) {
                if(e.accepting.count(item) > 0) {
                    const auto& newPrior = e.priority.find(item)->second;
                    if (!isAccepting || newPrior < priority)
                    {
                        isAccepting = true;
                        priority = newPrior;
                        acTok = e.acceptingToken.find(item)->second;
                    }
                }
            }

            //Insert the found priority
            if(isAccepting) {
                result.accepting.insert(newstate);
                result.priority[newstate] = priority;
                result.acceptingToken[newstate] = acTok;
            }

            //Do transition
            std::map<char,State> newtransitions;
            for(const auto &transition : state.second) {
                //Every target subset was registered in `dfa`, hence in `lookup`
                newtransitions[transition.first] = lookup.find(transition.second)->second;
            }

            result.delta[newstate] = std::move(newtransitions);
        }

        return result;
    }


    /**
     * Combine several ENFAs into a single one.
     *
     * The result gets a fresh starting state 0. Each input automaton is
     * copied with all of its state numbers shifted by a running offset
     * (starting at 1), and state 0 receives an epsilon transition to the
     * shifted starting state of every input.
     */
    ENFA merge(const std::vector<ENFA>& enfas) {
        ENFA result;
        result.starting = 0;
        result.numStates = 1;

        unsigned int offset = 1;
        for (const auto& enfa : enfas) {
            // Acceptance information, renumbered
            for (const auto state : enfa.accepting)
                result.accepting.insert(state + offset);
            for (const auto& entry : enfa.priority)
                result.priority[entry.first + offset] = entry.second;
            for (const auto& entry : enfa.acceptingToken)
                result.acceptingToken[entry.first + offset] = entry.second;

            // Symbol transitions, with both source and targets renumbered
            for (const auto& row : enfa.delta) {
                std::map<char, std::set<State> > shiftedRow;
                for (const auto& edge : row.second) {
                    std::set<State> shiftedTargets;
                    for (const auto target : edge.second)
                        shiftedTargets.insert(target + offset);
                    shiftedRow[edge.first] = shiftedTargets;
                }
                result.delta[row.first + offset] = shiftedRow;
            }

            // Epsilon transitions, renumbered the same way
            for (const auto& row : enfa.epsilonTransitions) {
                std::set<State> shiftedTargets;
                for (const State target : row.second)
                    shiftedTargets.insert(target + offset);
                result.epsilonTransitions[row.first + offset] = shiftedTargets;
            }

            // Hook this automaton up to the shared starting state
            result.epsilonTransitions[0].insert(enfa.starting + offset);

            result.numStates += enfa.numStates;
            offset += enfa.numStates;
        }
        return result;
    }


} //namespace lxs