#!/usr/bin/env python2.6 """ re2fa.py 0.1 Usage: re2fa.py re Accept a regular expression on the command line and generate a corresponding NFA. For REs, "@" is the empty set, and "-" is the empty string. Options: -h --help Print this help. -d --dot Print a description of the automaton in the Graphviz language suitable for processing with dot to get a graphical representation of the automaton. -p --print Print back the automaton in tabular form. Copyright 2015 Stephan Schulz, schulz@eprover.org This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program ; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA The original copyright holder can be contacted as Stephan Schulz DHBW Stuttgart Informatik Postfach 10 05 63 D-70004 Stuttgart Germany or via email (address above). """ import sys import re import getopt start_re = re.compile("dummy_x23z *-> *([A-Za-z0-9]+)") trans_re = re.compile("([A-Za-z0-9]+) *-> *([A-Za-z0-9]+) *\[label *= *([^]]*)\]") accept_re = re.compile("node \[shape = doublecircle\];") naccept_re = re.compile("node") state_re = re.compile("([A-Za-z0-9]+) *\[label=[^]]*\];") subscr_re=re.compile("[0-9]*$") def prettyState(str): mo = subscr_re.search(str) if mo: init = str[0:mo.start()] subscr = str[mo.start():] str = "%s [label=<%s%s>]"%(str,init,subscr) return str class DFA(object): """ Object representing a (deterministic) finite automaton. """ def __init__(self): self.states = set() self.sigma = [] self.delta = {} self.start = None self.accept = set() def add_state(self, state): self.states.add(state) def set_accepting(self, state): self.states.add(state) self.accept.add(state) def set_start(self, state): # Will overwrite previous start state, if any self.states.add(state) self.start = state def add_trans(self, fr, char, to): self.states.add(fr) self.states.add(to) if not char in self.sigma: self.sigma.append(char) self.delta[(fr, char)] = to def delta_fun(self, state, letter): try: return self.delta[(state, letter)] except: return None def __str__(self): res = list() sigma = sorted(self.sigma) l1 = " | "+" ".join(["%4s"%(i,) for i in sigma]) res.append(l1) l2 = "-"*(len(l1)+1) res.append(l2) for i in sorted(self.states): if i == self.start: start = "->" else: start = " " if i in self.accept: accept = "*" else: accept = " " xstate = "%2s %1s %3s | " %(start, accept, i) l = xstate+" ".join(["%4s"% (self.delta[(i,s)],) for s in sigma]) res.append(l) return "\n".join(res) dfa_dot_template = """ digraph finite_state_machine { rankdir=LR; size="8,5" node [shape = plaintext]; dummy_x23z [label = ""]; node [shape = doublecircle]; %s node [shape = circle]; %s %s } """ def dotify(self): accepts_str = " ".join(map(prettyState,self.accept)) if accepts_str: accepts_str += ";" other_states = [state for state in self.states if not state in self.accept] other_str = " ".join(map(prettyState,other_states)) if other_str: other_str += ";" transitions = [] if self.start: transitions.append(" dummy_x23z -> %s\n"%(self.start,)) for i in self.delta.keys(): source, label = i to = self.delta_fun(source, label) transitions.append(" %s -> %s [label = \"%s\"];\n"% (source, to, label)) transitions_str = "".join(transitions) return self.dfa_dot_template%(accepts_str,other_str,transitions_str) class NFA(object): """ Object representing a (non-deterministic) finite automaton. """ def __init__(self): self.states = set() self.sigma = [] self.delta = set() self.start = None self.accept = set() def add_state(self, state): self.states.add(state) def set_accepting(self, state): self.states.add(state) self.accept.add(state) def set_start(self, state): # Will overwrite previous start state, if any self.states.add(state) self.start = state def add_trans(self, fr, char, to): self.states.add(fr) self.states.add(to) if not char in self.sigma: self.sigma.append(char) self.delta.add((fr, char, to)) def add_nfa(self,nfa): for s in nfa.states: self.add_state(s) for c in nfa.sigma: if not c in self.sigma: self.sigma.append(c) self.delta.update(nfa.delta) def delta_rel(self, state, char): return [t[2] for t in self.delta if (t[0]==state and t[1]==char)] def delta_star(self, state, char): res = set() for s in self.delta_rel(state, char): res.update(self.epsilon_closure(s)) return res def Delta(self, states, char): res = set() for s in states: res.update(self.delta_star(s, char)) return res def delta_rel_str(self, state, char): return "{"+",".join(self.delta_rel(state, char))+"}" def comp_epsilon_closure(self, state, closure): if state in closure: return closure closure.add(state) for s in self.delta_rel(state, ""): self.comp_epsilon_closure(s, closure) return closure def epsilon_closure(self, state): return self.comp_epsilon_closure(state, set()) def __str__(self): res = list() sigma = sorted(self.sigma) l1 = " | "+" ".join(["%15s"%(i,) for i in sigma]) res.append(l1) l2 = "-"*(len(l1)+1) res.append(l2) for i in sorted(self.states): if i == self.start: start = "->" else: start = " " if i in self.accept: accept = "*" else: accept = " " xstate = "%2s %1s %3s | " %(start, accept, i) d = " ".join(["%15s"%(self.delta_rel_str(i, c),) for c in sigma]) l = xstate + d res.append(l) return "\n".join(res) def parse_gv(self, str): lines = str.split("\n") accepting = False for line in lines: mo = accept_re.search(line) if mo: print "Accept" accepting = True else: mo = naccept_re.search(line) if mo: print "Non-Accept" accepting = False mo = state_re.search(line) if mo: print "Found state", mo.groups()[0] if accepting: self.set_accepting(mo.groups()[0]) mo = start_re.search(line) if mo: print "Start state is", mo.groups()[0] self.set_start(mo.groups()[0]) else: print "Scanning line", line mo = trans_re.search(line) if mo: print "Found" fr = mo.groups()[0] to = mo.groups()[1] transstr = mo.groups()[2].strip("\"") chars = transstr.split(",") for c in chars: c=c.strip() char = "" if c != "<ε>": char = c print "Transition %s -- %s -> %s"% (fr, char, to) self.add_trans(fr, char, to) nfa_dot_template = """ digraph finite_state_machine { rankdir=LR; node [shape = plaintext]; dummy_x23z [label = ""]; node [shape = doublecircle]; %s node [shape = circle]; %s %s } """ def dotify(self): accepts_str = " ".join(map(prettyState,self.accept)) if accepts_str: accepts_str += ";" other_states = [state for state in self.states if not state in self.accept] other_str = " ".join(map(prettyState,other_states)) if other_str: other_str += ";" transitions = [] if self.start: transitions.append(" dummy_x23z -> %s\n"%(self.start,)) for i in self.delta: source, label, to = i if label == "": label="ε" transitions.append(" %s -> %s [label = \"%s\"];\n"% (source, to, label)) transitions_str = "".join(transitions) return self.nfa_dot_template%(accepts_str,other_str,transitions_str) def make_det(self): dfa = DFA() states = {} sigma = [l for l in self.sigma if l != ""] S0 = frozenset(self.epsilon_closure(self.start)) states[S0] = "S0" dfa.set_start("S0") #print "S0 =", S0 worklist =[ ("S0", S0) ] state_counter = 1 while worklist: item = worklist.pop(0) name, state = item for l in sigma: newstate = frozenset(self.Delta(state, l)) # print "Delta(%s, %s) = "%(name,l), newstate if not newstate in states: newname = "S%d"% (state_counter,) state_counter += 1 states[newstate] = newname worklist.append((newname, newstate)) #print newname, " = ", newstate if newstate.intersection(self.accept): dfa.set_accepting(newname) else: #print "State is equal to ", states[newstate] newname = states[newstate] dfa.add_trans(name, l, newname) return dfa class RE(object): """ Class representing regular expressions. We use - for the empty string and @ for the empty set. """ q_counter=-1 def __init__(self, type, subexpr1=None, subexpr2=None): # print "RE(", type, subexpr1, subexpr2, ")" self.type = type self.subexpr1 = subexpr1 self.subexpr2 = subexpr2 def getNewState(self): RE.q_counter+=1 return "q%d"%(RE.q_counter,) def makeSubNFA(self): """Returns NFA, startstate, accepting state""" res = NFA() if self.type=="+": q0 = self.getNewState() qf = self.getNewState() res1,q1_0, q1_f = self.subexpr1.makeSubNFA() res2,q2_0, q2_f = self.subexpr2.makeSubNFA() res1.add_nfa(res2) res1.add_trans(q0, "", q1_0) res1.add_trans(q0, "", q2_0) res1.add_trans(q1_f, "", qf) res1.add_trans(q2_f, "", qf) res=res1 elif self.type=="&": res1,q0, qe = self.subexpr1.makeSubNFA() res2,qs, qf = self.subexpr2.makeSubNFA() res1.add_nfa(res2) res1.add_trans(qe,"",qs) res=res1 elif self.type=="*": q0 = self.getNewState() qf = self.getNewState() res,q1_0, q1_f = self.subexpr1.makeSubNFA() res.add_trans(q0, "", q1_0) res.add_trans(q0, "", qf) res.add_trans(q1_f, "", qf) res.add_trans(q1_f, "", q1_0) elif self.type=="-": q0 = self.getNewState() qf = self.getNewState() res.add_trans(q0, "", qf) elif self.type=="@": q0 = self.getNewState() qf = self.getNewState() res.add_state(q0) res.add_state(qf) else: q0 = self.getNewState() qf = self.getNewState() res.add_trans(q0, self.type, qf) return res, q0, qf def makeNFA(self): res,q0,qf=self.makeSubNFA() res.set_start(q0) res.set_accepting(qf) return res def __str__(self): if self.type == "+": return "("+str(self.subexpr1)+"+"+str(self.subexpr2)+")" elif self.type == "&": return "("+str(self.subexpr1)+str(self.subexpr2)+")" elif self.type == "*": return "("+str(self.subexpr1)+"*)" else: return self.type def check(inp, l): if type(l) == type("a") and inp[0]!=l or not inp[0] in l: print "Error, expected "+str(l)+", found "+inp[0] sys.exit(1) def accept(inp, l): check(inp,l) inp.pop(0) def parse_alt(inp): res = parse_cat(inp) while inp[0]=="+": inp.pop(0) res2=parse_alt(inp) res=RE("+", res, res2) return res def parse_cat(inp): res = parse_kleene(inp) while not inp[0] in ["eof", "*", "+", ")"]: res2=parse_cat(inp) res=RE("&", res, res2) return res def parse_kleene(inp): res = parse_base(inp) while inp[0]=="*": inp.pop(0) res=RE("*", res) return res def parse_base(inp): if inp[0] in ["eof", "*", "+", ")"]: print "Error, regexp-starting character expected, got", inp[0] exit(1) if inp[0] == "(": inp.pop(0) res = parse_alt(inp) accept(inp, ")") else: res = RE(inp.pop(0)) return res dot = False prt = False dfa = False if __name__ == '__main__': opts, args = getopt.gnu_getopt(sys.argv[1:], "hDdp", ["help", "dfa", "dot", "print"]) for option, optarg in opts: if option in ["-h", "--help"]: print __doc__ sys.exit() elif option in ["-D", "--dfa"]: dfa = True elif option in ["-d", "--dot"]: dot = True elif option in ["-p", "--print"]: prt = True else: sys.exit("Unknown option "+ option) if len(args)<1: print __doc__ sys.exit() inp = [c for c in args[0]]+["eof"] re = parse_alt(inp) res = re.makeNFA() if dfa: res = res.make_det() if prt: print res if dot: print res.dotify()