Plasma GitLab Archive
Projects Blog Knowledge

(* $Id: pxp_dfa.ml,v 1.3 2002/02/20 10:01:36 gerd Exp $
 * ----------------------------------------------------------------------
 *
 *)

open Pxp_aux


module Graph = struct
  class id_obj = object end

  type vertex =
      { mutable edges_out : (string * vertex) list;
	mutable edges_out_map : vertex StringMap.t;
	mutable edges_in : (vertex * string) list;
	mutable id : id_obj;
      }
  and graph =
      (* Currently not managed *)
      unit

  exception Edge_not_unique

  let create () = ()

  let new_vertex g =
    let v =
      { edges_out = [];
	edges_out_map = StringMap.empty;
	edges_in = [];
	id = new id_obj;
      } in
    v

  let new_edge v_from e v_to =
    (* if v_from.graph != v_to.graph then
          invalid_arg "Pxp_dfa.Graph.new_edge";
    *)
    try 
      let v = StringMap.find e v_from.edges_out_map in
      if v != v_to then
	raise Edge_not_unique;
    with
	Not_found ->
	  v_from.edges_out     <- (e, v_to) :: v_from.edges_out;
	  v_from.edges_out_map <- StringMap.add e v_to v_from.edges_out_map;
	  v_to.edges_in        <- (v_from, e) :: v_to.edges_in;
	  ()

  (* let graph_of_vertex v = v.graph *)

  let union g1 g2 = ()

  let outgoing_edges v =
    v.edges_out

  let ingoing_edges v =
    v.edges_in

  let follow_edge v e =
    StringMap.find e v.edges_out_map  (* or raise Not_found *)
end
;;


module VertexOrd = struct
  type t = Graph.vertex
  let compare v1 v2 =
    (* if v1.Graph.graph != v2.Graph.graph then
        invalid_arg "Pxp_dfa.VertexOrd.compare";
    *)
    compare v1.Graph.id v2.Graph.id  (* compares the OIDs *)
end
;;

module VertexSet = Set.Make(VertexOrd);;


type dfa_definition =
    { dfa_graph : Graph.graph;
      dfa_start : Graph.vertex;
      dfa_stops : VertexSet.t;
      dfa_null  : bool;
    }
;;

(**********************************************************************)

(* Now that we have all the auxiliary data types, it is time for the
 * algorithm that transforms regexps to DFAs.
 *)

open Pxp_types

let dfa_of_regexp_content_model re =
  let rec get_dfa re =
    match re with
	Child e ->
	  let g = Graph.create() in
	  let v1 = Graph.new_vertex g in
	  let v2 = Graph.new_vertex g in
	  Graph.new_edge v1 e v2;
	  { dfa_graph = g;
	    dfa_start = v1;
	    dfa_stops = VertexSet.singleton v2;
	    dfa_null = false;
	  }
	  
      | Seq [] ->
	  invalid_arg "Pxp_dfa.dfa_of_regexp_content_model"
      | Seq [re'] ->
	  get_dfa re'
      | Seq (re1 :: seq2) ->
	  let dfa1 = get_dfa re1 in
	  let dfa2 = get_dfa (Seq seq2) in
	  (* Merge the two graphs. The result is in dfa1.dfa_graph: *)
	  Graph.union dfa1.dfa_graph dfa2.dfa_graph;
	  (* Concatenation I: Add additional edges to the graph such
	   * that if w1 matches dfa1, and w2 matches dfa2, and w2 is not
	   * empty, w1w2 will match the merged DFAs.
	   *)
	  List.iter
	    (fun (e,v') ->
	       VertexSet.iter
		 (fun v ->
		    Graph.new_edge v e v')
		 dfa1.dfa_stops
	    )
	    (Graph.outgoing_edges dfa2.dfa_start);
	  (* Concatenation II: If the emtpy string matches dfa2, the stop
	   * nodes of dfa1 remain stop nodes.
	   *)
	  let stops =
	    if dfa2.dfa_null then
	      VertexSet.union dfa1.dfa_stops dfa2.dfa_stops
	    else
	      dfa2.dfa_stops
	  in
	  (* The resulting DFA: *)
	  { dfa_graph = dfa1.dfa_graph;
	    dfa_start = dfa1.dfa_start;
	    dfa_stops = stops;
	    dfa_null  = dfa1.dfa_null && dfa2.dfa_null;
	  }

      | Alt [] ->
	  invalid_arg "Pxp_dfa.dfa_of_regexp_content_model"
      | Alt [re'] ->
	  get_dfa re'
      | Alt alt ->
	  let dfa_alt = List.map get_dfa alt in
	  (* Merge the graphs. The result is in g: *)
	  let g = (List.hd dfa_alt).dfa_graph in
	  List.iter
	    (fun dfa ->
	       Graph.union g dfa.dfa_graph
	    )
	    (List.tl dfa_alt);
	  (* Get the new start node: *)
	  let start = Graph.new_vertex g in
	  (* Add the new edges starting at 'start': *)
	  List.iter
	    (fun dfa ->
	       List.iter
		 (fun (e, v) ->
		    Graph.new_edge start e v)
		 (Graph.outgoing_edges dfa.dfa_start)
	    )
	    dfa_alt;
	  (* If one of the old start nodes was a stop node, the new start
	   * node will be a stop node, too.
	   *)
	  let null = List.exists (fun dfa -> dfa.dfa_null) dfa_alt in
	  let stops =
	    List.fold_left
	      (fun s dfa -> VertexSet.union s dfa.dfa_stops)
	      VertexSet.empty
	      dfa_alt in
	  let stops' =
	    if null then
	      VertexSet.union stops (VertexSet.singleton start)
	    else
	      stops in
	  (* The resulting DFA: *)
	  { dfa_graph = g;
	    dfa_start = start;
	    dfa_stops = stops';
	    dfa_null  = null;
	  }

      | Optional re' ->
	  let dfa' = get_dfa re' in
	  if dfa'.dfa_null then
	    (* simple case *)
	    dfa'
	  else begin
	    (* Optimization possible: case ingoing_edges dfa_start = [] *)
	    let start = Graph.new_vertex dfa'.dfa_graph in
	    List.iter
	      (fun (e, v) ->
		 Graph.new_edge start e v)
	      (Graph.outgoing_edges dfa'.dfa_start);
	    
	    (* The resulting DFA: *)
	    { dfa_graph = dfa'.dfa_graph;
	      dfa_start = start;
	      dfa_stops = VertexSet.union dfa'.dfa_stops 
			                  (VertexSet.singleton start);
	      dfa_null  = true;
	    }
	  end

      | Repeated1 re' ->
	  let dfa' = get_dfa re' in
	  List.iter
	    (fun (e, v') ->
	       VertexSet.iter
		 (fun v ->
		    Graph.new_edge v e v')
		 dfa'.dfa_stops
	    )
	    (Graph.outgoing_edges dfa'.dfa_start);

	    (* The resulting DFA: *)
	    { dfa_graph = dfa'.dfa_graph;
	      dfa_start = dfa'.dfa_start;
	      dfa_stops = dfa'.dfa_stops;
	      dfa_null  = dfa'.dfa_null;
	    }

      | Repeated re' ->
	  get_dfa (Optional (Repeated1 re'))

  in
  try
    get_dfa re
  with
      Graph.Edge_not_unique -> raise Not_found
;;

(* ======================================================================
 * History:
 * 
 * $Log: pxp_dfa.ml,v $
 * Revision 1.3  2002/02/20 10:01:36  gerd
 * 	Simplified the representation of the DFA graphs, resulting
 * in performance improvements (but less protection against programming
 * errors)
 *
 * Revision 1.2  2001/06/27 23:34:35  gerd
 * 	Moved module StringMap to Pxp_aux.
 *
 * Revision 1.1  2000/07/23 02:16:08  gerd
 * 	Initial revision.
 *
 * 
 *)

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml