Plasma GitLab Archive
Projects Blog Knowledge

(*
 * \begin{doc}
 * \section{Higher-level IO functions}
 *
 * \subsection{Regular expressions}
 * \index{regular expressions}
 *
 * Many of the higher-level functions use regular expressions.
 * Regular expressions are defined by strings with syntax nearly identical
 * to \Cmd{awk}{1}.
 *
 * Strings may contain the following character constants.
 *
 * \begin{itemize}
 * \item \verb+\\+ : a literal backslash.
 * \item \verb+\a+ : the alert character \verb+^G+.
 * \item \verb+\b+ : the backspace character \verb+^H+.
 * \item \verb+\f+ : the formfeed character \verb+^L+.
 * \item \verb+\n+ : the newline character \verb+^J+.
 * \item \verb+\r+ : the carriage return character \verb+^M+.
 * \item \verb+\t+ : the tab character \verb+^I+.
 * \item \verb+\v+ : the vertical tab character.
 * \item \verb+\xhh...+ : the character represented by the string
 *   of hexadecimal digits \verb+h+.  All valid hexadecimal digits
 *   following the sequence are considered to be part of the sequence.
 * \item \verb+\ddd+ : the character represented by 1, 2, or 3 octal
 *   digits.
 * \end{itemize}
 *
 * Regular expressions are defined using the special characters \verb+.\^$[(){}*?++.
 *
 * \begin{itemize}
 * \item \verb+c+ : matches the literal character \verb+c+ if \verb+c+ is not
 *    a special character.
 * \item \verb+\c+ : matches the literal character \verb+c+, even if \verb+c+
 *    is a special character.
 * \item \verb+.+ : matches any character, including newline.
 * \item \verb+^+ : matches the beginning of a line.
 * \item \verb+$+ : matches the end of line.
 * \item \verb+[abc...]+ : matches any of the characters \verb+abc...+
 * \item \verb+[^abc...]+ : matches any character except \verb+abc...+
 * \item \verb+r1|r2+ : matches either \verb+r1+ or \verb+r2+.
 * \item \verb+r1r2+ : matches \verb+r1+ and then \verb+r2+.
 * \item \verb+r++ : matches one or more occurrences of \verb+r+.
 * \item \verb+r*+ : matches zero or more occurrences of \verb+r+.
 * \item \verb+r?+ : matches zero or one occurrence of \verb+r+.
 * \item \verb+(r)+ : parentheses are used for grouping; matches \verb+r+.
 * \item \verb+\(r\)+ : also defines grouping, but the expression matched
 *    within the parentheses is available to the output processor
 *    through one of the variables \verb+$1+, \verb+$2+, ...
 * \item \verb+r{n}+ : matches exactly \verb+n+ occurrences of \verb+r+.
 * \item \verb+r{n,}+ : matches \verb+n+ or more occurrences of \verb+r+.
 * \item \verb+r{n,m}+ : matches at least \verb+n+ occurrences of \verb+r+,
 *    and no more than \verb+m+ occurrences.
 * \item \verb+\y+: matches the empty string at either the beginning or
 *    end of a word.
 * \item \verb+\B+: matches the empty string within a word.
 * \item \verb+\<+: matches the empty string at the beginning of a word.
 * \item \verb+\>+: matches the empty string at the end of a word.
 * \item \verb+\w+: matches any character in a word.
 * \item \verb+\W+: matches any character that does not occur within a word.
 * \item \verb+\`+: matches the empty string at the beginning of a file.
 * \item \verb+\'+: matches the empty string at the end of a file.
 * \end{itemize}
 *
 * Character classes can be used to specify character sequences
 * abstractly.  Some of these sequences can change depending on your LOCALE.
 *
 * \begin{itemize}
 * \item \verb+[[:alnum:]]+ Alphanumeric characters.
 * \item \verb+[[:alpha:]]+ Alphabetic characters.
 * \item \verb+[[:lower:]]+ Lowercase alphabetic characters.
 * \item \verb+[[:upper:]]+ Uppercase alphabetic characters.
 * \item \verb+[[:cntrl:]]+ Control characters.
 * \item \verb+[[:digit:]]+ Numeric characters.
 * \item \verb+[[:xdigit:]]+ Numeric and hexadecimal characters.
 * \item \verb+[[:graph:]]+ Characters that are printable and visible.
 * \item \verb+[[:print:]]+ Characters that are printable, whether they are visible or not.
 * \item \verb+[[:punct:]]+ Punctuation characters.
 * \item \verb+[[:blank:]]+ Space or tab characters.
 * \item \verb+[[:space:]]+ Whitespace characters.
 * \end{itemize}
 * \end{doc}
 *
 *)


include Omake_pos.Make (struct let name = "Omake_builtin_io_fun" end)


let debug_parsing =
  Lm_debug.create_debug (**)
    { debug_name = "parsing";
      debug_description = "Debug parsing operations";
      debug_value = false
    }

(*
 * Concatenate files into a string.
 *
 * \begin{doc}
 * \fun{cat}
 *
 * \begin{verbatim}
 *     cat(files) : Sequence
 *        files : File or InChannel Sequence
 * \end{verbatim}
 *
 * The \verb+cat+ function concatenates the output from multiple files
 * and returns it as a string.
 * \end{doc}
 *)
let cat venv pos loc args =
  let pos = string_pos "cat" pos in
  match args with
    [arg] ->
    let names = Omake_value.values_of_value venv pos arg in
    let buf = Buffer.create 1024 in
    List.iter (fun name ->
      try
        let inp, close_flag = Omake_value.in_channel_of_any_value venv pos name in
        let inx = Omake_env.venv_find_channel venv pos inp in
        let rec copy () =
          let c = Lm_channel.input_char inx in
          Buffer.add_char buf c;
          copy ()
        in
        let () =
          try copy () with
            End_of_file ->
            ()
        in
        if close_flag then
          Omake_env.venv_close_channel venv pos inp
      with
        Sys_error _ ->
        let print_error buf =
          Format.fprintf buf "unable to open file: %a" Omake_value_print.pp_print_value name
        in
        raise (Omake_value_type.OmakeException (loc_pos loc pos, LazyError print_error))) names;
    Omake_value_type.ValString (Buffer.contents buf)
  | _ ->
    raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityExact 1, List.length args)))

(*
 * Grep takes some flags.
 *
 * \begin{doc}
 * \fun{grep}
 *
 * \begin{verbatim}
 *    grep(pattern) : String  # input from stdin, default options
 *       pattern : String
 *    grep(pattern, files) : String  # default options
 *       pattern : String
 *       files   : File Sequence
 *    grep(options, pattern, files) : String
 *      options : String
 *      pattern : String
 *      files   : File Sequence
 * \end{verbatim}
 *
 * The \verb+grep+ function searches for occurrences of a regular
 * expression \verb+pattern+ in a set of files, and prints lines that match.
 * This is like a highly-simplified version of \Cmd{grep}{1}.
 *
 * The options are:
 * \begin{description}
 * \item[q] If specified, the output from \verb+grep+ is not displayed.
 * \item[h] If specified, output lines will not include the filename (default, when only one input
 *          file is given).
 * \item[n] If specified, output lines include the filename (default, when more than one input file
 *          is given).
 * \item[v] If specified, search for lines without a match instead of lines with a match,
 * \end{description}
 *
 * The \verb+pattern+ is a regular expression.
 *
 * If successful (\verb+grep+ found a match), the function returns \verb+true+.
 * Otherwise, it returns \verb+false+.
 * \end{doc}
 *)
type grep_flag =
   GrepQuiet
 | GrepPrint
 | GrepNoPrint
 | GrepNoMatch

let grep_flags pos loc s =
   let len = String.length s in
   let rec collect flags i =
      if i = len then
         flags
      else
         let flag =
            match s.[i] with
               'q' ->
                  GrepQuiet
             | 'n' ->
                  GrepPrint
             | 'v' ->
                  GrepNoMatch
             | 'h' ->
                  GrepNoPrint
             | c ->
                  raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError ("illegal grep option", String.make 1 c)))
         in
            collect (flag::flags) (succ i)
   in
      collect [] 0

let grep venv pos loc args =
  let pos = string_pos "grep" pos in
  let outx = Omake_value.channel_of_var venv pos loc  Omake_var.stdout_var in
  let flags, pattern, files =
    match args with
      [pattern] ->
      Omake_value_type.ValNone, pattern, Omake_value_type.ValNone
    | [pattern; files] ->
      ValNone, pattern, files
    | [flags; pattern; files] ->
      flags, pattern, files
    | _ ->
      raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityRange (1, 3), List.length args)))
  in
  let flags = grep_flags pos loc (Omake_value.string_of_value venv pos flags) in
  let pattern = Omake_value.string_of_value venv pos pattern in
  let pattern =
    try Omake_lexer.lexer_of_string pattern
    with Failure err ->
      let msg = Lm_printf.sprintf "Mailformed regular expression '%s'" pattern in
      raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError (msg, err)))
  in
  let files = Omake_value.values_of_value venv pos files in
  let flags, files =
    match files with
      [] ->
      flags, [Omake_env.venv_find_var venv pos loc Omake_var.stdin_var]
    | [_] ->
      flags, files
    | _::_::_ ->
      (if List.mem GrepNoPrint flags then flags else GrepPrint :: flags), files
  in
  let verbose = not (List.mem GrepQuiet flags) in
  let print = List.mem GrepPrint flags in
  let matches = not (List.mem GrepNoMatch flags) in

  (* Grep against a single line *)
  let grep_line file found line =
    let b = (Omake_lexer.lexer_matches pattern line == matches) in
    if b && verbose then
      begin
        if print then
          begin
            Lm_channel.output_string outx file;
            Lm_channel.output_char outx ':'
          end;
        Lm_channel.output_string outx line;
        Lm_channel.output_char outx '\n'
      end;
    found || b
  in

  (* Open the file *)
  let grep_file found s =
    let filename = Omake_value.string_of_value venv pos s in
    let inp, close_flag = Omake_value.in_channel_of_any_value venv pos s in
    let inx = Omake_env.venv_find_channel venv pos inp in
    let rec search found =
      let text =
        try Some (Lm_channel.input_line inx) with
          End_of_file ->
          None
      in
      match text with
        Some line' ->
        search (grep_line filename found line')
      | None ->
        found
    in
    let found = search found in
    if close_flag then
      Omake_env.venv_close_channel venv pos inp;
    found
  in
  let b = List.fold_left grep_file false files in
  Lm_channel.flush outx;
  Omake_builtin_util.val_of_bool b

let builtin_grep venv pos loc args =
   let pos = string_pos "builtin-grep" pos in
   let args =
      match args with
         [arg] ->
            Omake_value.values_of_value venv pos arg
       | _ ->
            raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityExact 1, List.length args)))
   in

   (* Eat options *)
   let flags, pattern, files =
      let rec collect flags args =
         match args with
            arg :: args ->
               (match Omake_value.string_of_value venv pos arg with
                   "-q" ->
                      collect ("q" ^ flags) args
                 | "-n" ->
                      collect ("n" ^ flags) args
                 | "-v" ->
                      collect ("v" ^ flags) args
                 | "-h" ->
                      collect ("h" ^ flags) args
                 | pattern ->
                      flags, pattern, args)
          | [] ->
               raise (Omake_value_type.OmakeException (loc_pos loc pos, StringError "no pattern specified"))
      in
         collect "" args
   in
      grep venv pos loc [ValData flags; ValData pattern; ValArray files]

(*
 * \begin{doc}
 * \fun{scan}
 *
 * \begin{verbatim}
 *    scan(input-files)
 *    case string1
 *       body1
 *    case string2
 *       body2
 *    ...
 *    default
 *       bodyd
 * \end{verbatim}
 *
 * The \verb+scan+ function provides input processing in command-line form.
 * The function takes file/filename arguments.  If called with no
 * arguments, the input is taken from \verb+stdin+. If arguments are provided,
 * each specifies an \verb+InChannel+, or the name of a file for input.
 * Output is always to \verb+stdout+.
 *
 * The \verb+scan+ function operates by reading the input one line at a time,
 * and processing it according to the following algorithm.
 *
 * For each line,
 * the record is first split into fields, and
 * the fields are bound to the variables \verb+$1, $2, ...+.  The variable
 * \verb+$0+ is defined to be the entire line, and \verb+$*+ is an array
 * of all the field values.  The \verb+$(NF)+ variable is defined to be the number
 * of fields.
 *
 * Next, a case expression is selected.  If \verb+string_i+ matches the token \verb+$1+,
 * then \verb+body_i+ is evaluated.  If the body ends in an \verb+export+, the state
 * is passed to the next clause.  Otherwise the value is discarded.
 *
 * For example, here is an \verb+scan+ function that acts as a simple command processor.
 *
 * \begin{verbatim}
 *     calc() =
 *        i = 0
 *        scan(script.in)
 *        case print
 *           println($i)
 *        case inc
 *           i = $(add $i, 1)
 *           export
 *        case dec
 *           i = $(sub $i, 1)
 *           export
 *        case addconst
 *           i = $(add $i, $2)
 *           export
 *        default
 *           eprintln($"Unknown command: $1")
 * \end{verbatim}
 *
 * The \verb+scan+ function also supports several options.
 *
 * \begin{verbatim}
 *     scan(options, files)
 *     ...
 * \end{verbatim}
 *
 * \begin{description}
 * \item[A] Parse each line as an argument list, where arguments
 *    may be quoted.  For example, the following line has three words,
 *    ``\verb+ls+'', ``\verb+-l+'', ``\verb+Program Files+''.
 *
 *    \begin{verbatim}
 *        ls -l "Program Files"
 *    \end{verbatim}
 * \item[O] Parse each line using white space as the separator, using the
 *    usual \OMake{} algorithm for string parsing.  This is the default.
 * \item[x] Once each line is split, reduce each word using the
 *    hex representation.  This is the usual hex representation used
 *    in URL specifiers, so the string ``Program Files'' may be
 *    alternately represented in the form Program%20Files or
 *    Program+Files.
 * \end{description}
 *
 * Note, if you want to redirect the output to a file, the easiest way is to
 * redefine the \verb+stdout+ variable.  The \verb+stdout+ variable is scoped the
 * same way as other variables, so this definition does not affect the meaning of
 * \verb+stdout+ outside the \verb+calc+ function.
 *
 * \begin{verbatim}
 *     calc() =
 *         stdout = $(fopen script.out, w)
 *         scan(script.in)
 *            ...
 *         close($(stdout))
 * \end{verbatim}
 * \end{doc}
 *)

(*
 * Scanner options.
 *)
type parse_option =
   ParseArgs
 | ParseWords

type rewrite_option =
   RewriteHex
 | RewriteNone

let scan_options _ pos loc options s =
   let len = String.length s in
   let rec collect ((poption, roption) as options) i =
      if i = len then
         options
      else
         let options =
            match s.[i] with
               'A' ->
                  ParseArgs, roption
             | 'O' ->
                  ParseWords, roption
             | 'x' ->
                  poption, RewriteHex
             | _ ->
                  raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError ("illegal option", s)))
         in
            collect options (succ i)
   in
      collect options 0

let scan_options venv pos loc options =
   List.fold_left (scan_options venv pos loc) (ParseWords, RewriteNone) 
    (Omake_value.strings_of_value venv pos options)

(*
 * The arguments.
 *)
let scan_args venv pos loc (args : Omake_value_type.t list) =
   let pos = string_pos "scan_args" pos in
   let cases, options, files =
     match args with
       | [ValCases cases] ->
           cases, Omake_value_type.ValNone, Omake_env.venv_find_var venv pos loc Omake_var.stdin_var
       | [ValCases cases; files] ->
           cases, ValNone, files
       | [ValCases cases; options; files] ->
           cases, options, files
       | (ValBody _) :: _ ->
           raise (Omake_value_type.OmakeException (loc_pos loc pos, SyntaxError("Unexpected body (bad indentation?)")))
       | [_]
       | [_; _]
       | [_; _; _] ->
           raise (Omake_value_type.OmakeException (loc_pos loc pos, SyntaxError("No cases")))
       | _ ->
           raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityExact 1, List.length args)))
   in
   let poptions, roptions = scan_options venv pos loc options in
      cases, poptions, roptions, Omake_value.values_of_value venv pos files

(*
 * Awk the value.
 *)
let scan venv pos loc args _ =
  let pos = string_pos "scan" pos in
  let cases, token_mode, rewrite_mode, files = scan_args venv pos loc args in

  (* Get lexers for all the cases *)
  let cases, def =
    List.fold_left (fun (cases, def) (v, test, body, export) ->
      if Lm_symbol.eq v Omake_symbol.case_sym then
        let s = Omake_value.string_of_value venv pos test in
        let cases =
          Lm_symbol.SymbolTable.filter_add cases (Lm_symbol.add s) (fun b ->
            match b with
              Some _ ->
              raise (Omake_value_type.OmakeException (loc_pos loc pos, StringVarError ("duplicate case", v)))
            | None ->
              body, export)
        in
        cases, def
      else if Lm_symbol.eq v Omake_symbol.default_sym then
        match def with
          Some _ ->
          raise (Omake_value_type.OmakeException (loc_pos loc pos, StringError "duplicate default case"))
        | None ->
          cases, Some (body, export)
      else
        raise (Omake_value_type.OmakeException (loc_pos loc pos, StringVarError ("unknown case", v)))) (Lm_symbol.SymbolTable.empty, None) cases
  in

  (* Split a line into words *)
  let collect_words_argv line =
    let words =
      match token_mode with
        ParseArgs ->
        Lm_string_util.parse_args line
      | ParseWords ->
        Omake_value.strings_of_value venv pos (ValString line)
    in
    match rewrite_mode with
      RewriteHex ->
      List.map Lm_string_util.decode_hex_name words
    | RewriteNone ->
      words
  in

  (* Select a case and run it *)
  let eval_case venv words =
    let body =
      match words with
        command :: _ ->
        (try Some (Lm_symbol.SymbolTable.find cases (Lm_symbol.add command)) with
          Not_found ->
          def)
      | [] ->
        def
    in
    match body with
      Some (body, export) ->
      let venv_new, _ = Omake_eval.eval_sequence_exp venv pos body in
      Omake_env.add_exports venv venv_new pos export
    | None ->
      venv
  in

  (* Read the file a line at a time *)
  let rec line_loop venv inx =
    let text =
      try Some (Lm_channel.input_line inx) with
        End_of_file ->
        None
    in
    match text with
      Some line ->
      let words = collect_words_argv line in
      let venv = Omake_env.venv_add_match venv line words in
      let venv = eval_case venv words in
      line_loop venv inx
    | None ->
      venv
  in
  let rec file_loop venv args =
    match args with
      arg :: args ->
      let inp, close_in = Omake_value.in_channel_of_any_value venv pos arg in
      let inx = Omake_env.venv_find_channel venv pos inp in
      let venv =
        try line_loop venv inx with
          exn when close_in ->
          Omake_env.venv_close_channel venv pos inp;
          raise exn
      in
      if close_in then
        Omake_env.venv_close_channel venv pos inp;
      file_loop venv args
    | [] ->
      venv
  in
  file_loop venv files, Omake_value_type.ValNone

(*
 * \begin{doc}
 * \fun{awk}
 *
 * \begin{verbatim}
 *    awk(input-files)
 *    case pattern1:
 *       body1
 *    case pattern2:
 *       body2
 *    ...
 *    default:
 *       bodyd
 *\end{verbatim}
 *
 * or
 *
 * \begin{verbatim}
 *    awk(options, input-files)
 *    case pattern1:
 *       body1
 *    case pattern2:
 *       body2
 *    ...
 *    default:
 *       bodyd
 * \end{verbatim}
 *
 * The \verb+awk+ function provides input processing similar to \Cmd{awk}{1},
 * but more limited.  The \verb+input-files+ argument is a sequence of values,
 * each specifies an \verb+InChannel+, or the name of a file for input.
 * If called with no options and no file arguments, the input is taken from \verb+stdin+.
 * Output is always to \verb+stdout+.
 *
 * The variables \verb+RS+ and \verb+FS+ define record and field separators
 * as regular expressions.
 * The default value of \verb+RS+ is the regular expression \verb+\r|\n|\r\n+.
 * The default value of \verb+FS+ is the regular expression \verb+[ \t]++.
 *
 * The \verb+awk+ function operates by reading the input one record at a time,
 * and processing it according to the following algorithm.
 *
 * For each line,
 * the record is first split into fields using the field separator \verb+FS+, and
 * the fields are bound to the variables \verb+$1, $2, ...+.  The variable
 * \verb+$0+ is defined to be the entire line, and \verb+$*+ is an array
 * of all the field values.  The \verb+$(NF)+ variable is defined to be the number
 * of fields.
 *
 * Next, the cases are evaluated in order.
 * For each case, if the regular expression \verb+pattern_i+ matches the record \verb+$0+,
 * then \verb+body_i+ is evaluated.  If the body ends in an \verb+export+, the state
 * is passed to the next clause.  Otherwise the value is discarded.  If the regular
 * expression contains \verb+\(r\)+ expression, those expression override the
 * fields \verb+$1, $2, ...+.
 *
 * For example, here is an \verb+awk+ function to print the text between two
 * delimiters \verb+\begin{<name>}+ and \verb+\end{<name>}+, where the \verb+<name>+
 * must belong to a set passed as an argument to the \verb+filter+ function.
 *
 * \begin{verbatim}
 *     filter(names) =
 *        print = false
 *
 *        awk(Awk.in)
 *        case $"^\\end\{\([[:alpha:]]+\)\}"
 *           if $(mem $1, $(names))
 *              print = false
 *              export
 *           export
 *        default
 *           if $(print)
 *              println($0)
 *        case $"^\\begin\{\([[:alpha:]]+\)\}"
 *           print = $(mem $1, $(names))
 *           export
 * \end{verbatim}
 *
 * Note, if you want to redirect the output to a file, the easiest way is to
 * redefine the \verb+stdout+ variable.  The \verb+stdout+ variable is scoped the
 * same way as other variables, so this definition does not affect the meaning of
 * \verb+stdout+ outside the \verb+filter+ function.
 *
 * \begin{verbatim}
 *     filter(names) =
 *         stdout = $(fopen file.out, w)
 *         awk(Awk.in)
 *            ...
 *         close($(stdout))
 * \end{verbatim}
 *
 * Options.
 * \begin{description}
 * \item[b] ``Break'' when evaluating cases.  Only the first case that matches will be selected.
 * \end{description}
 *
 * The \hyperfun{break} can be used to abort the loop,
 * exiting the \verb+awk+ function immediately.
 * \end{doc}
 *)

(*
 * Evaluate all the cases that match.
 *)
let rec awk_eval_cases venv pos loc break line cases =
   match cases with
      (None, body, export) :: cases ->
         let venv_new, _ = Omake_eval.eval_sequence_exp venv pos body in
         let venv = Omake_env.add_exports venv venv_new pos export in
            if break then
               venv
            else
               awk_eval_cases venv pos loc break line cases
    | (Some lex, body, export) :: cases ->
         let channel = Lm_channel.of_string line in
         let venv, stop =
            match Omake_lexer.Lexer.search lex channel with
               Some (_, _, _, _, args) ->
                  let venv_new = Omake_env.venv_add_match_args venv args in
                  let venv_new, _ = Omake_eval.eval_sequence_exp venv_new pos body in
                  let venv = Omake_env.add_exports venv venv_new pos export in
                     venv, break
             | None ->
                  venv, false
         in
            if stop then
               venv
            else
               awk_eval_cases venv pos loc break line cases
    | [] ->
         venv

(*
 * The arguments.
 *)
let awk_args venv pos loc (args : Omake_value_type.t list) =
  let pos = string_pos "awk_args" pos in
  match args with
    | [ValCases cases] ->
        cases, [Omake_env.venv_find_var venv pos loc Omake_var.stdin_var]
    | [ValCases cases; files] ->
        cases, Omake_value.values_of_value venv pos files
    | (ValBody _) :: _ ->
        raise (Omake_value_type.OmakeException (loc_pos loc pos, SyntaxError("Unexpected body (bad indentation?)")))
    | [_]
    | [_; _] ->
        raise (Omake_value_type.OmakeException (loc_pos loc pos, SyntaxError("No cases")))
    | _ ->
        raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityRange (1, 2), List.length args)))

let awk_option_args venv pos loc (args : Omake_value_type.t list) =
  let pos = string_pos "awk_args" pos in
  match args with
    | [ValCases cases] ->
        cases, "", [Omake_env.venv_find_var venv pos loc Omake_var.stdin_var]
    | [ValCases cases; files] ->
        cases, "", Omake_value.values_of_value venv pos files
    | [ValCases cases; options; files] ->
        cases, Omake_value.string_of_value venv pos options, Omake_value.values_of_value venv pos files
    | (ValBody _) :: _ ->
        raise (Omake_value_type.OmakeException (loc_pos loc pos, SyntaxError("Unexpected body (bad indentation?)")))
    | [_]
    | [_; _]
    | [_; _; _] ->
        raise (Omake_value_type.OmakeException (loc_pos loc pos, SyntaxError("No cases")))
    | _ ->
        raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityRange (1, 3), List.length args)))

type awk_flag =
   AwkBreak

let awk_flags pos loc s =
   let len = String.length s in
   let rec collect flags i =
      if i = len then
         flags
      else
         let flag =
            match s.[i] with
               'b' ->
                  AwkBreak
             | c ->
                  raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError ("illegal awk option", String.make 1 c)))
         in
            collect (flag :: flags) (succ i)
   in
      collect [] 0

(*
 * Awk the value.
 *)
let awk venv pos loc args _ =
  let pos = string_pos "awk" pos in
  let cases, flags, files = awk_option_args venv pos loc args in
  let flags = awk_flags pos loc flags in
  let break = List.mem AwkBreak flags in

  (* Separator expressions *)
  let rs =
    try Omake_value.string_of_value venv pos (Omake_env.venv_find_var_exn venv Omake_var.rs_var) with
      Not_found ->
      "\r|\n|\r\n"
  in
  let fs =
    try Omake_value.string_of_value venv pos (Omake_env.venv_find_var_exn venv Omake_var.fs_var) with
      Not_found ->
      "[ \t]+"
  in
  let rs_lex =
    try Omake_lexer.lexer_of_string rs with
      Failure err ->
      let msg = Lm_printf.sprintf "Malformed regular expression '%s'" rs in
      raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError (msg, err)))
  in
  let fs_lex =
    try Omake_lexer.lexer_of_string fs with
      Failure err ->
      let msg = Lm_printf.sprintf "Malformed regular expression '%s'" fs in
      raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError (msg, err)))
  in

  (* Get lexers for all the cases *)
  let cases =
    List.map (fun (v, test, body, export) ->
      if Lm_symbol.eq v Omake_symbol.case_sym then
        let s = Omake_value.string_of_value venv pos test in
        let _, lex =
          try Omake_lexer.Lexer.add_clause Omake_lexer.Lexer.empty v s with
            Failure err ->
            let msg = Lm_printf.sprintf "Malformed regular expression '%s'" s in
            raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError (msg, err)))
        in
        Some lex, body, export
      else if Lm_symbol.eq v Omake_symbol.default_sym then
        None, body, export
      else
        raise (Omake_value_type.OmakeException (loc_pos loc pos, StringVarError ("unknown case", v)))) cases
  in

  (* Split a line into words *)
  let collect_words line =
    let channel = Lm_channel.of_string line in
    let rec collect words =
      match Omake_lexer.Lexer.searchto fs_lex channel with
        Omake_lexer.Lexer.LexEOF ->
        List.rev words
      | Omake_lexer.Lexer.LexSkipped (_, skipped)
      | Omake_lexer.Lexer.LexMatched (_, _, skipped, _, _) ->
        collect (skipped :: words)
    in
    collect []
  in

  (* Read the file a line at a time *)
  let rec line_loop venv inx lineno =
    match Omake_lexer.Lexer.searchto rs_lex inx with
      Omake_lexer.Lexer.LexEOF ->
      venv
    | Omake_lexer.Lexer.LexSkipped (_, line)
    | Omake_lexer.Lexer.LexMatched (_, _, line, _, _) ->
      (* Split into words *)
      let words = collect_words line in
      let venv = Omake_env.venv_add_match venv line words in
      let venv = Omake_env.venv_add_var venv Omake_var.fnr_var (ValInt lineno) in
      let venv = awk_eval_cases venv pos loc break line cases in
      line_loop venv inx (lineno + 1)
  in
  let rec file_loop venv args =
    match args with
      arg :: args ->
      let inp, close_in = Omake_value.in_channel_of_any_value venv pos arg in
      let inx = Omake_env.venv_find_channel venv pos inp in
      let venv = Omake_env.venv_add_var venv Omake_var.filename_var (ValData (Lm_channel.name inx)) in
      let venv =
        try line_loop venv inx 1 with
          exn when close_in ->
          Omake_env.venv_close_channel venv pos inp;
          raise exn
      in
      if close_in then
        Omake_env.venv_close_channel venv pos inp;
      file_loop venv args
    | [] ->
      venv
  in
  let venv =
    try file_loop venv files with
      Omake_env.Break (_, venv) ->
      venv
  in
  venv, Omake_value_type.ValNone

(*
 * \begin{doc}
 * \fun{fsubst}
 *
 * \begin{verbatim}
 *    fsubst(files)
 *    case pattern1 [options]
 *       body1
 *    case pattern2 [options]
 *       body2
 *    ...
 *    default
 *       bodyd
 * \end{verbatim}
 *
 * The \verb+fsubst+ function provides a \Cmd{sed}{1}-like substitution
 * function.  Similar to \verb+awk+, if \verb+fsubst+ is called with no
 * arguments, the input is taken from \verb+stdin+.  If arguments are provided,
 * each specifies an \verb+InChannel+, or the name of a file for input.
 *
 * The \verb+RS+ variable defines a regular expression that determines a record separator,
 * The default value of \verb+RS+ is the regular expression \verb+\r|\n|\r\n+.
 *
 * The \verb+fsubst+ function reads the file one record at a time.
 *
 * For each record, the cases are evaluated in order.  Each case defines
 * a substitution from a substring matching the \verb+pattern+ to
 * replacement text defined by the body.
 *
 * Currently, there is only one option: \verb+g+.
 * If specified, each clause specifies a global replacement,
 * and all instances of the pattern define a substitution.
 * Otherwise, the substitution is applied only once.
 *
 * Output can be redirected by redefining the \verb+stdout+ variable.
 *
 * For example, the following program replaces all occurrences of
 * an expression \verb+word.+ with its capitalized form.
 *
 * \begin{verbatim}
 *     section
 *        stdout = $(fopen Subst.out, w)
 *        fsubst(Subst.in)
 *        case $"\<\([[:alnum:]]+\)\." g
 *           value $(capitalize $1).
 *        close($(stdout))
 * \end{verbatim}
 * \end{doc}
 *)

(*
 * Substitution options.
 *)
let subst_global_opt = 1

let subst_options _ pos loc options s =
   let len = String.length s in
   let rec collect options i =
      if i = len then
         options
      else
         let flag =
            match s.[i] with
               'g' ->
                  subst_global_opt
             | _ ->
                  raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError ("illegal option", s)))
         in
            collect (options lor flag) (succ i)
   in
      collect options 0

(*
 * Sed function performs a substitution line-by-line.
 *)
let rec subst_eval_case venv pos loc buf channel lex options body =
   match Omake_lexer.Lexer.searchto lex channel with
      Omake_lexer.Lexer.LexEOF ->
         ()
    | Omake_lexer.Lexer.LexSkipped (_, skipped) ->
         Buffer.add_string buf skipped
    | Omake_lexer.Lexer.LexMatched (_, _, skipped, matched, args) ->
         let venv' = Omake_env.venv_add_match venv matched args in
         let _, result = Omake_eval.eval_sequence_exp venv' pos body in
            Buffer.add_string buf skipped;
            Buffer.add_string buf (Omake_value.string_of_value venv pos result);
            if (options land subst_global_opt) <> 0 then
               subst_eval_case venv pos loc buf channel lex options body
            else
               Lm_channel.LexerInput.lex_buffer channel buf

let subst_eval_line venv pos loc line cases =
   let buffer = Buffer.create (String.length line) in
      List.fold_left (fun line (lex, options, body) ->
            let channel = Lm_channel.of_string line in
               Buffer.clear buffer;
               subst_eval_case venv pos loc buffer channel lex options body;
               Buffer.contents buffer) line cases

let fsubst venv pos loc args _ =
   let pos = string_pos "fsubst" pos in
   let cases, files = awk_args venv pos loc args in
   let outp = Omake_value.prim_channel_of_var venv pos loc Omake_var.stdout_var in
   let outx = Omake_env.venv_find_channel venv pos outp in

   (* Record separator *)
   let rs =
      try Omake_value.string_of_value venv pos (Omake_env.venv_find_var_exn venv Omake_var.rs_var) with
         Not_found ->
            "\r|\n|\r\n"
   in
   let rs_lex =
      try Omake_lexer.lexer_of_string rs with
         Failure err ->
            let msg = Lm_printf.sprintf "Malformed regular expression '%s'" rs in
               raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError (msg, err)))
   in

   (* Get lexers for all the cases *)
   let cases =
      List.map (fun (v, test, body, _) ->
            let args = Omake_value.values_of_value venv pos test in
            let pattern, options =
               match args with
                  pattern :: options ->
                     Omake_value.string_of_value venv pos pattern, options
                | [] ->
                     "", []
            in
            let pattern, options =
               if Lm_symbol.eq v Omake_symbol.case_sym then
                  pattern, options
               else if Lm_symbol.eq v Omake_symbol.default_sym then
                  ".*", []
               else
                  raise (Omake_value_type.OmakeException (loc_pos loc pos, StringVarError ("unknown case", v)))
            in
            let options =
               List.fold_left (fun options arg ->
                     subst_options venv pos loc options (Omake_value.string_of_value venv pos arg)) 0 options
            in
            let _, lex =
               try Omake_lexer.Lexer.add_clause Omake_lexer.Lexer.empty v pattern with
                  Failure err ->
                     let msg = Lm_printf.sprintf "Malformed regular expression '%s'" pattern in
                        raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError (msg, err)))
            in
               lex, options, body) cases
   in

   (* Read the file a line at a time *)
   let rec line_loop inx =
      match Omake_lexer.Lexer.searchto rs_lex inx with
         Omake_lexer.Lexer.LexEOF ->
            ()
       | Omake_lexer.Lexer.LexSkipped (_, line) ->
            let line = subst_eval_line venv pos loc line cases in
               Lm_channel.output_string outx line
       | Omake_lexer.Lexer.LexMatched (_, _, line, term, _) ->
            let line = subst_eval_line venv pos loc line cases in
               Lm_channel.output_string outx line;
               Lm_channel.output_string outx term;
               line_loop inx
   in
   let rec file_loop files =
      match files with
         file :: files ->
            let inp, close_in = Omake_value.in_channel_of_any_value venv pos file in
            let inx = Omake_env.venv_find_channel venv pos inp in
            let () =
               try line_loop inx with
                  exn when close_in ->
                     Omake_env.venv_close_channel venv pos inp;
                     raise exn
            in
               if close_in then
                  Omake_env.venv_close_channel venv pos inp;
               file_loop files
       | [] ->
            ()
   in
   let venv =
      try file_loop files; venv with
         Omake_env.Break (_, venv) ->
            venv
   in
      Lm_channel.flush outx;
      venv, Omake_value_type.ValNone

(*
 * \begin{doc}
 * \fun{lex}
 *
 * \begin{verbatim}
 *    lex(files)
 *    case pattern1
 *       body1
 *    case pattern2
 *       body2
 *    ...
 *    default
 *       bodyd
 * \end{verbatim}
 *
 * The \verb+lex+ function provides a simple lexical-style scanner
 * function.  The input is a sequence of files or channels.  The cases
 * specify regular expressions.  Each time the input is read, the regular
 * expression that matches the \emph{longest prefix} of the input is selected,
 * and the body is evaluated.
 *
 * If two clauses both match the same input, the \emph{last} one is selected
 * for execution.  The \verb+default+ case matches the regular expression \verb+.+;
 * you probably want to place it first in the pattern list.
 *
 * If the body end with an \verb+export+ directive,
 * the state is passed to the next clause.
 *
 * For example, the following program collects all occurrences of alphanumeric
 * words in an input file.
 *
 * \begin{verbatim}
 *     collect-words(files) =
 *        words[] =
 *        lex($(files))
 *        default
 *           # empty
 *        case $"[[:alnum:]]+" g
 *           words[] += $0
 *           export
 *        value $(words)
 * \end{verbatim}
 *
 * The \verb+default+ case, if one exists, matches single characters.  Since
 *
 * It is an error if the input does not match any of the regular expressions.
 *
 * The \hyperfun{break} can be used to abort the loop.
 * \end{doc}
 *)
let eof_sym = Lm_symbol.add "eof"

let lex venv pos loc args _ =
   let pos = string_pos "lex" pos in
   let cases, files = awk_args venv pos loc args in

   (* Add a clause for EOF *)
   let _, lex = Omake_lexer.Lexer.add_clause Omake_lexer.Lexer.empty eof_sym "\\'" in

   (* Get lexers for all the cases *)
   let lex, cases, _ =
      List.fold_left (fun (lex, cases, index) (v, test, body, export) ->
            let args = Omake_value.values_of_value venv pos test in
            let pattern =
               match args with
                  pattern :: _ ->
                     Omake_value.string_of_value venv pos pattern
                | [] ->
                     ""
            in
            let pattern =
               if Lm_symbol.eq v Omake_symbol.case_sym then
                  pattern
               else if Lm_symbol.eq v Omake_symbol.default_sym then
                  "."
               else
                  raise (Omake_value_type.OmakeException (loc_pos loc pos, StringVarError ("unknown case", v)))
            in
               let action_sym = Lm_symbol.make "action" index in
               let _, lex =
                  try Omake_lexer.Lexer.add_clause lex action_sym pattern with
                     Failure err ->
                        let msg = Lm_printf.sprintf "Malformed regular expression '%s'" pattern in
                           raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError (msg, err)))
               in
               let cases = Lm_symbol.SymbolTable.add cases action_sym (body, export) in
                  lex, cases, succ index) (lex, Lm_symbol.SymbolTable.empty, 0) cases
   in

   (* Process the files *)
   let rec input_loop venv inx =
      let action_sym, lexeme_loc, lexeme, args = Omake_lexer.Lexer.lex lex inx in
         if Lm_symbol.eq action_sym eof_sym then
            venv
         else
            let venv_new = Omake_env.venv_add_match venv lexeme args in
            let venv_new = Omake_env.venv_add_var venv_new Omake_var.parse_loc_var (ValOther (ValLocation lexeme_loc)) in
            let body, export =
               try Lm_symbol.SymbolTable.find cases action_sym with
                  Not_found ->
                     raise (Invalid_argument "lex")
            in
            let venv_new, _ = Omake_eval.eval_sequence_exp venv_new pos body in
            let venv = Omake_env.add_exports venv venv_new pos export in
               input_loop venv inx
   in
   let rec file_loop venv files =
      match files with
         file :: files ->
            let inp, close_in = Omake_value.in_channel_of_any_value venv pos file in
            let inx = Omake_env.venv_find_channel venv pos inp in
            let venv =
               try input_loop venv inx with
                  (Omake_env.Break _ | Omake_value_type.Return _ ) as exn ->
                     if close_in then
                        Omake_env.venv_close_channel venv pos inp;
                     raise exn
                | exn ->
                     if close_in then
                        Omake_env.venv_close_channel venv pos inp;
                     Omake_eval.raise_uncaught_exception pos exn
            in
               if close_in then
                  Omake_env.venv_close_channel venv pos inp;
               file_loop venv files
       | [] ->
            venv
   in
   let venv =
      try file_loop venv files with
         Omake_env.Break (_, venv) ->
            venv
   in
      venv, Omake_value_type.ValNone

(*
 * \begin{doc}
 * \fun{lex-search}
 *
 * \begin{verbatim}
 *    lex-search(files)
 *    case pattern1
 *       body1
 *    case pattern2
 *       body2
 *    ...
 *    default
 *       bodyd
 * \end{verbatim}
 *
 * The \verb+lex-search+ function is like the \verb+lex+ function, but input that
 * does not match any of the regular expressions is skipped.  If the clauses include
 * a \verb+default+ case, then the \verb+default+ matches any skipped text.
 *
 * For example, the following program collects all occurrences of alphanumeric
 * words in an input file, skipping any other text.
 *
 * \begin{verbatim}
 *     collect-words($(files)) =
 *        words[] =
 *        lex-search($(files))
 *        default
 *           eprintln(Skipped $0)
 *        case $"[[:alnum:]]+" g
 *           words[] += $0
 *           export
 * \end{verbatim}
 *
 * The \verb+default+ case, if one exists, matches single characters.  Since
 *
 * It is an error if the input does not match any of the regular expressions.
 *
 * The \hyperfun{break} can be used to abort the loop.
 * \end{doc}
 *)
let lex_search venv pos loc args _ =
  let pos = string_pos "lex-search" pos in
  let cases, files = awk_args venv pos loc args in

  (* Get lexers for all the cases *)
  let lex, cases, default, _ =
    List.fold_left (fun (lex, cases, default, index) (v, test, body, export) ->
      let args = Omake_value.values_of_value venv pos test in
      let pattern =
        match args with
          pattern :: _ ->
          Omake_value.string_of_value venv pos pattern
        | [] ->
          ""
      in
      if Lm_symbol.eq v Omake_symbol.case_sym then
        let action_sym = Lm_symbol.make "action" index in
        let _, lex =
          try Omake_lexer.Lexer.add_clause lex action_sym pattern with
            Failure err ->
            let msg = Lm_printf.sprintf "Malformed regular expression '%s'" pattern in
            raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError (msg, err)))
        in
        let cases = Lm_symbol.SymbolTable.add cases action_sym (body, export) in
        lex, cases, default, succ index
      else if Lm_symbol.eq v Omake_symbol.default_sym then
        lex, cases, Some (body, export), index
      else
        raise (Omake_value_type.OmakeException (loc_pos loc pos, StringVarError ("unknown case", v)))) (**)
      (Omake_lexer.Lexer.empty, Lm_symbol.SymbolTable.empty, None, 0) cases
  in

  (* What to do for skipped text *)
  let skip venv lexeme_loc lexeme =
    match lexeme, default with
      "", _
    | _, None ->
      venv
    | _, Some (body, export) ->
      let venv_new = Omake_env.venv_add_match venv lexeme [] in
      let venv_new = Omake_env.venv_add_var venv_new Omake_var.parse_loc_var (ValOther (ValLocation lexeme_loc)) in
      let venv_new, _ = Omake_eval.eval_sequence_exp venv_new pos body in
      Omake_env.add_exports venv venv_new pos export
  in

  (* Process the files *)
  let rec input_loop venv inx =
    match Omake_lexer.Lexer.searchto lex inx with
      Omake_lexer.Lexer.LexEOF ->
      venv
    | Omake_lexer.Lexer.LexSkipped (_, lexeme) ->
      skip venv loc lexeme
    | Omake_lexer.Lexer.LexMatched (action_sym, lexeme_loc, skipped, lexeme, args) ->
      (* Process skipped text *)
      let venv = skip venv lexeme_loc skipped in

      (* Process the matched text *)
      let venv_new = Omake_env.venv_add_match venv lexeme args in
      let venv_new = Omake_env.venv_add_var venv_new Omake_var.parse_loc_var (ValOther (ValLocation lexeme_loc)) in
      let body, export =
        try Lm_symbol.SymbolTable.find cases action_sym with
          Not_found ->
          raise (Invalid_argument "lex")
      in
      let venv_new, _ = Omake_eval.eval_sequence_exp venv_new pos body in
      let venv = Omake_env.add_exports venv venv_new pos export in
      input_loop venv inx
  in

  (* Process each file *)
  let rec file_loop venv files =
    match files with
      file :: files ->
      let inp, close_in = Omake_value.in_channel_of_any_value venv pos file in
      let inx = Omake_env.venv_find_channel venv pos inp in
      let venv =
        try input_loop venv inx with
          (Omake_env.Break _ | Omake_value_type.Return _) as exn ->
          if close_in then
            Omake_env.venv_close_channel venv pos inp;
          raise exn
        | exn ->
          if close_in then
            Omake_env.venv_close_channel venv pos inp;
          Omake_eval.raise_uncaught_exception pos exn
      in
      if close_in then
        Omake_env.venv_close_channel venv pos inp;
      file_loop venv files
    | [] ->
      venv
  in
  let venv =
    try file_loop venv files with
      Omake_env.Break (_, venv) ->
      venv
  in
  venv, Omake_value_type.ValNone

(*
 * \begin{doc}
 * \obj{Lexer}
 *
 * The \verb+Omake_lexer.Lexer+ object defines a facility for lexical analysis, similar to the
 * \Cmd{lex}{1} and \Cmd{flex}{1} programs.
 *
 * In \Prog{omake}, lexical analyzers can be constructed dynamically by extending
 * the \verb+Omake_lexer.Lexer+ class.  A lexer definition consists of a set of directives specified
 * with method calls,  and set of clauses specified as rules.
 *
 * For example, consider the following lexer definition, which is intended
 * for lexical analysis of simple arithmetic expressions for a desktop
 * calculator.
 *
 * \begin{verbatim}
 *    lexer1. =
 *       extends $(Omake_lexer.Lexer)
 *
 *       other: .
 *          eprintln(Illegal character: $* )
 *          lex()
 *
 *       white: $"[[:space:]]+"
 *          lex()
 *
 *       op: $"[-+*/()]"
 *          switch $*
 *          case +
 *             Token.unit($(loc), plus)
 *          case -
 *             Token.unit($(loc), minus)
 *          case *
 *             Token.unit($(loc), mul)
 *          case /
 *             Token.unit($(loc), div)
 *          case $"("
 *             Token.unit($(loc), lparen)
 *          case $")"
 *             Token.unit($(loc), rparen)
 *
 *       number: $"[[:digit:]]+"
 *          Token.pair($(loc), exp, $(int $* ))
 *
 *       eof: $"\'"
 *          Token.unit($(loc), eof)
 * \end{verbatim}
 *
 * This program defines an object \verb+lexer1+ the extends the \verb+Omake_lexer.Lexer+
 * object, which defines lexing environment.
 *
 * The remainder of the definition consists of a set of clauses,
 * each with a method name before the colon; a regular expression
 * after the colon; and in this case, a body.  The body is optional,
 * if it is not specified, the method with the given name should
 * already exist in the lexer definition.
 *
 * \emph{NB} The clause that matches the \emph{longest} prefix of the input
 * is selected.  If two clauses match the same input prefix, then the \emph{last}
 * one is selected.  This is unlike most standard lexers, but makes more sense
 * for extensible grammars.
 *
 * The first clause matches any input that is not matched by the other clauses.
 * In this case, an error message is printed for any unknown character, and
 * the input is skipped.  Note that this clause is selected only if no other
 * clause matches.
 *
 * The second clause is responsible for ignoring white space.
 * If whitespace is found, it is ignored, and the lexer is called
 * recursively.
 *
 * The third clause is responsible for the arithmetic operators.
 * It makes use of the \verb+Token+ object, which defines three
 * fields: a \verb+loc+ field that represents the source location;
 * a \verb+name+; and a \verb+value+.
 *
 * The lexer defines the \verb+loc+ variable to be the location
 * of the current lexeme in each of the method bodies, so we can use
 * that value to create the tokens.
 *
 * The \verb+Token.unit($(loc), name)+
 * method constructs a new \verb+Token+ object with the given name,
 * and a default value.
 *
 * The \verb+number+ clause matches nonnegative integer constants.
 * The \verb+Token.pair($(loc), name, value)+ constructs a token with the
 * given name and value.
 *
 * \verb+Omake_lexer.Lexer+ object operate on \verb+InChannel+ objects.
 * The method \verb+lexer1.lex-channel(channel)+ reads the next
 * token from the channel argument.
 *
 * \subsection{Omake\textunderscore{}lexer.Lexer matching}
 *
 * During lexical analysis, clauses are selected by longest match.
 * That is, the clause that matches the longest sequence of input
 * characters is chosen for evaluation.  If no clause matches, the
 * lexer raises a \verb+RuntimeException+.  If more than one clause
 * matches the same amount of input, the first one is chosen
 * for evaluation.
 *
 * \subsection{Extending lexer definitions}
 *
 * Suppose we wish to augment the lexer example so that it ignores
 * comments.  We will define comments as any text that begins with
 * the string \verb+(*+, ends with \verb+*)+, and comments may
 * be nested.
 *
 * One convenient way to do this is to define a separate lexer
 * just to skip comments.
 *
 * \begin{verbatim}
 *    lex-comment. =
 *       extends $(Omake_lexer.Lexer)
 *
 *       level = 0
 *
 *       other: .
 *          lex()
 *
 *       term: $"[*][)]"
 *          if $(not $(eq $(level), 0))
 *             level = $(sub $(level), 1)
 *             lex()
 *
 *       next: $"[(][*]"
 *          level = $(add $(level), 1)
 *          lex()
 *
 *       eof: $"\'"
 *          eprintln(Unterminated comment)
 * \end{verbatim}
 *
 * This lexer contains a field \verb+level+ that keeps track of the nesting
 * level.  On encountering a \verb+(*+ string, it increments the level,
 * and for \verb+*)+, it decrements the level if nonzero, and continues.
 *
 * Next, we need to modify our previous lexer to skip comments.
 * We can do this by extending the lexer object \verb+lexer1+
 * that we just created.
 *
 * \begin{verbatim}
 *    lexer1. +=
 *       comment: $"[(][*]"
 *          lex-comment.lex-channel($(channel))
 *          lex()
 * \end{verbatim}
 *
 * The body for the comment clause calls the \verb+lex-comment+ lexer when
 * a comment is encountered, and continues lexing when that lexer returns.
 *
 * \subsection{Threading the lexer object}
 *
 * Clause bodies may also end with an \verb+export+ directive.  In this case
 * the lexer object itself is used as the returned token.  If used with
 * the \verb+Parser+ object below, the lexer should define the \verb+loc+, \verb+name+
 * and \verb+value+ fields in each \verb+export+ clause.  Each time
 * the \verb+Parser+ calls the lexer, it calls it with the lexer returned
 * from the previous lex invocation.
 * \end{doc}
 *)

(*
 * Add a lexer clause.
 *)
let lex_rule venv pos loc (args : Omake_value_type.t list) kargs =
  let pos = string_pos "lex-rule" pos in
  match args, kargs with
    [_; action; _; pattern; _; ValBody (_, [], [], body, export)], [] ->
    let lexer = Omake_value.current_lexer venv pos in
    let action_name = Omake_value.string_of_value venv pos action in
    let action_sym = Lm_symbol.add action_name in
    let pattern = Omake_value.string_of_value venv pos pattern in
    let _, lexer =
      try Omake_lexer.Lexer.add_clause lexer action_sym pattern with
        Failure err ->
        let msg = Lm_printf.sprintf "Malformed regular expression '%s'" pattern in
        raise (Omake_value_type.OmakeException (loc_pos loc pos, StringStringError (msg, err)))
    in

    (* Add the method *)
    let action_var = Omake_ir.VarThis (loc, action_sym) in
    let venv = Omake_env.venv_add_var venv action_var (ValFun (Omake_env.venv_get_env venv, [], [], body, export)) in
    let venv = Omake_env.venv_add_var venv Omake_var.builtin_field_var (ValOther (ValLexer lexer)) in
    venv, Omake_value_type.ValNone

  | _ ->
    raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityExact 6, List.length args)))

(*
 * Perform the lexing.
 *)
let lex_engine venv pos loc args kargs =
  let pos = string_pos "lex" pos in
  match args, kargs with
    [arg], [] ->
    let lexer = Omake_value.current_lexer venv pos in
    let inp, close_flag = Omake_value.in_channel_of_any_value venv pos arg in
    let inx = Omake_env.venv_find_channel venv pos inp in
    let action, lexeme_loc, lexeme, args =
      try Omake_lexer.Lexer.lex lexer inx with
        Failure _ as exn ->
        let loc = Lm_channel.loc inx in
        let pos = loc_pos loc pos in
        if close_flag then
          Omake_env.venv_close_channel venv pos inp;
        raise (Omake_value_type.UncaughtException (pos, exn))
    in
    let () =
      if close_flag then
        Omake_env.venv_close_channel venv pos inp
    in
    let venv = Omake_env.venv_add_match venv lexeme args in
    let venv = Omake_env.venv_add_var venv Omake_var.parse_loc_var (ValOther (ValLocation lexeme_loc)) in
    let action = Omake_env.venv_find_var venv pos loc (Omake_ir.VarThis (loc, action)) in
    Omake_eval.eval_apply venv pos loc action [] []
  | _ ->
    raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityExact 1, List.length args)))

(*
 * \begin{doc}
 * \obj{Parser}
 *
 * The \verb+Parser+ object provides a facility for syntactic analysis based
 * on context-free grammars.
 *
 * \verb+Parser+ objects are specified as a sequence of directives,
 * specified with method calls; and productions, specified as rules.
 *
 * For example, let's finish building the desktop calculator started
 * in the \verb+Lexer+ example.
 *
 * \begin{verbatim}
 *    parser1. =
 *       extends $(Parser)
 *
 *       #
 *       # Use the main lexer
 *       #
 *       lexer = $(lexer1)
 *
 *       #
 *       # Precedences, in ascending order
 *       #
 *       left(plus minus)
 *       left(mul div)
 *       right(uminus)
 *
 *       #
 *       # A program
 *       #
 *       start(prog)
 *
 *       prog: exp eof
 *          return $1
 *
 *       #
 *       # Simple arithmetic expressions
 *       #
 *       exp: minus exp :prec: uminus
 *          neg($2)
 *
 *       exp: exp plus exp
 *          add($1, $3)
 *
 *       exp: exp minus exp
 *          sub($1, $3)
 *
 *       exp: exp mul exp
 *          mul($1, $3)
 *
 *       exp: exp div exp
 *          div($1, $3)
 *
 *       exp: lparen exp rparen
 *          return $2
 * \end{verbatim}
 *
 * Parsers are defined as extensions of the \verb+Parser+ class.
 * A \verb+Parser+ object must have a \verb+lexer+ field.  The \verb+lexer+
 * is not required to be a \verb+Lexer+ object, but it must provide
 * a \verb+lexer.lex()+ method that returns a token object with
 * \verb+name+ and \verb+value+ fields.  For this example, we use the
 * \verb+lexer1+ object that we defined previously.
 *
 * The next step is to define precedences for the terminal symbols.
 * The precedences are defined with the \verb+left+, \verb+right+,
 * and \verb+nonassoc+ methods in order of increasing precedence.
 *
 * The grammar must have at least one start symbol, declared with
 * the \verb+start+ method.
 *
 * Next, the productions in the grammar are listed as rules.
 * The name of the production is listed before the colon, and
 * a sequence of variables is listed to the right of the colon.
 * The body is a semantic action to be evaluated when the production
 * is recognized as part of the input.
 *
 * In this example, these are the productions for the arithmetic
 * expressions recognized by the desktop calculator.  The semantic
 * action performs the calculation.  The variables \verb+$1, $2, ...+
 * correspond to the values associated with each of the variables
 * on the right-hand-side of the production.
 *
 * \subsection{Calling the parser}
 *
 * The parser is called with the \verb+$(parser1.parse-channel start, channel)+
 * or \verb+$(parser1.parse-file start, file)+ functions.  The \verb+start+
 * argument is the start symbol, and the \verb+channel+ or \verb+file+
 * is the input to the parser.
 *
 * \subsection{Parsing control}
 *
 * The parser generator generates a pushdown automation based on LALR(1)
 * tables.  As usual, if the grammar is ambiguous, this may generate shift/reduce
 * or reduce/reduce conflicts.  These conflicts are printed to standard
 * output when the automaton is generated.
 *
 * By default, the automaton is not constructed until the parser is
 * first used.
 *
 * The \verb+build(debug)+ method forces the construction of the automaton.
 * While not required, it is wise to finish each complete parser with
 * a call to the \verb+build(debug)+ method.  If the \verb+debug+ variable
 * is set, this also prints with parser table together with any conflicts.
 *
 * The \verb+loc+ variable is defined within action bodies, and represents
 * the input range for all tokens on the right-hand-side of the production.
 *
 * \subsection{Extending parsers}
 *
 * Parsers may also be extended by inheritance.
 * For example, let's extend the grammar so that it also recognizes
 * the \verb+<<+ and \verb+>>+ shift operations.
 *
 * First, we extend the lexer so that it recognizes these tokens.
 * This time, we choose to leave \verb+lexer1+ intact, instead of
 * using the += operator.
 *
 * \begin{verbatim}
 *    lexer2. =
 *       extends $(lexer1)
 *
 *       lsl: $"<<"
 *          Token.unit($(loc), lsl)
 *
 *       asr: $">>"
 *          Token.unit($(loc), asr)
 * \end{verbatim}
 *
 * Next, we extend the parser to handle these new operators.
 * We intend that the bitwise operators have lower precedence
 * than the other arithmetic operators.  The two-argument form
 * of the \verb+left+ method accomplishes this.
 *
 * \begin{verbatim}
 *    parser2. =
 *       extends $(parser1)
 *
 *       left(plus, lsl lsr asr)
 *
 *       lexer = $(lexer2)
 *
 *       exp: exp lsl exp
 *          lsl($1, $3)
 *
 *       exp: exp asr exp
 *          asr($1, $3)
 * \end{verbatim}
 *
 * In this case, we use the new lexer \verb+lexer2+, and we add productions
 * for the new shift operations.
 * \end{doc}
 *)

(*
 * Add start symbols.
 *)
let parse_start venv pos loc args kargs =
  let pos = string_pos "parse-start" pos in
  let parse = Omake_value.current_parser venv pos in
  let args =
    match args, kargs with
      [arg], [] ->
      Omake_value.strings_of_value venv pos arg
    | _ ->
      raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityExact 1, List.length args)))
  in
  let parse =
    List.fold_left (fun parse s ->
      Omake_parser.Parser.add_start parse (Lm_symbol.add s)) parse args in
  (* Redefine the parser *)
  let venv = Omake_env.venv_add_var venv Omake_var.builtin_field_var (ValOther (ValParser parse)) in
  venv, Omake_value_type.ValNone

(*
 * Precedence operations.
 *)
let parse_prec venv pos loc args kargs assoc =
  let pos = string_pos "parse-prec" pos in
  let this = Omake_env.venv_this venv in
  let parse = Omake_value.current_parser venv pos in
  let parse, level, args =
    match args, kargs with
      [before; args], [] ->
      let current_prec = Lm_symbol.add (Omake_value.string_of_value venv pos before) in
      let level =
        try Omake_parser.Parser.find_prec parse current_prec with
          Not_found ->
          raise (Omake_value_type.OmakeException (loc_pos loc pos, StringVarError ("no such precedence", current_prec)))
      in
      let parse, level = Omake_parser.Parser.create_prec_lt parse level assoc in
      parse, level, args
    | [args], [] ->
      let current_prec = Lm_symbol.add (Omake_value.string_of_value venv pos (Omake_env.venv_find_field_internal this pos Omake_symbol.current_prec_sym)) in
      let level =
        try Omake_parser.Parser.find_prec parse current_prec with
          Not_found ->
          raise (Omake_value_type.OmakeException (loc_pos loc pos, StringVarError ("current precedence is not found", current_prec)))
      in
      let parse, level = Omake_parser.Parser.create_prec_gt parse level assoc in
      parse, level, args
    | _ ->
      raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityRange (1, 2), List.length args)))
  in
  let args = Omake_value.strings_of_value venv pos args in
  let parse =
    List.fold_left (fun parse s ->
      Omake_parser.Parser.add_prec parse level (Lm_symbol.add s)) parse args
  in

  (* Reset the current precedence *)
  let venv =
    match args with
      arg :: _ ->
      Omake_env.venv_add_var venv Omake_var.current_prec_field_var (ValString arg)
    | [] ->
      venv
  in

  (* Redefine the parser *)
  let venv = Omake_env.venv_add_var venv Omake_var.builtin_field_var (ValOther (ValParser parse)) in
  venv, Omake_value_type.ValNone

let parse_left venv pos loc args kargs =
   let pos = string_pos "parse-left" pos in
      parse_prec venv pos loc args kargs LeftAssoc

let parse_right venv pos loc args kargs =
   let pos = string_pos "parse-right" pos in
      parse_prec venv pos loc args kargs RightAssoc

let parse_nonassoc venv pos loc args kargs =
   let pos = string_pos "parse-nonassoc" pos in
      parse_prec venv pos loc args kargs NonAssoc

(*
 * Build the parser.
 *)
let parse_build venv pos loc args =
  let pos = string_pos "parse-build" pos in
  match args with
    [arg] ->
    let par = Omake_value.current_parser venv pos in
    let debug = Omake_value.bool_of_value venv pos arg in
    Omake_parser.Parser.build par debug;
    Omake_value_type.ValNone
  | _ ->
    raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityExact 1, List.length args)))

(*
 * Get the precedence option.
 *)
let prec_option venv pos _ options =
   Omake_env.venv_map_fold (fun _ optname optval ->
         let s = Omake_value.string_of_value venv pos optname in
            if s = ":prec:" then
               Some (Lm_symbol.add (Omake_value.string_of_value venv pos optval))
            else
               raise (Omake_value_type.OmakeException (pos, StringValueError ("illegal option", optname)))) None options

(*
 * Compute an action name that is not defined in the current object.
 *)
let action_sym = Lm_symbol.add "action"

let find_action_name venv loc =
   Lm_symbol.new_name action_sym (fun v -> Omake_env.venv_defined venv (Omake_ir.VarThis (loc, v)))

(*
 * Add a parser clause.
 *)
let parse_rule venv pos loc (args : Omake_value_type.t list) kargs =
  let pos = string_pos "parse-rule" pos in
  let action, head, rhs, options, body, export =
    match args, kargs with
      [_; action; head; rhs; ValMap options; ValBody (_, [], [], body, export)], [] ->
      let action = Omake_value.string_of_value venv pos action in
      let head = Omake_value.string_of_value venv pos head in
      if head = "" then   (* Action name was omitted *)
        find_action_name venv loc, Lm_symbol.add action, rhs, options, body, export
      else
        Lm_symbol.add action, Lm_symbol.add head, rhs, options, body, export
    | _ ->
      raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityExact 6, List.length args)))
  in
  let par = Omake_value.current_parser venv pos in
  let rhs = List.map Lm_symbol.add (Omake_value.strings_of_value venv pos rhs) in
  let pre = prec_option venv pos loc options in
  let par = Omake_parser.Parser.add_production par action head rhs pre in

  (* Add the method if there is a body *)
  let venv =
    match body with
      _ :: _ ->
      let body : Omake_ir.exp list =
        LetVarExp (loc, Omake_ir.VarThis (loc, Omake_symbol.val_sym), [], VarDefNormal, ConstString (loc, "")) :: body
      in
      Omake_env.venv_add_var venv (Omake_ir.VarThis (loc, action)) (ValFun (Omake_env.venv_get_env venv, [], [], body, export))
    | [] ->
      venv
  in

  (* Add back the parser *)
  let venv = Omake_env.venv_add_var venv Omake_var.builtin_field_var (ValOther (ValParser par)) in
  venv, Omake_value_type.ValNone

(*
 * Perform the lexing.
 *)
let parse_engine venv pos loc args =
  let pos = string_pos "parse-engine" pos in
  match args with
    [start] ->
    let dfa = Omake_value.current_parser venv pos in
    let start = Lm_symbol.add (Omake_value.string_of_value venv pos start) in
    let lexer = Omake_env.venv_find_var venv pos loc Omake_var.lexer_field_var in
    let lexer = Omake_eval.eval_object venv pos lexer in
    let parser_obj = Omake_env.venv_this venv in
    let lex (venv, parser_obj, lexer) =
      let lex = Omake_env.venv_find_field_internal lexer pos Omake_symbol.lex_sym in
      let venv = Omake_env.venv_with_object venv lexer in
      let venv, result = Omake_eval.eval_apply venv pos loc lex [] [] in
      let obj = Omake_eval.eval_object venv pos result in
      try
        let lex_loc = Omake_env.venv_find_field_internal_exn obj Omake_symbol.loc_sym in
        let lex_loc = Omake_value.loc_of_value venv pos lex_loc in
        let name = Omake_env.venv_find_field_internal_exn obj Omake_symbol.name_sym in
        let name = Lm_symbol.add (Omake_value.string_of_value venv pos name) in
        let value = Omake_env.venv_find_field_internal_exn obj Omake_symbol.val_sym in
        name, lex_loc, (venv, parser_obj, lexer), value
      with
        Not_found ->
        let print_error buf =
          Format.fprintf buf "@[<v 3>The lexer returned a malformed object.\
                              @ @[<v 3>The result of a lexer action should be an object with at least 3 fields:\
                              @ loc: the location of the token\
                              @ name: the name of the token\
                              @ val: the value of the token@]\
                              @ %a@]" Omake_value_print.pp_print_value (ValObject obj)
        in
        raise (Omake_value_type.OmakeException (pos, LazyError print_error))
    in
    let eval (venv, parser_obj, lexer) action loc args =
      let pos = loc_pos loc pos in
      let venv = Omake_env.venv_add_match_values venv args in
      let action = Omake_env.venv_find_field_internal parser_obj pos action in
      let venv = Omake_env.venv_with_object venv parser_obj in
      let venv = Omake_env.venv_add_var venv Omake_var.parse_loc_var (ValOther (ValLocation loc)) in
      let venv, result = Omake_eval.eval_apply venv pos loc action [] [] in
      (venv, parser_obj, lexer), result
    in
    let _, value =
      try Omake_parser.Parser.parse dfa start lex eval (venv, parser_obj, lexer) with
        Failure _ as exn ->
        raise (Omake_value_type.UncaughtException (pos, exn))
      | Lm_parser.ParseError (loc, s) ->
        raise (Omake_value_type.OmakeException (loc_pos loc pos, StringError s))
    in
    value
  | _ ->
    raise (Omake_value_type.OmakeException (loc_pos loc pos, ArityMismatch (ArityExact 2, List.length args)))

(************************************************************************
 * External interface.
*)
let () =
  let builtin_funs =
    [true, "grep",                  grep,                 Omake_ir.ArityRange (1, 3);
     true, "builtin-grep",          builtin_grep,         ArityExact 1;
     true, "cat",                   cat,                  ArityExact 1;
     true, "parse-engine",          parse_engine,         ArityExact 1;
     true, "parse-build",           parse_build,          ArityExact 1;
    ]
  in
  let builtin_kfuns =
    [true, "lex-rule",              lex_rule,             Omake_ir.ArityRange (3, 4);
     true, "lex-engine",            lex_engine,           ArityExact 1;
     true, "parse-rule",            parse_rule,           ArityRange (3, 5);
     true, "parse-start",           parse_start,          ArityExact 1;
     true, "parse-left",            parse_left,           ArityExact 1;
     true, "parse-right",           parse_right,          ArityExact 1;
     true, "parse-nonassoc",        parse_nonassoc,       ArityExact 1;
     true, "scan",                  scan,                 ArityRange (1, 3);
     true, "awk",                   awk,                  ArityExact 3;
     true, "fsubst",                fsubst,               ArityExact 3;
     true, "lex",                   lex,                  ArityExact 3;
     true, "lex-search",            lex_search,           ArityExact 3;
    ]
  in
  let builtin_info =
    {Omake_builtin_type.builtin_empty with builtin_funs = builtin_funs;
      builtin_kfuns = builtin_kfuns
    }
  in
  Omake_builtin.register_builtin builtin_info

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml