Plasma GitLab Archive
Projects Blog Knowledge

(* $Id$ *)
{
  
open Printf
open Lexing

open Cppo_types
open Cppo_parser

let pos1 lexbuf = lexbuf.lex_start_p
let pos2 lexbuf = lexbuf.lex_curr_p
let loc lexbuf = (pos1 lexbuf, pos2 lexbuf)

let lexer_error lexbuf descr =
  error (loc lexbuf) descr

let new_file lb name =
  lb.lex_curr_p <- { lb.lex_curr_p with pos_fname = name }

let lex_new_lines lb =
  let n = ref 0 in
  let s = lb.lex_buffer in
  for i = lb.lex_start_pos to lb.lex_curr_pos do
    if s.[i] = '\n' then
      incr n
  done;
  let p = lb.lex_curr_p in
  lb.lex_curr_p <- 
    { p with
	pos_lnum = p.pos_lnum + !n;
	pos_bol = p.pos_cnum
    }

let count_new_lines lb n =
  let p = lb.lex_curr_p in
  lb.lex_curr_p <- 
    { p with
	pos_lnum = p.pos_lnum + n;
	pos_bol = p.pos_cnum
    }

(* must start a new line *)
let update_pos lb p added_chars added_breaks =
  let cnum = p.pos_cnum + added_chars in
  lb.lex_curr_p <-
    { pos_fname = p.pos_fname;
      pos_lnum = p.pos_lnum + added_breaks;
      pos_bol = cnum;
      pos_cnum = cnum }

let set_lnum lb opt_file lnum =
  let p = lb.lex_curr_p in
  let cnum = p.pos_cnum in
  let fname =
    match opt_file with
	None -> p.pos_fname
      | Some file -> file
  in
  lb.lex_curr_p <-
    { pos_fname = fname;
      pos_bol = cnum;
      pos_cnum = cnum;
      pos_lnum = lnum }
	
let shift lb n =
  let p = lb.lex_curr_p in
  lb.lex_curr_p <- { p with pos_cnum = p.pos_cnum + n }

let read_hexdigit c =
  match c with
      '0'..'9' -> Char.code c - 48
    | 'A'..'F' -> Char.code c - 55
    | 'a'..'z' -> Char.code c - 87
    | _ -> invalid_arg "read_hexdigit"

let read_hex2 c1 c2 =
  Char.chr (read_hexdigit c1 * 16 + read_hexdigit c2)

type env = {
  preserve_quotations : bool;
  mutable lexer : [ `Ocaml | `Test ];
  mutable line_start : bool;
  mutable in_directive : bool; (* true while processing a directive, until the
				  final newline *)
  buf : Buffer.t;
  mutable token_start : Lexing.position;
  lexbuf : Lexing.lexbuf;
}

let new_line env =
  env.line_start <- true;
  count_new_lines env.lexbuf 1

let clear env = Buffer.clear env.buf

let add env s =
  env.line_start <- false;
  Buffer.add_string env.buf s

let add_char env c =
  env.line_start <- false;
  Buffer.add_char env.buf c

let get env = Buffer.contents env.buf

let long_loc e = (e.token_start, pos2 e.lexbuf)
}

(* standard character classes used for macro identifiers *)
let upper = ['A'-'Z']
let lower = ['a'-'z']
let digit = ['0'-'9']

let identchar = upper | lower | digit | [ '_' '\'' ]


(* iso-8859-1 upper and lower characters used for ocaml identifiers *)
let oc_upper = ['A'-'Z' '\192'-'\214' '\216'-'\222']
let oc_lower = ['a'-'z' '\223'-'\246' '\248'-'\255']
let oc_identchar = oc_upper | oc_lower | digit | ['_' '\'']

(*
  Identifiers: ident is used for macro names and is a subset of oc_ident
*)
let ident = (lower | '_' identchar | upper) identchar*
let oc_ident = (oc_lower | '_' oc_identchar | oc_upper) oc_identchar*



let hex = ['0'-'9' 'a'-'f' 'A'-'F']
let oct = ['0'-'7']
let bin = ['0'-'1']

let operator_char = 
  [ '!' '$' '%' '&' '*' '+' '-' '.' '/' ':' '<' '=' '>' '?' '@' '^' '|' '~']
let infix_symbol =
  ['=' '<' '>' '@' '^' '|' '&' '+' '-' '*' '/' '$' '%'] operator_char*
let prefix_symbol = ['!' '?' '~'] operator_char*

let blank = [ ' ' '\t' ]
let space = [ ' ' '\t' '\r' '\n' ]

let line = ( [^'\n'] | '\\' ('\r'? '\n') )* ('\n' | eof)

let dblank0 = (blank | '\\' '\r'? '\n')*
let dblank1 = blank (blank | '\\' '\r'? '\n')*

rule token e = parse
    ""
      {
	(*
	  We use two different lexers for boolean expressions in #if directives
	  and for regular OCaml tokens.
	*)
	match e.lexer with
	    `Ocaml -> ocaml_token e lexbuf
	  | `Test -> test_token e lexbuf
      }

and line e = parse
    blank* "#" as s
        { 
	  match e.lexer with
	      `Test -> lexer_error lexbuf "Syntax error in boolean expression"
	    | `Ocaml ->
		if e.line_start then (
		  e.in_directive <- true;
		  clear e;
		  add e s;
		  e.token_start <- pos1 lexbuf;
		  e.line_start <- false;
		  directive e lexbuf
		)
		else (
		  e.line_start <- false;
		  clear e;
		  TEXT (loc lexbuf, false, s)
		)
	}

  | ""  { clear e;
	  token e lexbuf }

and directive e = parse
    blank* "define" dblank1 (ident as id) "(" 
      { DEFUN (long_loc e, id) }

  | blank* "define" dblank1 (ident as id)
      { assert e.in_directive;
	DEF (long_loc e, id) }

  | blank* "undef" dblank1 (ident as id)
      { blank_until_eol e lexbuf;
	UNDEF (long_loc e, id) }

  | blank* "if" dblank1    { e.lexer <- `Test;
			     IF (long_loc e) }
  | blank* "elif" dblank1  { e.lexer <- `Test;
			     ELIF (long_loc e) }

  | blank* "ifdef" dblank1 (ident as id)
      { blank_until_eol e lexbuf;
	IFDEF (long_loc e, `Defined id) }

  | blank* "ifndef" dblank1 (ident as id)
      { blank_until_eol e lexbuf;
	IFDEF (long_loc e, `Not (`Defined id)) }

  | blank* "ext" dblank1 (ident as id)
      { blank_until_eol e lexbuf;
        clear e;
        let s = read_ext e lexbuf in
        EXT (long_loc e, id, s) }

  | blank* "define" dblank1 oc_ident
  | blank* "undef" dblank1 oc_ident
  | blank* "ifdef" dblank1 oc_ident
  | blank* "ifndef" dblank1 oc_ident
  | blank* "ext" dblank1 oc_ident
      { error (loc lexbuf)
	  "Identifiers containing non-ASCII characters \
           may not be used as macro identifiers" }

  | blank* "else"
      { blank_until_eol e lexbuf;
	ELSE (long_loc e) }

  | blank* "endif"
      { blank_until_eol e lexbuf;
	ENDIF (long_loc e) }

  | blank* "include" dblank0 '"'
      { clear e;
	eval_string e lexbuf;
	blank_until_eol e lexbuf;
	INCLUDE (long_loc e, get e) }
  
  | blank* "error" dblank0 '"'
      { clear e;
	eval_string e lexbuf;
	blank_until_eol e lexbuf;
	ERROR (long_loc e, get e) }

  | blank* "warning" dblank0 '"'
      { clear e;
	eval_string e lexbuf;
	blank_until_eol e lexbuf;
	WARNING (long_loc e, get e) }

  | blank* (['0'-'9']+ as lnum) dblank0 '\r'? '\n'
      { e.in_directive <- false;
	new_line e;
	let here = long_loc e in
	let fname = None in
	let lnum = int_of_string lnum in
	(* Apply line directive regardless of possible #if condition. *)
	set_lnum lexbuf fname lnum;
	LINE (here, None, lnum) }

  | blank* (['0'-'9']+ as lnum) dblank0 '"'
      { clear e;
	eval_string e lexbuf;
	blank_until_eol e lexbuf;
	let here = long_loc e in
	let fname = Some (get e) in
	let lnum = int_of_string lnum in
	(* Apply line directive regardless of possible #if condition. *)
	set_lnum lexbuf fname lnum;
	LINE (here, fname, lnum) }

  | blank*
      { e.in_directive <- false;
	add e (lexeme lexbuf);
	TEXT (long_loc e, true, get e) }

  | blank* ['a'-'z']+
      { e.in_directive <- false;
	add e (lexeme lexbuf);
	TEXT (long_loc e, false, get e) }


and blank_until_eol e = parse
    blank* eof
  | blank* '\r'? '\n' { new_line e;
			e.in_directive <- false }
  | ""                { lexer_error lexbuf "syntax error in directive" }

and read_ext e = parse
    blank* "#" blank* "endext" blank* ('\r'? '\n' | eof)
      { let s = get e in
        clear e;
	new_line e;
        e.in_directive <- false;
        s }

  | (blank* as a) "\\" ("#" blank* "endext" blank* '\r'? '\n' as b)
      { add e a;
        add e b;
        new_line e;
        read_ext e lexbuf }

  | [^'\n']* '\n' as x
      { add e x;
        new_line e;
        read_ext e lexbuf }

  | eof
      { lexer_error lexbuf "End of file within #ext ... #endext" }

and ocaml_token e = parse
    "__LINE__"
      { e.line_start <- false;
	CURRENT_LINE (loc lexbuf) }

  | "__FILE__"
      { e.line_start <- false;
	CURRENT_FILE (loc lexbuf) }

  | ident as s
      { e.line_start <- false;
	IDENT (loc lexbuf, s) }

  | oc_ident as s
      { e.line_start <- false;
	TEXT (loc lexbuf, false, s) }

  | ident as s "("
      { e.line_start <- false;
	FUNIDENT (loc lexbuf, s) }

  | "'\n'" 
  | "'\r\n'" 
      { new_line e;
	TEXT (loc lexbuf, false, lexeme lexbuf) }

  | ")"       { e.line_start <- false; CL_PAREN (loc lexbuf) }
  | ","       { e.line_start <- false; COMMA (loc lexbuf) }

  | "\\)"     { e.line_start <- false; TEXT (loc lexbuf, false, " )") }
  | "\\,"     { e.line_start <- false; TEXT (loc lexbuf, false, " ,") }
  | "\\("     { e.line_start <- false; TEXT (loc lexbuf, false, " (") }
  | "\\#"     { e.line_start <- false; TEXT (loc lexbuf, false, " #") }

  | '`'
  | "!=" | "#" | "&" | "&&" | "(" |  "*" | "+" | "-"
  | "-." | "->" | "." | ".. :" | "::" | ":=" | ":>" | ";" | ";;" | "<"
  | "<-" | "=" | ">" | ">]" | ">}" | "?" | "??" | "[" | "[<" | "[>" | "[|"
  | "]" | "_" | "`" | "{" | "{<" | "|" | "|]" | "}" | "~"
  | ">>"
  | prefix_symbol 
  | infix_symbol
  | "'" ([^ '\'' '\\'] 
         | '\\' (_ | digit digit digit | 'x' hex hex)) "'"

      { e.line_start <- false; 
	TEXT (loc lexbuf, false, lexeme lexbuf) }

  | blank+
      { TEXT (loc lexbuf, true, lexeme lexbuf) }

  | '\\' ('\r'? '\n' as nl)

      {
	new_line e;
	if e.in_directive then
	  TEXT (loc lexbuf, true, nl)
	else
	  TEXT (loc lexbuf, false, lexeme lexbuf)
      }

  | '\r'? '\n'
      {
	new_line e;
	if e.in_directive then (
	  e.in_directive <- false;
	  ENDEF (loc lexbuf)
	)
	else
	  TEXT (loc lexbuf, true, lexeme lexbuf)
      }

  | "(*"
      { clear e;
	add e "(*";
	e.token_start <- pos1 lexbuf;
	comment (loc lexbuf) e 1 lexbuf }

  | '"'
      { clear e;
	add e "\"";
	e.token_start <- pos1 lexbuf;
	string e lexbuf;
	e.line_start <- false;
	TEXT (long_loc e, false, get e) }

  | "<:"
  | "<<"
      { if e.preserve_quotations then (
	  clear e;
	  add e (lexeme lexbuf);
	  e.token_start <- pos1 lexbuf;
	  quotation e lexbuf;
	  e.line_start <- false;
	  TEXT (long_loc e, false, get e)
	)
	else (
	  e.line_start <- false;
	  TEXT (loc lexbuf, false, lexeme lexbuf)
	)
      }


  | '-'? ( digit (digit | '_')*
         | ("0x"| "0X") hex (hex | '_')*
	 | ("0o"| "0O") oct (oct | '_')*	
	 | ("0b"| "0B") bin (bin | '_')* )

  | '-'? digit (digit | '_')* ('.' (digit | '_')* )? 
      (['e' 'E'] ['+' '-']? digit (digit | '_')* )? 
      { e.line_start <- false;
	TEXT (loc lexbuf, false, lexeme lexbuf) }

  | blank+
      { TEXT (loc lexbuf, true, lexeme lexbuf) }

  | _
      { e.line_start <- false;
	TEXT (loc lexbuf, false, lexeme lexbuf) }

  | eof 
      { EOF }


and comment startloc e depth = parse
    "(*"
      { add e "(*";
	comment startloc e (depth + 1) lexbuf }
      
  | "*)"
      { let depth = depth - 1 in 
	add e "*)";
	if depth > 0 then
	  comment startloc e depth lexbuf
	else (
	  e.line_start <- false;
	  TEXT (long_loc e, false, get e)
	)
      }
  | '"'
      { add_char e '"';
	string e lexbuf;
	comment startloc e depth lexbuf }
      
  | "'\n'" 
  | "'\r\n'" 
      { new_line e;
	add e (lexeme lexbuf);
	comment startloc e depth lexbuf }

  | "'" ([^ '\'' '\\'] 
         | '\\' (_ | digit digit digit | 'x' hex hex)) "'"
      { add e (lexeme lexbuf);
	comment startloc e depth lexbuf }

  | '\r'? '\n'
      { 
	new_line e;
	add e (lexeme lexbuf);
	comment startloc e depth lexbuf
      }
      
  | [^'(' '*' '"' '\'' '\r' '\n']+
      { (* Cannot tolerate "unmatched single quotes", because of '"' *)
        add e (lexeme lexbuf);
	comment startloc e depth lexbuf
      }

  | _
      { add e (lexeme lexbuf);
	comment startloc e depth lexbuf }
      
  | eof
      { error startloc "Unterminated comment reaching the end of file" }
      
      
and string e = parse
    '"'
      { add_char e '"' }
      
  | "\\\\"
  | '\\' '"'
      { add e (lexeme lexbuf);
	string e lexbuf }
      
  | '\\' '\r'? '\n'
      {
	add e (lexeme lexbuf);
	new_line e;
	string e lexbuf
      }
      
  | '\r'? '\n'
      {
	if e.in_directive then
	  lexer_error lexbuf "Unterminated string literal"
	else (
	  add e (lexeme lexbuf);
	  new_line e;
	  string e lexbuf
	)
      }
      
  | _ as c
      { add_char e c;
	string e lexbuf }
      
  | eof
      { }
      

and eval_string e = parse
    '"'
      {  }
      
  | '\\' (['\'' '\"' '\\'] as c)
      { add_char e c;
	eval_string e lexbuf }

  | '\\' '\r'? '\n'
      { assert e.in_directive;
        eval_string e lexbuf }
      
  | '\r'? '\n'
      { assert e.in_directive;
        lexer_error lexbuf "Unterminated string literal" }
      
  | '\\' (digit digit digit as s)
      { add_char e (Char.chr (int_of_string s));
	eval_string e lexbuf }
      
  | '\\' 'x' (hex as c1) (hex as c2)
      { add_char e (read_hex2 c1 c2);
	eval_string e lexbuf }
      
  | '\\' 'b'
      { add_char e '\b';
	eval_string e lexbuf }
      
  | '\\' 'n'
      { add_char e '\n';
	eval_string e lexbuf }
      
  | '\\' 'r'
      { add_char e '\r';
	eval_string e lexbuf }
      
  | '\\' 't'
      { add_char e '\t';
	eval_string e lexbuf }
      
  | [^ '\"' '\\']+
      { add e (lexeme lexbuf);
	eval_string e lexbuf }
      
  | eof
      { lexer_error lexbuf "Unterminated string literal" }
      
      
and quotation e = parse
    ">>"
      { add e ">>" }
      
  | "\\>>"
      { add e "\\>>";
	quotation e lexbuf }

  | '\\' '\r'? '\n'
      {
	if e.in_directive then (
	  new_line e;
	  quotation e lexbuf
	)
	else (
	  add e (lexeme lexbuf);
	  new_line e;
	  quotation e lexbuf
	)
      }
      
  | '\r'? '\n'
      {
	if e.in_directive then
	  lexer_error lexbuf "Unterminated quotation"
	else (
	  add e (lexeme lexbuf);
	  new_line e;
	  quotation e lexbuf
	)
      }
      
  | [^'>' '\\' '\r' '\n']+
      { add e (lexeme lexbuf);
	quotation e lexbuf }
      
  | eof 
      { lexer_error lexbuf "Unterminated quotation" }

and test_token e = parse
    "true"    { TRUE }
  | "false"   { FALSE }
  | "defined" { DEFINED }
  | "("       { OP_PAREN }
  | ")"       { CL_PAREN (loc lexbuf) }
  | "&&"      { AND }
  | "||"      { OR }
  | "not"     { NOT }
  | "="       { EQ }
  | "<"       { LT }
  | ">"       { GT }
  | "<>"      { NE }
  | "<="      { LE }
  | ">="      { GE }

  | '-'? ( digit (digit | '_')*
         | ("0x"| "0X") hex (hex | '_')*
	 | ("0o"| "0O") oct (oct | '_')*	
	 | ("0b"| "0B") bin (bin | '_')* )
      { let s = Lexing.lexeme lexbuf in
	try INT (Int64.of_string s)
	with _ -> 
	  error (loc lexbuf)
	    (sprintf "Integer constant %s is out the valid range for int64" s)
      }

  | "+"       { PLUS }
  | "-"       { MINUS }
  | "*"       { STAR }
  | "/"       { SLASH (loc lexbuf) }
  | "mod"     { MOD (loc lexbuf) }
  | "lsl"     { LSL }
  | "lsr"     { LSR }
  | "asr"     { ASR }
  | "land"    { LAND }
  | "lor"     { LOR }
  | "lxor"    { LXOR }
  | "lnot"    { LNOT }

  | ident
      { IDENT (loc lexbuf, lexeme lexbuf) }

  | blank+                   { test_token e lexbuf }
  | '\\' '\r'? '\n'          { new_line e;
			       test_token e lexbuf }
  | '\r'? '\n' 
  | eof        { assert e.in_directive;
		 e.in_directive <- false;
		 new_line e;
		 e.lexer <- `Ocaml;
		 ENDTEST (loc lexbuf) }
  | _          { error (loc lexbuf)
		   (sprintf "Invalid token %s" (Lexing.lexeme lexbuf)) }


{
  let init ~preserve_quotations file lexbuf =
    new_file lexbuf file;
    {
      preserve_quotations = preserve_quotations;
      lexer = `Ocaml;
      line_start = true;
      in_directive = false;
      buf = Buffer.create 200;
      token_start = Lexing.dummy_pos;
      lexbuf = lexbuf;
    }
}

This web site is published by Informatikbüro Gerd Stolpmann
Powered by Caml