(* $Id: nethtml_scanner.mll 1219 2009-04-14 13:28:56Z ChriS $
* ----------------------------------------------------------------------
*
*)
{
(* Tokens returned by the scanner entry points defined below.  Which
 * tokens can occur depends on the entry point that is called.
 *)
type token =
Lcomment (* <!-- : start of a comment *)
| Rcomment (* --> : end of a comment *)
| Mcomment (* a piece of text within a comment *)
| Ldoctype (* <! : start of a declaration *)
| Rdoctype (* > : end of a declaration *)
| Mdoctype (* a piece of text within a declaration *)
| Lpi (* <? : start of a processing instruction *)
| Rpi (* ?> or > : end of a processing instruction *)
| Mpi (* a piece of text within a processing instruction *)
| Lelement of string (* start tag opener; carries the name following < *)
| Lelementend of string (* end tag opener; carries the name following </ *)
| Relement (* > *)
| Relement_empty (* />, for XML compat *)
| Cdata of string (* character data, passed on verbatim *)
| Space of int (* a run of whitespace; carries its length in characters *)
| Name of string (* a name inside a tag *)
| Is (* = *)
| Literal of string (* a literal value; for a quoted one, without the quotes *)
| Other (* a character that fits no other token of the current rule *)
| Eof (* end of input *)
}
(* Character classes and helper patterns shared by the rules below.
 * Deliberately simplified: only ASCII characters are classified.
 *)
let letter = ['a'-'z' 'A'-'Z']
let digit = ['0'-'9']
let hexdigit = ['0'-'9' 'a'-'f' 'A'-'F']

(* A name starts with a letter, an underscore or a colon, and continues
 * with name characters; an nmtoken is any non-empty run of name characters.
 *)
let namechar = letter | digit | ['.' ':' '-' '_']
let name = (letter | ['_' ':']) namechar*
let nmtoken = namechar+

let ws = [' ' '\t' '\n' '\r']

(* Quoted literals, delimiters included: double-quoted and single-quoted. *)
let string_literal1 = '"' [^ '"']* '"'
let string_literal2 = '\'' [^ '\'']* '\''

(* Unquoted literals: a run of characters up to whitespace, a quote or
 * the closing angle bracket.  The first form also stops at an equals
 * sign, the second one accepts it as part of the literal.
 *)
let string_literal3 = [^ ' ' '\t' '\n' '\r' '"' '\'' '=' '>']+
let string_literal4 = [^ ' ' '\t' '\n' '\r' '"' '\'' '>']+
(* The following rules reflect HTML as it is used in practice, not the
 * SGML rules.
 *)
(* Scanner for ordinary document content, i.e. text outside of any tag,
 * comment, declaration or processing instruction.  Markup openers are
 * reported as their own tokens; everything else is returned as Cdata.
 *)
rule scan_document = parse
  | eof
      { Eof }
  | "<!--"
      { Lcomment }
  | "<!"
      { Ldoctype }
  | "<?"
      { Lpi }
  | "<" (name as tag)
      { (* start tag: only the element name is passed on *)
        Lelement tag }
  | "</" (name as tag)
      { (* end tag: only the element name is passed on *)
        Lelementend tag }
  | "<"
      { (* a misplaced < that opens no markup is literal text *)
        Cdata "<" }
  | ([^ '<']+ as text)
      { Cdata text }
(* Scanner that recognizes end tags as the only kind of markup.  Any
 * other input, including a < that does not start an end tag, is
 * returned as Cdata.
 *)
and scan_special = parse
  | eof
      { Eof }
  | "</" (name as tag)
      { (* end tag: only the element name is passed on *)
        Lelementend tag }
  | "<"
      { Cdata "<" }
  | ([^ '<']+ as text)
      { Cdata text }
(* Scanner for the inside of a comment.  Comment text is not returned;
 * every piece of it is reported as a bare Mcomment token.
 *)
and scan_comment = parse
  | "-->"
      { (* FIXME: There may be any number of ws between -- and > *)
        Rcomment }
  | "-" | [^ '-']+
      { (* either a single hyphen that does not begin the terminator,
         * or a run of characters other than hyphens
         *)
        Mcomment }
  | eof
      { Eof }
(* Scanner for the inside of a declaration.  The declaration ends at the
 * first > that is found: an occurrence of > in strings is not treated
 * specially, and [ ] brackets are ignored.
 *)
and scan_doctype = parse
  | eof
      { Eof }
  | [^ '>']+
      { Mdoctype }
  | ">"
      { Rdoctype }
(* Scanner for the inside of a processing instruction.  Both ?> and a
 * plain > terminate the instruction.
 *)
and scan_pi = parse
  | "?>" | ">"
      { Rpi }
  | '?' | [^ '>' '?']+
      { (* either a question mark that is not followed by >, or a run
         * of other characters
         *)
        Mpi }
  | eof
      { Eof }
(* Scanner for the inside of a tag, at a position where no attribute
 * value is expected.  Quotes are not interpreted here and are reported
 * as Other.
 *)
and scan_element = parse
  | ">"
      { Relement }
  | "/>"
      { Relement_empty }
  | (ws+ as blanks)
      { Space (String.length blanks) }
  | (name as ident)
      { (* listed before the literal clause so that a name wins when both
         * clauses match the same text
         *)
        Name ident }
  | "="
      { Is }
  | '"' | '\''
      { Other }
  | (string_literal3 as value)
      { Literal value }
  | eof
      { Eof }
  | _
      { Other }
(* Scanner for the inside of a tag, at the position following the = of
 * an attribute, i.e. where an attribute value is expected.  In contrast
 * to scan_element, quoted values are recognized, and = may be part of
 * an unquoted value.
 *
 * A quoted value is returned as Literal without its quotes.  A quote
 * that has no closing counterpart is returned as Other.
 *)
and scan_element_after_Is = parse
  | ">"
      { Relement }
  | "/>"
      { Relement_empty }
  | ws+
      { Space (String.length (Lexing.lexeme lexbuf)) }
  | '"'
      { (* scan_string_literal1 has no clause for input that lacks the
         * closing quote; the generated lexer raises Failure in that
         * case.  Only Failure is caught, so that any other exception
         * (for instance one raised while the buffer is refilled) is
         * passed on to the caller instead of being turned into a token.
         *)
        try
          Literal (scan_string_literal1 lexbuf)
        with
          | Failure _ -> Other
      }
  | "'"
      { (* same as above, for single quotes *)
        try
          Literal (scan_string_literal2 lexbuf)
        with
          | Failure _ -> Other
      }
  | string_literal4
      { Literal (Lexing.lexeme lexbuf) }
  | eof
      { Eof }
  | _
      { Other }
(* Reads the rest of a double-quoted literal whose opening quote has
 * already been consumed, and returns the text up to, but not including,
 * the closing quote.  There is no clause for input without a closing
 * quote, so the generated lexer fails in that case.
 *)
and scan_string_literal1 = parse
  | ([^ '"']* as contents) '"'
      { contents }
(* Reads the rest of a single-quoted literal whose opening quote has
 * already been consumed, and returns the text up to, but not including,
 * the closing quote.  There is no clause for input without a closing
 * quote, so the generated lexer fails in that case.
 *)
and scan_string_literal2 = parse
  | ([^ '\'']* as contents) '\''
      { contents }