open Html;;
open Printf
let db_filename = ref "db";;
(* The name of the file in which the current state of the database is stored *)
let pipeline_length = ref 10;;
(* Maximum number of requests to keep on the pipeline at any time.
 * This also limits the number of parallel connections.
 *)
let max_age = ref 6000.0;;
(* If an entry is older than this number of seconds, it is
 * checked again.
 *)
(**********************************************************************)
(*** AUXILIARY FUNCTIONS ***)
(**********************************************************************)
let slash_re = Str.regexp "[/]";;
(* Splits a path into its '/'-separated components *)
let text_html_re = Str.regexp "^[ \t\r]*text/html";;
(* Recognizes "text/html" content types, with or without parameters *)
let norm_path s =
(* remove ".", "..", "//" as much as possible
* Example:
* norm_path "/x/y/..//z" = "/x/z"
*)
let rec simplify_dot l =
match l with
"." :: l' -> simplify_dot l'
| "" :: l' when l' <> [] -> simplify_dot l'
| x :: l' -> x :: simplify_dot l'
| [] -> []
in
let rec simplify_dot_dot l =
match l with
| ".." :: l' -> ".." :: simplify_dot_dot l'
| x :: l' ->
let l'' = simplify_dot_dot l' in
begin match l'' with
".." :: l''' ->
l'''
| _ -> x :: l''
end
| [] -> []
in
let l = Str.split_delim slash_re s in
let s' =
match l with
"" :: l' ->
(* absolute path *)
"/" ^ String.concat "/" (simplify_dot_dot (simplify_dot l'))
| _ ->
(* relative path *)
String.concat "/" (simplify_dot_dot (simplify_dot l))
in
if s' = "" then "." else s'
;;
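(* Further examples, derived from the simplification rules above:
 *   norm_path "a/./b//c" = "a/b/c"
 *   norm_path "a/.."     = "."
 *   norm_path "../a"     = "../a"    (a leading ".." is kept)
 *)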
let norm_url s =
(* Splits the URL into the "host" part "http://..." and the path.
* Raises Not_found on any problem.
* Example:
* norm_url "http://localhost:8080/where/to/go"
* = "http://localhost:8080", "/where/to/go"
*)
try
let u = Neturl.parse_url s in
let user = try Neturl.url_user u with Not_found -> "" in
let password = try ":" ^ Neturl.url_password u with Not_found -> "" in
let host = Neturl.url_host u in
let port = try Neturl.url_port u with Not_found -> 80 in
let path1 = Neturl.join_path (Neturl.url_path u) in
let path = if path1 = "" then "/" else path1 in
let query = try "?" ^ Neturl.url_query u with Not_found -> "" in
let p1 =
"http://" ^ user ^ password ^ (if user <> "" then "@" else "") ^
String.lowercase host ^ ":" ^ string_of_int port in
let p2 = norm_path (path ^ query) in
(p1,p2)
with
| Neturl.Malformed_URL ->
eprintf "Malformed: %s\n%!" s;
raise Not_found
;;
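(* For instance, assuming Neturl parses the argument as an ordinary HTTP URL:
 *   norm_url "http://WWW.Example.COM/a/./b/../c"
 *     = ("http://www.example.com:80", "/a/c")
 * i.e. the host is lowercased, the default port 80 is made explicit, and
 * the path is normalized with norm_path.
 *)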
(**********************************************************************)
(*** HTML PARSING AND DATABASE UPDATES ***)
(**********************************************************************)
let hash_re = Str.regexp "^\\([^#]*\\)#.*";;
let last_save = ref 0.0;;
(* when the database was last saved; in seconds since the epoch *)
let parse_html db base s =
(* Parses the HTML document s, and puts new links into 'db'.
* 'base' is the URL of 's'.
*)
(* TODO: The 'base' element of HTML is not recognized. *)
let base_host, base_path =
try norm_url base
with Not_found -> failwith ("parse_html: bad base URL: " ^ base)
in
let add_link href =
(* Adds the hyperlink 'href' to the database 'db'. *)
(* Remove everything after '#': *)
let href' =
if Str.string_match hash_re href 0 then
Str.matched_group 1 href
else
href
in
(* Absolute or relative URL? - Not_found on bad URL *)
(* The following block is not completely correct: references with a scheme
 * other than "http" (e.g. "https:" or "mailto:") are treated as relative paths. *)
try
let href'' =
if String.length href' >= 7 && String.sub href' 0 7 = "http://" then (
let host_part, path = norm_url href' in
host_part ^ path
)
else
if String.length href' >= 2 && String.sub href' 0 2 = "//" then (
let host_part, path = norm_url ("http:" ^ href') in
host_part ^ path
)
else (
(* Note: in the site-absolute case below, "." and ".." components are not
 * resolved; only the relative case goes through norm_path. *)
if String.length href' >= 1 && href'.[0] = '/' then
base_host ^ href'
else (
let d =
if base_path = "/" || base_path = ""
then "/"
else Filename.dirname base_path in
base_host ^ norm_path(d ^ "/" ^ href')
)
)
in
(* prerr_endline ("Found URL " ^ href''); *)
Database.add db href'';
let t0 = Unix.gettimeofday() in
if t0 -. !last_save >= 60.0 then begin
prerr_endline "************* SAVING ****************";
Database.save db !db_filename;
last_save := t0
end
with
Not_found -> ()
in
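(* Example of the resolution performed by add_link: for
 * base = "http://host/dir/page.html" (so base_host = "http://host:80"
 * and base_path = "/dir/page.html"), the relative reference
 * "../other.html" is stored as "http://host:80/other.html".
 *)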
let rec traverse_list doclist =
(* Walks through the document given as 'doclist', and enters all
* hyperlinks into 'db'.
* All "src" and "href" attributes are recognized as hyperlinks.
*)
match doclist with
[] -> ()
| Data _ :: doclist' ->
traverse_list doclist'
| Element (name, atts, doclist') :: doclist'' ->
begin try
let href = List.assoc "href" atts in
add_link href
with Not_found -> ()
end;
begin try
let src = List.assoc "src" atts in
add_link src
with Not_found -> ()
end;
traverse_list doclist';
traverse_list doclist''
in
traverse_list(Parse.parse_string s)
;;
(**********************************************************************)
(*** THE USER AGENT ***)
(**********************************************************************)
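(* How the user agent works: 'add_next' takes the next URL from the
 * database stream and puts a HEAD request for it onto the pipeline.
 * If the response says the document is HTML, a GET request for the same
 * URL follows; its body is parsed and the discovered hyperlinks are added
 * to the database. Every callback ends by calling 'add_next' again, so up
 * to 'pipeline_length' requests are kept on the pipeline until the stream
 * is exhausted.
 *)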
let run_through_db pipeline db =
(* Uses 'pipeline' to process 'db' *)
let s = Database.iter db !max_age !max_age in
(* 's': a stream of the database entries that need to be processed *)
let rec add_get_request url =
(* Retrieves the document at the URL 'url' from the content server,
 * checks whether it is an HTML document, and if so, parses the document
 * and enters all new hyperlinks into the database 'db'.
 * After that, 'add_next' is called to put new requests onto the pipeline.
 *)
let request = new Nethttp_client.get url in
pipeline # add_with_callback
request
(fun m ->
(* Parse the response if it is OK and mime type = HTML: *)
begin try
let _, code, _ = m # dest_status() in
if code >= 200 && code <= 299 then begin
let mime_type =
try m # assoc_resp_header "content-type"
with Not_found -> "" in
let t0 = Unix.gettimeofday() in
Database.update
db
url
code
t0
t0
mime_type;
if Str.string_match text_html_re mime_type 0 then begin
(* The regexp also accepts "text/html; ...parameters..." *)
let base_url =
try m # assoc_resp_header "content-base"
with
Not_found ->
try m # assoc_resp_header "content-location"
with
Not_found ->
url
in
parse_html db base_url (m # get_resp_body());
end;
prerr_endline ("Done " ^ url);
end
else begin
let t0 = Unix.gettimeofday() in
Database.update
db
url
code
0.0
t0
"";
prerr_endline ("Error " ^ url);
end
with
any ->
prerr_endline ("Serious error: " ^
Printexc.to_string any);
let t0 = Unix.gettimeofday() in
Database.update
db
url
999
0.0
t0
"";
prerr_endline ("Error " ^ url);
end;
(* Put the next request into the pipeline: *)
add_next()
)
and add_head_request url =
(* Retrieves only the header of the document at the URL 'url' from the
 * content server, checks whether it is an HTML document, and if so,
 * adds a GET request to the pipeline (by calling add_get_request).
 * If the document is not an HTML document (or if an error occurred),
 * 'add_next' is called to put new requests onto the pipeline.
 *)
let request = new Nethttp_client.head url in
pipeline # add_with_callback
request
(fun m ->
(* Parse the response if it is OK and mime type = HTML: *)
begin try
let _, code, _ = m # dest_status() in
if code >= 200 && code <= 299 then begin
let mime_type =
try m # assoc_resp_header "content-type"
with Not_found -> "" in
let t0 = Unix.gettimeofday() in
Database.update
db
url
code
t0
t0
mime_type;
if Str.string_match text_html_re mime_type 0 then begin
(* The regexp also accepts "text/html; ...parameters..." *)
add_get_request url
end
else begin
prerr_endline ("Done " ^ url);
add_next()
end
end
else begin
let t0 = Unix.gettimeofday() in
Database.update
db
url
code
0.0
t0
"";
prerr_endline ("Error " ^ url);
add_next()
end
with
any ->
prerr_endline ("Serious error: " ^
Printexc.to_string any);
let t0 = Unix.gettimeofday() in
Database.update
db
url
999
0.0
t0
"";
prerr_endline ("Error " ^ url);
add_next()
end;
)
and add_next() =
(* Fetch the next task(s) from the stream 's', and add new requests
* onto the pipeline.
*)
try
let (next_url,_,_,_,_) = Stream.next s in (* or Stream.Failure *)
try
add_head_request next_url;
if pipeline # number_of_open_messages < !pipeline_length then
add_next()
with
| any ->
prerr_endline ("Error while adding to the pipeline: " ^
Printexc.to_string any);
prerr_endline "Proceeding with next element...";
let t0 = Unix.gettimeofday() in
Database.update
db
next_url
999
0.0
t0
"";
add_next()
with
Stream.Failure -> ()
(* 's' is empty: all is done *)
in
let rec run_pipeline () =
try
pipeline # run()
with
Unix.Unix_error(_,_,_) ->
prerr_endline ("Unix error. Continuing with next element...");
run_pipeline()
| Sys_error s ->
prerr_endline ("System error: " ^ s ^
" - Continuing with next element...");
run_pipeline()
| Failure s ->
prerr_endline ("Failure: " ^ s ^
" - Continuing with next element...");
run_pipeline()
in
add_next(); (* Add the initial requests to the pipeline *)
run_pipeline() (* Start executing the pipeline *)
;;
let main() =
let start_points = ref [] in
let proxy_host = ref "" in
let proxy_port = ref 8080 in
let no_proxy = ref [] in
Arg.parse
[ "-db", Arg.String (fun s -> db_filename := s),
" <name> Name of the database file (default: db)";
"-limit", Arg.Int (fun n -> pipeline_length := n),
" <n> Limit for the length of the pipeline (default: 1000)";
"-proxy-host", Arg.String (fun s -> proxy_host := s),
" <name> Host name of the proxy to use";
"-proxy-port", Arg.Int (fun n -> proxy_port := n),
" <n> Port of the proxy to use (default: 8080)";
"-no-proxy", Arg.String (fun s -> no_proxy := s :: !no_proxy),
" <name> Do not use proxy for this host or this domain";
"-max-age", Arg.Int (fun n -> max_age := float_of_int n),
" <n> Maximum age of database entries";
]
(fun s -> start_points := s :: !start_points)
"usage: spider [options] URL ...";
let pipeline = new Nethttp_client.pipeline in
if !proxy_host <> "" then
pipeline # set_proxy !proxy_host !proxy_port;
pipeline # avoid_proxy_for !no_proxy;
let db = Database.restore !db_filename in
List.iter
(fun s ->
try
let host_part, path = norm_url s in
prerr_endline ("Adding " ^ host_part ^ path);
Database.add db (host_part ^ path)
with
Not_found ->
prerr_endline ("Bad URL: " ^ s)
)
!start_points;
let opt = pipeline # get_options in
pipeline # set_options
{ opt with
Nethttp_client.number_of_parallel_connections = 1;
Nethttp_client.verbose_status = true;
Nethttp_client.verbose_connection = true;
Nethttp_client.verbose_response_header = true;
(* Nethttp_client.verbose_response_contents = true;
Nethttp_client.verbose_request_header = true;
Nethttp_client.verbose_request_contents = true;
*)
};
run_through_db pipeline db;
Database.save db !db_filename
;;
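(* Start the spider (assuming this file constitutes the whole program).
 * A hypothetical invocation: ./spider -limit 10 http://localhost/
 *)
main ();;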