Mercurial > hg > lukkari
comparison fetchdata.pl @ 1:21fde93375e9
Add beta code.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Tue, 11 Jan 2011 20:43:12 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
0:02f0f79f98b6 | 1:21fde93375e9 |
---|---|
1 #!/usr/bin/perl -w | |
2 # | |
3 # Fetch and parse HTML format class timetable into more sane formats | |
4 # (C) Copyright 2010-2010 Matti Hämäläinen <ccr@tnsp.org> | |
5 # | |
6 use strict; | |
7 use Data::Dumper; | |
8 use LWP::UserAgent; | |
9 use HTML::Entities; | |
10 | |
11 my $userAgent = "Lukkari/0.7"; | |
12 | |
13 | |
14 sub urlencode($) | |
15 { | |
16 my $value = $_[0]; | |
17 $value =~ s/([^a-zA-Z_0-9 ])/"%" . uc(sprintf "%lx" , unpack("C", $1))/eg; | |
18 $value =~ tr/ /+/; | |
19 return $value; | |
20 } | |
21 | |
22 | |
23 sub str_trim($) | |
24 { | |
25 my $str = $_[0]; | |
26 if (defined($str)) { | |
27 $str =~ s/^\s*//; | |
28 $str =~ s/\s*$//; | |
29 } | |
30 return $str; | |
31 } | |
32 | |
33 | |
34 sub pop_token_a($) | |
35 { | |
36 my $tokens = $_[0]; | |
37 return shift(@$tokens); | |
38 } | |
39 | |
40 | |
41 sub pop_token($) | |
42 { | |
43 return str_trim(pop_token_a($_[0])); | |
44 } | |
45 | |
46 | |
47 sub parse_html_str($) | |
48 { | |
49 my $tokens = $_[0]; | |
50 my $token = pop_token($tokens); | |
51 my $str = ""; | |
52 $token =~ s/^\s*//; | |
53 | |
54 return undef unless (substr($token, 0, 1) eq '"'); | |
55 $token = substr($token, 1); | |
56 | |
57 while (defined($token)) { | |
58 if (substr($token, -1) eq '"') { | |
59 $str .= substr($token, 0, -1); | |
60 return $str; | |
61 } else { | |
62 $str .= $token; | |
63 } | |
64 $token = shift(@$tokens); | |
65 } | |
66 return undef; | |
67 } | |
68 | |
69 | |
70 sub parse_html_tree($$); | |
71 | |
72 sub parse_html_tree($$) | |
73 { | |
74 my ($tokens, $tree) = @_; | |
75 | |
76 while (my $token = pop_token($tokens)) { | |
77 if ($token =~ /^<[!\/]?[a-zA-Z]+/) { | |
78 $token = lc($token); | |
79 if ($token =~ /^<\!.*>$/) { | |
80 # Ignore comments etc. | |
81 } elsif ($token =~ /^<([a-z]+)(.*)>$/) { | |
82 my ($name, $args) = ($1, $2); | |
83 if ($name eq "meta" || $name eq "img") { | |
84 my $tmp = {}; | |
85 parse_html_tree($tokens, $tree); | |
86 return $tree; | |
87 } else { | |
88 my $tmp = { "name" => $name, "args" => str_trim($args) }; | |
89 parse_html_tree($tokens, $tmp); | |
90 push(@{$$tree{"nodes"}}, $tmp); | |
91 } | |
92 } elsif ($token =~ /^<\/([a-z]+)>$/) { | |
93 return $tree; | |
94 } else { | |
95 die("HORROR TERROR ELITE: $token\n"); | |
96 } | |
97 } else { | |
98 $token = str_trim(decode_entities($token)); | |
99 push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0; | |
100 } | |
101 } | |
102 | |
103 return $tree; | |
104 } | |
105 | |
106 | |
107 sub parse_html($) | |
108 { | |
109 return undef unless defined($_[0]); | |
110 my $str = $_[0]; | |
111 my $res = { "name" => "", "args" => "" }; | |
112 $str =~ tr/\r/ /; | |
113 $str =~ tr/\n/ /; | |
114 my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str); | |
115 parse_html_tree(\@tokens, $res); | |
116 return $res; | |
117 } | |
118 | |
119 | |
120 sub http_fetch($) | |
121 { | |
122 my $agent = LWP::UserAgent->new; | |
123 $agent->agent($userAgent); | |
124 $agent->timeout(10); | |
125 | |
126 my $req = HTTP::Request->new(GET => $_[0]); | |
127 $req->user_agent($userAgent); | |
128 my $res = $agent->request($req); | |
129 | |
130 if ($res->is_success) { | |
131 return $res->content; | |
132 } else { | |
133 print STDERR "HTTP request failed: [".$res->code."] ".$res->message."\n"; | |
134 return undef; | |
135 } | |
136 } | |
137 | |
138 | |
139 sub html_find_node($$$); | |
140 | |
141 sub html_find_node($$$) | |
142 { | |
143 my ($node, $name, $args) = @_; | |
144 | |
145 if (defined($node)) { | |
146 if (ref($node) eq "ARRAY") { | |
147 foreach my $n (@$node) { | |
148 my $tmp = html_find_node($n, $name, $args); | |
149 # Must do it like this, in order not to break the loop | |
150 return $tmp if defined($tmp); | |
151 } | |
152 } elsif (ref($node) eq "HASH") { | |
153 if (defined($$node{"name"})) { | |
154 if ($$node{"name"} eq $name) { | |
155 if ($args ne "") { | |
156 if (defined($$node{"args"}) && $$node{"args"} =~ /$args/) { | |
157 } else { | |
158 return undef; | |
159 } | |
160 } | |
161 return $node; | |
162 } else { | |
163 return html_find_node($$node{"nodes"}, $name, $args); | |
164 } | |
165 } | |
166 } | |
167 } | |
168 return undef; | |
169 } | |
170 | |
171 | |
172 sub fnode($$) | |
173 { | |
174 return html_find_node($_[0], $_[1], ""); | |
175 } | |
176 | |
177 | |
178 sub fnodea($$$) | |
179 { | |
180 return html_find_node($_[0], $_[1], $_[2]); | |
181 } | |
182 | |
183 | |
184 sub escape($) | |
185 { | |
186 my $s = $_[0]; | |
187 $s =~ s/(['"])/\\$1/g; | |
188 return $s; | |
189 } | |
190 | |
191 | |
192 sub html_collapse($$); | |
193 | |
194 sub html_collapse($$) | |
195 { | |
196 my ($node, $strip) = @_; | |
197 my $str = ""; | |
198 | |
199 if ($$node{"name"} eq "text") { | |
200 $str .= $$node{"text"}; | |
201 } else { | |
202 $str .= "<".$$node{"name"}.">" unless ($strip); | |
203 foreach my $n (@{$$node{"nodes"}}) { | |
204 $str .= html_collapse($n, $strip); | |
205 } | |
206 $str .= "</".$$node{"name"}.">" unless ($strip); | |
207 } | |
208 | |
209 return $str; | |
210 } | |
211 | |
212 | |
213 ### | |
214 ### Main program | |
215 ### | |
216 my $modes = "simple|php|xml"; | |
217 my $opt_mode = "php"; | |
218 my $opt_dump = 0; | |
219 my $opt_filename; | |
220 my $opt_outfile; | |
221 | |
222 while (defined(my $arg = shift)) { | |
223 if (substr($arg, 0, 1) eq "-") { | |
224 if ($arg =~ /^-($modes)$/o) { | |
225 $opt_mode = $1; | |
226 } | |
227 elsif ($arg eq "-dump") { | |
228 $opt_dump = 1; | |
229 } | |
230 elsif ($arg eq "-o") { | |
231 $opt_outfile = shift or die("Output filename option -o requires an argument.\n"); | |
232 } else { | |
233 die("Invalid option '$arg'.\n"); | |
234 } | |
235 } else { | |
236 $opt_filename = $arg; | |
237 } | |
238 } | |
239 | |
240 die("Usage: $0 [options] <filename|URI> | |
241 | |
242 -php Output a PHP include file with data in arrays (default) | |
243 -simple Output simple tabled output for easy parsing. | |
244 -xml Output XML. | |
245 | |
246 -o <filename> Set output filename. Default is to use stdout. | |
247 | |
248 -dump Dump HTML tree to stdout and quit. | |
249 | |
250 ") unless defined($opt_filename); | |
251 | |
252 | |
253 my $data; | |
254 if ($opt_filename =~ /^(http|https):/) { | |
255 $data = http_fetch($opt_filename) or die("Could not fetch: $opt_filename\n"); | |
256 } else { | |
257 open(my $fh, '<', $opt_filename) or die("Error opening '$opt_filename': $!\n"); | |
258 $data = do { local $/; <$fh> }; | |
259 close($fh); | |
260 } | |
261 | |
262 die("No data in input.\n") unless (defined($data) && $data ne ""); | |
263 | |
264 | |
265 # Filter out certain unneeded elements | |
266 $data =~ s/<font[^>]*>//ig; | |
267 $data =~ s/<\/font>//ig; | |
268 $data =~ s/<\/?center>//ig; | |
269 $data =~ s/<br>//ig; | |
270 $data =~ s/ / /ig; | |
271 | |
272 ### Get some general information | |
273 my $otree = parse_html($data); | |
274 if ($opt_dump) { | |
275 print Dumper(fnode($otree, "html")); | |
276 exit; | |
277 } | |
278 | |
279 my %class = (); | |
280 my $body = fnode($otree, "body"); | |
281 if (defined($body) && defined($$body{"nodes"})) { | |
282 foreach my $n (@{$$body{"nodes"}}) { | |
283 if ($$n{"name"} eq "text") { | |
284 push(@{$class{"info"}}, $$n{"text"}); | |
285 } | |
286 elsif ($$n{"name"} eq "b") { | |
287 push(@{$class{"data"}}, $n); | |
288 } | |
289 } | |
290 } | |
291 | |
292 # Filter out some more, for easier tree access during table parsing | |
293 $data =~ s/<\/?b>//ig; | |
294 my $tree = parse_html($data); | |
295 my $node = fnode(fnode($tree, "body"), "table"); | |
296 die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node); | |
297 | |
298 | |
299 ### Parse through the HTML document node tree to find the data we need | |
300 my $id = 0; | |
301 my $q = $$node{"nodes"}; | |
302 my $tunnit = {}; | |
303 my $taulu = {}; | |
304 my $maxdays = 6; | |
305 my $maxhours = 0; | |
306 | |
307 # Skip zero this way | |
308 for (my $i = 1; $i < scalar(@{$q}); $i++) { | |
309 my $d = $$q[$i]{"nodes"}; | |
310 if (defined($d)) { | |
311 foreach my $n (@{$d}) { | |
312 my $l = $$n{"nodes"}[0]{"nodes"}; | |
313 if (defined($l) && $$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/) { | |
314 my $tuntia = $1 / 2; | |
315 my $data = []; | |
316 my $grouped = 0; | |
317 foreach my $h (@{$l}) { | |
318 if (defined($$h{"nodes"})) { | |
319 foreach my $b (@{$$h{"nodes"}}) { | |
320 if (defined($$b{"nodes"})) { | |
321 my $text = $$b{"nodes"}[0]{"text"}; | |
322 $text =~ s/\.$//; | |
323 | |
324 $grouped = 1 if ($text =~ /vuorov/); | |
325 | |
326 push(@$data, $text); | |
327 } | |
328 } | |
329 } | |
330 } | |
331 | |
332 my $tid; | |
333 if (scalar(@$data) > 0) { | |
334 $id++; | |
335 $tid = $id; | |
336 } else { | |
337 $tid = 0; | |
338 } | |
339 | |
340 my $tpd = 0; | |
341 for (my $x = 0; $x < $maxdays; $x++) { | |
342 if (!defined($$taulu{$maxhours}{$x})) { | |
343 $tpd = $x; | |
344 last; | |
345 } | |
346 } | |
347 for (my $t = 0; $t < $tuntia; $t++) { | |
348 $$taulu{$maxhours + $t}{$tpd} = $tid; | |
349 } | |
350 | |
351 if (scalar(@$data) > 0) { | |
352 # Grouped, if there is another class ID in second slot | |
353 $grouped = 1 if ($$data[1] =~ /^[A-Z]\d{6}$/); | |
354 $$tunnit{$id} = { "grouped" => $grouped, "day" => $tpd, "start" => $maxhours, "hours" => $tuntia, "data" => $data }; | |
355 } | |
356 } | |
357 } | |
358 $maxhours++; | |
359 } | |
360 } | |
361 | |
362 | |
363 ### Open output file, if specified | |
364 if (defined($opt_outfile)) { | |
365 open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile'.\n"); | |
366 } | |
367 | |
368 | |
369 ### Output data in desired format | |
370 if ($opt_mode eq "php") { | |
371 print "<?\n". | |
372 "\$classInfo = array(\n". | |
373 " \"general\" => array(".join(", ", map { "\"".escape($_)."\""; } @{$class{"info"}})."),\n". | |
374 " \"info\" => array(".join(", ", map { "\"".escape(html_collapse($_, 1))."\""; } @{$class{"data"}})."),\n". | |
375 " \"info_tags\" => array(".join(", ", map { "\"".escape(html_collapse($_, 0))."\""; } @{$class{"data"}})."),\n". | |
376 " \"maxdays\" => $maxdays,\n". | |
377 " \"maxhours\" => $maxhours,\n". | |
378 ");\n\n"; | |
379 | |
380 print "\$classDefs = array(\n"; | |
381 foreach my $id (sort { $a <=> $b } keys %{$tunnit}) { | |
382 print " $id => array("; | |
383 foreach my $key (keys %{$$tunnit{$id}}) { | |
384 my $a = $$tunnit{$id}{$key}; | |
385 print "\"$key\" => "; | |
386 if (ref($a) eq "ARRAY") { | |
387 print "array(".join(", ", map { "\"".escape($_)."\""; } @$a).")"; | |
388 } | |
389 elsif ($a =~ /^\d+$/) { | |
390 print $a; | |
391 } else { | |
392 print "\"".escape($a)."\""; | |
393 } | |
394 print ", "; | |
395 } | |
396 print "),\n"; | |
397 } | |
398 | |
399 print ");\n". | |
400 "\n". | |
401 "\$classHourTable = array(\n"; | |
402 for (my $y = 0; $y < $maxhours; $y++) { | |
403 my $str = ""; | |
404 for (my $x = 0; $x < $maxdays; $x++) { | |
405 $str .= ", " unless ($str eq ""); | |
406 $str .= sprintf "%3d", $$taulu{$y}{$x}; | |
407 } | |
408 print " array(".$str."),\n"; | |
409 } | |
410 print ");\n?>\n"; | |
411 } | |
412 | |
413 elsif ($opt_mode eq "xml") { | |
414 print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n". | |
415 "<timetable>\n". | |
416 " <class>\n". | |
417 " <general>".join("", map { "<node>".encode_entities($_)."</node>"; } @{$class{"info"}})."</general>\n". | |
418 " <info>".join("", map { "<node>".encode_entities(html_collapse($_, 1))."</node>"; } @{$class{"data"}})."</info>\n". | |
419 " <maxdays>$maxdays</maxdays>\n". | |
420 " <maxhours>$maxhours</maxhours>\n". | |
421 " </class>\n"; | |
422 | |
423 | |
424 print "</timetable>\n"; | |
425 } | |
426 | |
427 | |
428 | |
429 elsif ($opt_mode eq "simple") { | |
430 for (my $y = 0; $y < $maxhours; $y++) { | |
431 for (my $x = 0; $x < $maxdays; $x++) { | |
432 printf "%3d ", $$taulu{$y}{$x}; | |
433 } | |
434 print "\n"; | |
435 } | |
436 } | |
437 | |
438 close (STDOUT); |