1
|
1 #!/usr/bin/perl -w
|
|
2 #
|
|
3 # Fetch and parse HTML format class timetable into more sane formats
|
|
4 # (C) Copyright 2010-2010 Matti Hämäläinen <ccr@tnsp.org>
|
|
5 #
|
|
6 use strict;
|
|
7 use Data::Dumper;
|
|
8 use LWP::UserAgent;
|
|
9 use HTML::Entities;
|
|
10
|
|
11 my $userAgent = "Lukkari/0.7";
|
|
12
|
|
13
|
|
14 sub urlencode($)
|
|
15 {
|
|
16 my $value = $_[0];
|
|
17 $value =~ s/([^a-zA-Z_0-9 ])/"%" . uc(sprintf "%lx" , unpack("C", $1))/eg;
|
|
18 $value =~ tr/ /+/;
|
|
19 return $value;
|
|
20 }
|
|
21
|
|
22
|
|
23 sub str_trim($)
|
|
24 {
|
|
25 my $str = $_[0];
|
|
26 if (defined($str)) {
|
|
27 $str =~ s/^\s*//;
|
|
28 $str =~ s/\s*$//;
|
|
29 }
|
|
30 return $str;
|
|
31 }
|
|
32
|
|
33
|
|
34 sub pop_token_a($)
|
|
35 {
|
|
36 my $tokens = $_[0];
|
|
37 return shift(@$tokens);
|
|
38 }
|
|
39
|
|
40
|
|
41 sub pop_token($)
|
|
42 {
|
|
43 return str_trim(pop_token_a($_[0]));
|
|
44 }
|
|
45
|
|
46
|
|
47 sub parse_html_str($)
|
|
48 {
|
|
49 my $tokens = $_[0];
|
|
50 my $token = pop_token($tokens);
|
|
51 my $str = "";
|
|
52 $token =~ s/^\s*//;
|
|
53
|
|
54 return undef unless (substr($token, 0, 1) eq '"');
|
|
55 $token = substr($token, 1);
|
|
56
|
|
57 while (defined($token)) {
|
|
58 if (substr($token, -1) eq '"') {
|
|
59 $str .= substr($token, 0, -1);
|
|
60 return $str;
|
|
61 } else {
|
|
62 $str .= $token;
|
|
63 }
|
|
64 $token = shift(@$tokens);
|
|
65 }
|
|
66 return undef;
|
|
67 }
|
|
68
|
|
69
|
|
70 sub parse_html_tree($$);
|
|
71
|
|
72 sub parse_html_tree($$)
|
|
73 {
|
|
74 my ($tokens, $tree) = @_;
|
|
75
|
|
76 while (my $token = pop_token($tokens)) {
|
|
77 if ($token =~ /^<[!\/]?[a-zA-Z]+/) {
|
|
78 $token = lc($token);
|
|
79 if ($token =~ /^<\!.*>$/) {
|
|
80 # Ignore comments etc.
|
|
81 } elsif ($token =~ /^<([a-z]+)(.*)>$/) {
|
|
82 my ($name, $args) = ($1, $2);
|
|
83 if ($name eq "meta" || $name eq "img") {
|
|
84 my $tmp = {};
|
|
85 parse_html_tree($tokens, $tree);
|
|
86 return $tree;
|
|
87 } else {
|
|
88 my $tmp = { "name" => $name, "args" => str_trim($args) };
|
|
89 parse_html_tree($tokens, $tmp);
|
|
90 push(@{$$tree{"nodes"}}, $tmp);
|
|
91 }
|
|
92 } elsif ($token =~ /^<\/([a-z]+)>$/) {
|
|
93 return $tree;
|
|
94 } else {
|
|
95 die("HORROR TERROR ELITE: $token\n");
|
|
96 }
|
|
97 } else {
|
|
98 $token = str_trim(decode_entities($token));
|
|
99 push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0;
|
|
100 }
|
|
101 }
|
|
102
|
|
103 return $tree;
|
|
104 }
|
|
105
|
|
106
|
|
107 sub parse_html($)
|
|
108 {
|
|
109 return undef unless defined($_[0]);
|
|
110 my $str = $_[0];
|
|
111 my $res = { "name" => "", "args" => "" };
|
|
112 $str =~ tr/\r/ /;
|
|
113 $str =~ tr/\n/ /;
|
|
114 my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str);
|
|
115 parse_html_tree(\@tokens, $res);
|
|
116 return $res;
|
|
117 }
|
|
118
|
|
119
|
|
120 sub http_fetch($)
|
|
121 {
|
|
122 my $agent = LWP::UserAgent->new;
|
|
123 $agent->agent($userAgent);
|
|
124 $agent->timeout(10);
|
|
125
|
|
126 my $req = HTTP::Request->new(GET => $_[0]);
|
|
127 $req->user_agent($userAgent);
|
|
128 my $res = $agent->request($req);
|
|
129
|
|
130 if ($res->is_success) {
|
|
131 return $res->content;
|
|
132 } else {
|
|
133 print STDERR "HTTP request failed: [".$res->code."] ".$res->message."\n";
|
|
134 return undef;
|
|
135 }
|
|
136 }
|
|
137
|
|
138
|
|
139 sub html_find_node($$$);
|
|
140
|
|
141 sub html_find_node($$$)
|
|
142 {
|
|
143 my ($node, $name, $args) = @_;
|
|
144
|
|
145 if (defined($node)) {
|
|
146 if (ref($node) eq "ARRAY") {
|
|
147 foreach my $n (@$node) {
|
|
148 my $tmp = html_find_node($n, $name, $args);
|
|
149 # Must do it like this, in order not to break the loop
|
|
150 return $tmp if defined($tmp);
|
|
151 }
|
|
152 } elsif (ref($node) eq "HASH") {
|
|
153 if (defined($$node{"name"})) {
|
|
154 if ($$node{"name"} eq $name) {
|
|
155 if ($args ne "") {
|
|
156 if (defined($$node{"args"}) && $$node{"args"} =~ /$args/) {
|
|
157 } else {
|
|
158 return undef;
|
|
159 }
|
|
160 }
|
|
161 return $node;
|
|
162 } else {
|
|
163 return html_find_node($$node{"nodes"}, $name, $args);
|
|
164 }
|
|
165 }
|
|
166 }
|
|
167 }
|
|
168 return undef;
|
|
169 }
|
|
170
|
|
171
|
|
172 sub fnode($$)
|
|
173 {
|
|
174 return html_find_node($_[0], $_[1], "");
|
|
175 }
|
|
176
|
|
177
|
|
178 sub fnodea($$$)
|
|
179 {
|
|
180 return html_find_node($_[0], $_[1], $_[2]);
|
|
181 }
|
|
182
|
|
183
|
|
184 sub escape($)
|
|
185 {
|
|
186 my $s = $_[0];
|
|
187 $s =~ s/(['"])/\\$1/g;
|
|
188 return $s;
|
|
189 }
|
|
190
|
|
191
|
|
192 sub html_collapse($$);
|
|
193
|
|
194 sub html_collapse($$)
|
|
195 {
|
|
196 my ($node, $strip) = @_;
|
|
197 my $str = "";
|
|
198
|
|
199 if ($$node{"name"} eq "text") {
|
|
200 $str .= $$node{"text"};
|
|
201 } else {
|
|
202 $str .= "<".$$node{"name"}.">" unless ($strip);
|
|
203 foreach my $n (@{$$node{"nodes"}}) {
|
|
204 $str .= html_collapse($n, $strip);
|
|
205 }
|
|
206 $str .= "</".$$node{"name"}.">" unless ($strip);
|
|
207 }
|
|
208
|
|
209 return $str;
|
|
210 }
|
|
211
|
|
212
|
|
213 ###
|
|
214 ### Main program
|
|
215 ###
|
|
216 my $modes = "simple|php|xml";
|
|
217 my $opt_mode = "php";
|
|
218 my $opt_dump = 0;
|
|
219 my $opt_filename;
|
|
220 my $opt_outfile;
|
|
221
|
|
222 while (defined(my $arg = shift)) {
|
|
223 if (substr($arg, 0, 1) eq "-") {
|
|
224 if ($arg =~ /^-($modes)$/o) {
|
|
225 $opt_mode = $1;
|
|
226 }
|
|
227 elsif ($arg eq "-dump") {
|
|
228 $opt_dump = 1;
|
|
229 }
|
|
230 elsif ($arg eq "-o") {
|
|
231 $opt_outfile = shift or die("Output filename option -o requires an argument.\n");
|
|
232 } else {
|
|
233 die("Invalid option '$arg'.\n");
|
|
234 }
|
|
235 } else {
|
|
236 $opt_filename = $arg;
|
|
237 }
|
|
238 }
|
|
239
|
|
240 die("Usage: $0 [options] <filename|URI>
|
|
241
|
|
242 -php Output a PHP include file with data in arrays (default)
|
|
243 -simple Output simple tabled output for easy parsing.
|
|
244 -xml Output XML.
|
|
245
|
|
246 -o <filename> Set output filename. Default is to use stdout.
|
|
247
|
|
248 -dump Dump HTML tree to stdout and quit.
|
|
249
|
|
250 ") unless defined($opt_filename);
|
|
251
|
|
252
|
|
253 my $data;
|
|
254 if ($opt_filename =~ /^(http|https):/) {
|
|
255 $data = http_fetch($opt_filename) or die("Could not fetch: $opt_filename\n");
|
|
256 } else {
|
|
257 open(my $fh, '<', $opt_filename) or die("Error opening '$opt_filename': $!\n");
|
|
258 $data = do { local $/; <$fh> };
|
|
259 close($fh);
|
|
260 }
|
|
261
|
|
262 die("No data in input.\n") unless (defined($data) && $data ne "");
|
|
263
|
|
264
|
|
265 # Filter out certain unneeded elements
|
|
266 $data =~ s/<font[^>]*>//ig;
|
|
267 $data =~ s/<\/font>//ig;
|
|
268 $data =~ s/<\/?center>//ig;
|
|
269 $data =~ s/<br>//ig;
|
|
270 $data =~ s/ / /ig;
|
|
271
|
|
272 ### Get some general information
|
|
273 my $otree = parse_html($data);
|
|
274 if ($opt_dump) {
|
|
275 print Dumper(fnode($otree, "html"));
|
|
276 exit;
|
|
277 }
|
|
278
|
|
279 my %class = ();
|
|
280 my $body = fnode($otree, "body");
|
|
281 if (defined($body) && defined($$body{"nodes"})) {
|
|
282 foreach my $n (@{$$body{"nodes"}}) {
|
|
283 if ($$n{"name"} eq "text") {
|
|
284 push(@{$class{"info"}}, $$n{"text"});
|
|
285 }
|
|
286 elsif ($$n{"name"} eq "b") {
|
|
287 push(@{$class{"data"}}, $n);
|
|
288 }
|
|
289 }
|
|
290 }
|
|
291
|
|
292 # Filter out some more, for easier tree access during table parsing
|
|
293 $data =~ s/<\/?b>//ig;
|
|
294 my $tree = parse_html($data);
|
|
295 my $node = fnode(fnode($tree, "body"), "table");
|
|
296 die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node);
|
|
297
|
|
298
|
|
299 ### Parse through the HTML document node tree to find the data we need
|
|
300 my $id = 0;
|
|
301 my $q = $$node{"nodes"};
|
|
302 my $tunnit = {};
|
|
303 my $taulu = {};
|
|
304 my $maxdays = 6;
|
|
305 my $maxhours = 0;
|
|
306
|
|
307 # Skip zero this way
|
|
308 for (my $i = 1; $i < scalar(@{$q}); $i++) {
|
|
309 my $d = $$q[$i]{"nodes"};
|
|
310 if (defined($d)) {
|
|
311 foreach my $n (@{$d}) {
|
|
312 my $l = $$n{"nodes"}[0]{"nodes"};
|
|
313 if (defined($l) && $$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/) {
|
|
314 my $tuntia = $1 / 2;
|
|
315 my $data = [];
|
|
316 my $grouped = 0;
|
|
317 foreach my $h (@{$l}) {
|
|
318 if (defined($$h{"nodes"})) {
|
|
319 foreach my $b (@{$$h{"nodes"}}) {
|
|
320 if (defined($$b{"nodes"})) {
|
|
321 my $text = $$b{"nodes"}[0]{"text"};
|
|
322 $text =~ s/\.$//;
|
|
323
|
|
324 $grouped = 1 if ($text =~ /vuorov/);
|
|
325
|
|
326 push(@$data, $text);
|
|
327 }
|
|
328 }
|
|
329 }
|
|
330 }
|
|
331
|
|
332 my $tid;
|
|
333 if (scalar(@$data) > 0) {
|
|
334 $id++;
|
|
335 $tid = $id;
|
|
336 } else {
|
|
337 $tid = 0;
|
|
338 }
|
|
339
|
|
340 my $tpd = 0;
|
|
341 for (my $x = 0; $x < $maxdays; $x++) {
|
|
342 if (!defined($$taulu{$maxhours}{$x})) {
|
|
343 $tpd = $x;
|
|
344 last;
|
|
345 }
|
|
346 }
|
|
347 for (my $t = 0; $t < $tuntia; $t++) {
|
|
348 $$taulu{$maxhours + $t}{$tpd} = $tid;
|
|
349 }
|
|
350
|
|
351 if (scalar(@$data) > 0) {
|
|
352 # Grouped, if there is another class ID in second slot
|
|
353 $grouped = 1 if ($$data[1] =~ /^[A-Z]\d{6}$/);
|
|
354 $$tunnit{$id} = { "grouped" => $grouped, "day" => $tpd, "start" => $maxhours, "hours" => $tuntia, "data" => $data };
|
|
355 }
|
|
356 }
|
|
357 }
|
|
358 $maxhours++;
|
|
359 }
|
|
360 }
|
|
361
|
|
362
|
|
363 ### Open output file, if specified
|
|
364 if (defined($opt_outfile)) {
|
|
365 open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile'.\n");
|
|
366 }
|
|
367
|
|
368
|
|
369 ### Output data in desired format
|
|
370 if ($opt_mode eq "php") {
|
|
371 print "<?\n".
|
|
372 "\$classInfo = array(\n".
|
|
373 " \"general\" => array(".join(", ", map { "\"".escape($_)."\""; } @{$class{"info"}})."),\n".
|
|
374 " \"info\" => array(".join(", ", map { "\"".escape(html_collapse($_, 1))."\""; } @{$class{"data"}})."),\n".
|
|
375 " \"info_tags\" => array(".join(", ", map { "\"".escape(html_collapse($_, 0))."\""; } @{$class{"data"}})."),\n".
|
|
376 " \"maxdays\" => $maxdays,\n".
|
|
377 " \"maxhours\" => $maxhours,\n".
|
|
378 ");\n\n";
|
|
379
|
|
380 print "\$classDefs = array(\n";
|
|
381 foreach my $id (sort { $a <=> $b } keys %{$tunnit}) {
|
|
382 print " $id => array(";
|
|
383 foreach my $key (keys %{$$tunnit{$id}}) {
|
|
384 my $a = $$tunnit{$id}{$key};
|
|
385 print "\"$key\" => ";
|
|
386 if (ref($a) eq "ARRAY") {
|
|
387 print "array(".join(", ", map { "\"".escape($_)."\""; } @$a).")";
|
|
388 }
|
|
389 elsif ($a =~ /^\d+$/) {
|
|
390 print $a;
|
|
391 } else {
|
|
392 print "\"".escape($a)."\"";
|
|
393 }
|
|
394 print ", ";
|
|
395 }
|
|
396 print "),\n";
|
|
397 }
|
|
398
|
|
399 print ");\n".
|
|
400 "\n".
|
|
401 "\$classHourTable = array(\n";
|
|
402 for (my $y = 0; $y < $maxhours; $y++) {
|
|
403 my $str = "";
|
|
404 for (my $x = 0; $x < $maxdays; $x++) {
|
|
405 $str .= ", " unless ($str eq "");
|
|
406 $str .= sprintf "%3d", $$taulu{$y}{$x};
|
|
407 }
|
|
408 print " array(".$str."),\n";
|
|
409 }
|
|
410 print ");\n?>\n";
|
|
411 }
|
|
412
|
|
413 elsif ($opt_mode eq "xml") {
|
|
414 print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n".
|
|
415 "<timetable>\n".
|
|
416 " <class>\n".
|
|
417 " <general>".join("", map { "<node>".encode_entities($_)."</node>"; } @{$class{"info"}})."</general>\n".
|
|
418 " <info>".join("", map { "<node>".encode_entities(html_collapse($_, 1))."</node>"; } @{$class{"data"}})."</info>\n".
|
|
419 " <maxdays>$maxdays</maxdays>\n".
|
|
420 " <maxhours>$maxhours</maxhours>\n".
|
|
421 " </class>\n";
|
|
422
|
|
423
|
|
424 print "</timetable>\n";
|
|
425 }
|
|
426
|
|
427
|
|
428
|
|
429 elsif ($opt_mode eq "simple") {
|
|
430 for (my $y = 0; $y < $maxhours; $y++) {
|
|
431 for (my $x = 0; $x < $maxdays; $x++) {
|
|
432 printf "%3d ", $$taulu{$y}{$x};
|
|
433 }
|
|
434 print "\n";
|
|
435 }
|
|
436 }
|
|
437
|
|
438 close (STDOUT);
|