comparison fetchdata.pl @ 1:21fde93375e9

Add beta code.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 11 Jan 2011 20:43:12 +0200
parents
children
comparison
equal deleted inserted replaced
0:02f0f79f98b6 1:21fde93375e9
1 #!/usr/bin/perl -w
2 #
3 # Fetch and parse HTML format class timetable into more sane formats
4 # (C) Copyright 2010-2010 Matti Hämäläinen <ccr@tnsp.org>
5 #
6 use strict;
7 use Data::Dumper;
8 use LWP::UserAgent;
9 use HTML::Entities;
10
11 my $userAgent = "Lukkari/0.7";
12
13
14 sub urlencode($)
15 {
16 my $value = $_[0];
17 $value =~ s/([^a-zA-Z_0-9 ])/"%" . uc(sprintf "%lx" , unpack("C", $1))/eg;
18 $value =~ tr/ /+/;
19 return $value;
20 }
21
22
23 sub str_trim($)
24 {
25 my $str = $_[0];
26 if (defined($str)) {
27 $str =~ s/^\s*//;
28 $str =~ s/\s*$//;
29 }
30 return $str;
31 }
32
33
34 sub pop_token_a($)
35 {
36 my $tokens = $_[0];
37 return shift(@$tokens);
38 }
39
40
41 sub pop_token($)
42 {
43 return str_trim(pop_token_a($_[0]));
44 }
45
46
47 sub parse_html_str($)
48 {
49 my $tokens = $_[0];
50 my $token = pop_token($tokens);
51 my $str = "";
52 $token =~ s/^\s*//;
53
54 return undef unless (substr($token, 0, 1) eq '"');
55 $token = substr($token, 1);
56
57 while (defined($token)) {
58 if (substr($token, -1) eq '"') {
59 $str .= substr($token, 0, -1);
60 return $str;
61 } else {
62 $str .= $token;
63 }
64 $token = shift(@$tokens);
65 }
66 return undef;
67 }
68
69
70 sub parse_html_tree($$);
71
72 sub parse_html_tree($$)
73 {
74 my ($tokens, $tree) = @_;
75
76 while (my $token = pop_token($tokens)) {
77 if ($token =~ /^<[!\/]?[a-zA-Z]+/) {
78 $token = lc($token);
79 if ($token =~ /^<\!.*>$/) {
80 # Ignore comments etc.
81 } elsif ($token =~ /^<([a-z]+)(.*)>$/) {
82 my ($name, $args) = ($1, $2);
83 if ($name eq "meta" || $name eq "img") {
84 my $tmp = {};
85 parse_html_tree($tokens, $tree);
86 return $tree;
87 } else {
88 my $tmp = { "name" => $name, "args" => str_trim($args) };
89 parse_html_tree($tokens, $tmp);
90 push(@{$$tree{"nodes"}}, $tmp);
91 }
92 } elsif ($token =~ /^<\/([a-z]+)>$/) {
93 return $tree;
94 } else {
95 die("HORROR TERROR ELITE: $token\n");
96 }
97 } else {
98 $token = str_trim(decode_entities($token));
99 push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0;
100 }
101 }
102
103 return $tree;
104 }
105
106
107 sub parse_html($)
108 {
109 return undef unless defined($_[0]);
110 my $str = $_[0];
111 my $res = { "name" => "", "args" => "" };
112 $str =~ tr/\r/ /;
113 $str =~ tr/\n/ /;
114 my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str);
115 parse_html_tree(\@tokens, $res);
116 return $res;
117 }
118
119
120 sub http_fetch($)
121 {
122 my $agent = LWP::UserAgent->new;
123 $agent->agent($userAgent);
124 $agent->timeout(10);
125
126 my $req = HTTP::Request->new(GET => $_[0]);
127 $req->user_agent($userAgent);
128 my $res = $agent->request($req);
129
130 if ($res->is_success) {
131 return $res->content;
132 } else {
133 print STDERR "HTTP request failed: [".$res->code."] ".$res->message."\n";
134 return undef;
135 }
136 }
137
138
139 sub html_find_node($$$);
140
141 sub html_find_node($$$)
142 {
143 my ($node, $name, $args) = @_;
144
145 if (defined($node)) {
146 if (ref($node) eq "ARRAY") {
147 foreach my $n (@$node) {
148 my $tmp = html_find_node($n, $name, $args);
149 # Must do it like this, in order not to break the loop
150 return $tmp if defined($tmp);
151 }
152 } elsif (ref($node) eq "HASH") {
153 if (defined($$node{"name"})) {
154 if ($$node{"name"} eq $name) {
155 if ($args ne "") {
156 if (defined($$node{"args"}) && $$node{"args"} =~ /$args/) {
157 } else {
158 return undef;
159 }
160 }
161 return $node;
162 } else {
163 return html_find_node($$node{"nodes"}, $name, $args);
164 }
165 }
166 }
167 }
168 return undef;
169 }
170
171
172 sub fnode($$)
173 {
174 return html_find_node($_[0], $_[1], "");
175 }
176
177
178 sub fnodea($$$)
179 {
180 return html_find_node($_[0], $_[1], $_[2]);
181 }
182
183
184 sub escape($)
185 {
186 my $s = $_[0];
187 $s =~ s/(['"])/\\$1/g;
188 return $s;
189 }
190
191
192 sub html_collapse($$);
193
194 sub html_collapse($$)
195 {
196 my ($node, $strip) = @_;
197 my $str = "";
198
199 if ($$node{"name"} eq "text") {
200 $str .= $$node{"text"};
201 } else {
202 $str .= "<".$$node{"name"}.">" unless ($strip);
203 foreach my $n (@{$$node{"nodes"}}) {
204 $str .= html_collapse($n, $strip);
205 }
206 $str .= "</".$$node{"name"}.">" unless ($strip);
207 }
208
209 return $str;
210 }
211
212
213 ###
214 ### Main program
215 ###
216 my $modes = "simple|php|xml";
217 my $opt_mode = "php";
218 my $opt_dump = 0;
219 my $opt_filename;
220 my $opt_outfile;
221
222 while (defined(my $arg = shift)) {
223 if (substr($arg, 0, 1) eq "-") {
224 if ($arg =~ /^-($modes)$/o) {
225 $opt_mode = $1;
226 }
227 elsif ($arg eq "-dump") {
228 $opt_dump = 1;
229 }
230 elsif ($arg eq "-o") {
231 $opt_outfile = shift or die("Output filename option -o requires an argument.\n");
232 } else {
233 die("Invalid option '$arg'.\n");
234 }
235 } else {
236 $opt_filename = $arg;
237 }
238 }
239
240 die("Usage: $0 [options] <filename|URI>
241
242 -php Output a PHP include file with data in arrays (default)
243 -simple Output simple tabled output for easy parsing.
244 -xml Output XML.
245
246 -o <filename> Set output filename. Default is to use stdout.
247
248 -dump Dump HTML tree to stdout and quit.
249
250 ") unless defined($opt_filename);
251
252
253 my $data;
254 if ($opt_filename =~ /^(http|https):/) {
255 $data = http_fetch($opt_filename) or die("Could not fetch: $opt_filename\n");
256 } else {
257 open(my $fh, '<', $opt_filename) or die("Error opening '$opt_filename': $!\n");
258 $data = do { local $/; <$fh> };
259 close($fh);
260 }
261
262 die("No data in input.\n") unless (defined($data) && $data ne "");
263
264
265 # Filter out certain unneeded elements
266 $data =~ s/<font[^>]*>//ig;
267 $data =~ s/<\/font>//ig;
268 $data =~ s/<\/?center>//ig;
269 $data =~ s/<br>//ig;
270 $data =~ s/&nbsp;/ /ig;
271
272 ### Get some general information
273 my $otree = parse_html($data);
274 if ($opt_dump) {
275 print Dumper(fnode($otree, "html"));
276 exit;
277 }
278
279 my %class = ();
280 my $body = fnode($otree, "body");
281 if (defined($body) && defined($$body{"nodes"})) {
282 foreach my $n (@{$$body{"nodes"}}) {
283 if ($$n{"name"} eq "text") {
284 push(@{$class{"info"}}, $$n{"text"});
285 }
286 elsif ($$n{"name"} eq "b") {
287 push(@{$class{"data"}}, $n);
288 }
289 }
290 }
291
292 # Filter out some more, for easier tree access during table parsing
293 $data =~ s/<\/?b>//ig;
294 my $tree = parse_html($data);
295 my $node = fnode(fnode($tree, "body"), "table");
296 die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node);
297
298
299 ### Parse through the HTML document node tree to find the data we need
300 my $id = 0;
301 my $q = $$node{"nodes"};
302 my $tunnit = {};
303 my $taulu = {};
304 my $maxdays = 6;
305 my $maxhours = 0;
306
307 # Skip zero this way
308 for (my $i = 1; $i < scalar(@{$q}); $i++) {
309 my $d = $$q[$i]{"nodes"};
310 if (defined($d)) {
311 foreach my $n (@{$d}) {
312 my $l = $$n{"nodes"}[0]{"nodes"};
313 if (defined($l) && $$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/) {
314 my $tuntia = $1 / 2;
315 my $data = [];
316 my $grouped = 0;
317 foreach my $h (@{$l}) {
318 if (defined($$h{"nodes"})) {
319 foreach my $b (@{$$h{"nodes"}}) {
320 if (defined($$b{"nodes"})) {
321 my $text = $$b{"nodes"}[0]{"text"};
322 $text =~ s/\.$//;
323
324 $grouped = 1 if ($text =~ /vuorov/);
325
326 push(@$data, $text);
327 }
328 }
329 }
330 }
331
332 my $tid;
333 if (scalar(@$data) > 0) {
334 $id++;
335 $tid = $id;
336 } else {
337 $tid = 0;
338 }
339
340 my $tpd = 0;
341 for (my $x = 0; $x < $maxdays; $x++) {
342 if (!defined($$taulu{$maxhours}{$x})) {
343 $tpd = $x;
344 last;
345 }
346 }
347 for (my $t = 0; $t < $tuntia; $t++) {
348 $$taulu{$maxhours + $t}{$tpd} = $tid;
349 }
350
351 if (scalar(@$data) > 0) {
352 # Grouped, if there is another class ID in second slot
353 $grouped = 1 if ($$data[1] =~ /^[A-Z]\d{6}$/);
354 $$tunnit{$id} = { "grouped" => $grouped, "day" => $tpd, "start" => $maxhours, "hours" => $tuntia, "data" => $data };
355 }
356 }
357 }
358 $maxhours++;
359 }
360 }
361
362
363 ### Open output file, if specified
364 if (defined($opt_outfile)) {
365 open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile'.\n");
366 }
367
368
369 ### Output data in desired format
370 if ($opt_mode eq "php") {
371 print "<?\n".
372 "\$classInfo = array(\n".
373 " \"general\" => array(".join(", ", map { "\"".escape($_)."\""; } @{$class{"info"}})."),\n".
374 " \"info\" => array(".join(", ", map { "\"".escape(html_collapse($_, 1))."\""; } @{$class{"data"}})."),\n".
375 " \"info_tags\" => array(".join(", ", map { "\"".escape(html_collapse($_, 0))."\""; } @{$class{"data"}})."),\n".
376 " \"maxdays\" => $maxdays,\n".
377 " \"maxhours\" => $maxhours,\n".
378 ");\n\n";
379
380 print "\$classDefs = array(\n";
381 foreach my $id (sort { $a <=> $b } keys %{$tunnit}) {
382 print " $id => array(";
383 foreach my $key (keys %{$$tunnit{$id}}) {
384 my $a = $$tunnit{$id}{$key};
385 print "\"$key\" => ";
386 if (ref($a) eq "ARRAY") {
387 print "array(".join(", ", map { "\"".escape($_)."\""; } @$a).")";
388 }
389 elsif ($a =~ /^\d+$/) {
390 print $a;
391 } else {
392 print "\"".escape($a)."\"";
393 }
394 print ", ";
395 }
396 print "),\n";
397 }
398
399 print ");\n".
400 "\n".
401 "\$classHourTable = array(\n";
402 for (my $y = 0; $y < $maxhours; $y++) {
403 my $str = "";
404 for (my $x = 0; $x < $maxdays; $x++) {
405 $str .= ", " unless ($str eq "");
406 $str .= sprintf "%3d", $$taulu{$y}{$x};
407 }
408 print " array(".$str."),\n";
409 }
410 print ");\n?>\n";
411 }
412
413 elsif ($opt_mode eq "xml") {
414 print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n".
415 "<timetable>\n".
416 " <class>\n".
417 " <general>".join("", map { "<node>".encode_entities($_)."</node>"; } @{$class{"info"}})."</general>\n".
418 " <info>".join("", map { "<node>".encode_entities(html_collapse($_, 1))."</node>"; } @{$class{"data"}})."</info>\n".
419 " <maxdays>$maxdays</maxdays>\n".
420 " <maxhours>$maxhours</maxhours>\n".
421 " </class>\n";
422
423
424 print "</timetable>\n";
425 }
426
427
428
429 elsif ($opt_mode eq "simple") {
430 for (my $y = 0; $y < $maxhours; $y++) {
431 for (my $x = 0; $x < $maxdays; $x++) {
432 printf "%3d ", $$taulu{$y}{$x};
433 }
434 print "\n";
435 }
436 }
437
438 close (STDOUT);