Mercurial > hg > lukkari
annotate parsedata.pl @ 25:a076d8d22422
Fix string parsing in the HTML parser.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Sun, 27 Mar 2011 01:14:40 +0200 |
parents | a52a0bdb5ea1 |
children | eb82a7ddf5bd |
rev | line source |
---|---|
1 | 1 #!/usr/bin/perl -w |
2 # | |
3 # Fetch and parse HTML format class timetable into more sane formats | |
4 # (C) Copyright 2010-2010 Matti Hämäläinen <ccr@tnsp.org> | |
5 # | |
6 use strict; | |
7 use Data::Dumper; | |
8 use HTML::Entities; | |
9 | |
10 | |
# URL-encode a string for use in a query component.
#
# Every character other than ASCII letters, digits, underscore and space
# is replaced by a percent escape; spaces are then turned into "+".
#
# Parameters: the string to encode.
# Returns:    the encoded copy (the argument itself is not modified).
sub urlencode($)
{
    my $value = $_[0];
    # Percent escapes must always be two hex digits ("%0A", not "%A"),
    # hence the zero-padded %02X conversion.
    $value =~ s/([^a-zA-Z_0-9 ])/sprintf("%%%02X", unpack("C", $1))/eg;
    $value =~ tr/ /+/;
    return $value;
}
18 | |
19 | |
# Strip leading and trailing whitespace from a string.
#
# Parameters: the string to trim; may be undefined.
# Returns:    the trimmed copy, or undef if the argument was undefined.
sub str_trim($)
{
    my ($text) = @_;

    # Undefined input is handed back untouched
    return $text unless defined($text);

    $text =~ s/^\s*//;
    $text =~ s/\s*$//;
    return $text;
}
29 | |
30 | |
# Remove and return the first token of a token queue, unmodified.
#
# Parameters: reference to the token array (modified in place).
# Returns:    the removed token, or undef when the queue is empty.
sub pop_token_a($)
{
    my ($queue) = @_;
    my $head = shift(@{$queue});
    return $head;
}
36 | |
37 | |
# Remove and return the first token of a token queue, with surrounding
# whitespace trimmed.
#
# Parameters: reference to the token array (modified in place).
# Returns:    the trimmed token, or undef when the queue is empty.
sub pop_token($)
{
    my ($queue) = @_;
    my $raw = pop_token_a($queue);
    return str_trim($raw);
}
42 | |
43 | |
# Parse a double-quoted string that may span several tokens.
#
# The first token (whitespace-trimmed) must begin with a double quote.
# Tokens are then consumed from the queue and concatenated until one is
# found that ends with a double quote (ignoring trailing whitespace).
#
# Parameters: reference to the token array (modified in place).
# Returns:    the string contents without the enclosing quotes, or undef
#             if the queue is empty, the first token does not start with
#             a quote, or no closing quote is found.
sub parse_html_str($)
{
    my $tokens = $_[0];
    my $token = pop_token($tokens);
    my $str = "";

    # Nothing left to parse
    return undef unless defined($token);

    $token =~ s/^\s*//;
    return undef unless (substr($token, 0, 1) eq '"');
    $token = substr($token, 1);

    while (defined($token)) {
        # Work on a copy, so that the token is appended unmodified
        # when it does not terminate the string.
        my $tmp = $token;
        $tmp =~ s/\s*$//;
        if (length($tmp) > 0 && substr($tmp, -1) eq '"') {
            $str .= substr($tmp, 0, -1);
            return $str;
        } else {
            $str .= $token;
        }
        $token = shift(@$tokens);
    }
    return undef;
}
66 | |
67 | |
sub parse_html_tree($$);

# Recursively build a node tree from a queue of HTML tokens.
#
# Tag tokens are lowercased as a whole (attribute text included). Each
# opening tag becomes a hash with "name" and "args" keys, and its children
# are collected under "nodes" by a recursive call. A closing tag ends the
# current level. Tokens starting with "<!" are ignored, and "meta" and
# "img" elements are skipped as they have no closing tag. Any other token
# is entity-decoded, trimmed and stored as a "text" node unless empty.
#
# Parameters: reference to the token array (consumed in place) and the
#             hash reference of the node that receives the children.
# Returns:    the node hash reference given as the second argument.
# Dies:       on a tag-like token that matches none of the known forms.
sub parse_html_tree($$)
{
    my ($tokens, $tree) = @_;

    # Test definedness instead of truth: a text token of "0" is valid
    # content and must not terminate parsing.
    while (defined(my $token = pop_token($tokens))) {
        if ($token =~ /^<[!\/]?[a-zA-Z]+/) {
            $token = lc($token);
            if ($token =~ /^<\!.*>$/) {
                # Ignore comments etc.
            } elsif ($token =~ /^<([a-z]+)(.*)>$/) {
                my ($name, $args) = ($1, $2);

                # Elements without a closing tag produce no node; parsing
                # simply continues on the same level.
                next if ($name eq "meta" || $name eq "img");

                my $tmp = { "name" => $name, "args" => str_trim($args) };
                parse_html_tree($tokens, $tmp);
                push(@{$$tree{"nodes"}}, $tmp);
            } elsif ($token =~ /^<\/([a-z]+)>$/) {
                # Closing tag ends this level, whatever its name is
                return $tree;
            } else {
                die("HORROR TERROR ELITE: $token\n");
            }
        } else {
            $token = str_trim(decode_entities($token));
            push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0;
        }
    }

    return $tree;
}
103 | |
104 | |
# Parse a string of HTML into a node tree.
#
# Line breaks are flattened to spaces, the text is split into tag and
# text tokens, whitespace-only tokens are discarded and the remainder is
# handed to parse_html_tree().
#
# Parameters: the HTML document as a string.
# Returns:    the root node hash reference (with empty "name" and "args"),
#             or undef if the argument was undefined.
sub parse_html($)
{
    my ($html) = @_;
    return undef unless defined($html);

    my $root = { "name" => "", "args" => "" };

    # Carriage returns and newlines both become plain spaces
    $html =~ tr/\r\n/  /;

    # The capturing group makes split() keep the tags as tokens of their own
    my @pieces = split(/(<\/?[a-zA-Z]+.*?>)/, $html);
    my @tokens = grep { $_ !~ /^\s*$/ } @pieces;

    parse_html_tree(\@tokens, $root);
    return $root;
}
116 | |
sub html_find_node($$$);

# Depth-first search for the first node with a given element name.
#
# Parameters:
#   $node - a node hash reference, an array reference of nodes, or undef
#   $name - element name to look for
#   $args - regular expression the node's "args" must match; the empty
#           string disables the check
# Returns: the first matching node hash reference, or undef if none.
#
# A node whose name matches but whose args do not is not a dead end: its
# children are still searched, so a matching element nested inside a
# same-named element is found.
sub html_find_node($$$)
{
    my ($node, $name, $args) = @_;

    return undef unless defined($node);

    if (ref($node) eq "ARRAY") {
        foreach my $n (@$node) {
            my $tmp = html_find_node($n, $name, $args);
            # Must do it like this, in order not to break the loop
            return $tmp if defined($tmp);
        }
    } elsif (ref($node) eq "HASH" && defined($$node{"name"})) {
        if ($$node{"name"} eq $name) {
            # The empty-string case is handled separately, because an
            # empty pattern has a special meaning in Perl.
            return $node if ($args eq "");
            return $node if (defined($$node{"args"}) && $$node{"args"} =~ /$args/);
        }
        return html_find_node($$node{"nodes"}, $name, $args);
    }
    return undef;
}
148 | |
149 | |
# Find the first node with the given element name, regardless of its
# arguments.
#
# Parameters: node (or array of nodes) to search, element name.
# Returns:    whatever html_find_node() returns for an empty args pattern.
sub fnode($$)
{
    my ($root, $name) = @_;
    return html_find_node($root, $name, "");
}
154 | |
155 | |
# Find the first node with the given element name whose arguments match
# a pattern.
#
# Parameters: node (or array of nodes) to search, element name, args
#             pattern.
# Returns:    whatever html_find_node() returns for these arguments.
sub fnodea($$$)
{
    my ($root, $name, $pattern) = @_;
    return html_find_node($root, $name, $pattern);
}
160 | |
161 | |
# Backslash-escape a string for embedding in a quoted string literal of
# the generated PHP output.
#
# Quotes, backslashes and dollar signs are prefixed with a backslash.
# Without the latter two, a backslash in the input could swallow the
# closing quote and a "$" could trigger variable interpolation in a PHP
# double-quoted string.
#
# Parameters: the string to escape.
# Returns:    the escaped copy (the argument itself is not modified).
sub escape($)
{
    my $s = $_[0];
    $s =~ s/([\\'"\$])/\\$1/g;
    return $s;
}
168 | |
169 | |
sub html_collapse($$);

# Flatten a node and all of its descendants back into a string.
#
# Parameters:
#   $node  - node hash reference
#   $strip - true to emit only the text content; false to also wrap each
#            element in <name>...</name> tags (arguments are not emitted)
# Returns: the collapsed string.
sub html_collapse($$)
{
    my ($node, $strip) = @_;

    # Text nodes carry their content directly
    return "" . $node->{"text"} if ($node->{"name"} eq "text");

    my $inner = "";
    for my $child (@{ $node->{"nodes"} }) {
        $inner .= html_collapse($child, $strip);
    }

    return $inner if ($strip);
    return "<" . $node->{"name"} . ">" . $inner . "</" . $node->{"name"} . ">";
}
189 | |
190 | |
191 ### | |
192 ### Main program | |
193 ### | |
# Output modes accepted on the command line, as a regex alternation
my $modes = "simple|php|xml";
my $opt_mode = "php";       # Selected output mode
my $opt_dump = 0;           # Non-zero: dump the parsed HTML tree and quit
my $opt_filename;           # Input file; the last non-option argument wins
my $opt_outfile;            # Output file; undefined means stdout

# Walk through the command line: anything starting with "-" is an option,
# everything else is taken as the input filename.
while (defined(my $arg = shift)) {
    if (substr($arg, 0, 1) eq "-") {
        if ($arg =~ /^-($modes)$/o) {
            $opt_mode = $1;
        }
        elsif ($arg eq "-dump") {
            $opt_dump = 1;
        }
        elsif ($arg eq "-o") {
            # Check for a missing or empty argument explicitly instead of
            # testing truth, so that a file named "0" is accepted.
            $opt_outfile = shift;
            die("Output filename option -o requires an argument.\n")
                unless (defined($opt_outfile) && $opt_outfile ne "");
        } else {
            die("Invalid option '$arg'.\n");
        }
    } else {
        $opt_filename = $arg;
    }
}
217 | |
4
dd2bce7ec0c8
Removed http fetching feature, it's not really useful here.
Matti Hamalainen <ccr@tnsp.org>
parents:
3
diff
changeset
|
# An input filename is mandatory; print the usage text and quit without one
if (!defined($opt_filename)) {
    die(<<"END_USAGE");
Usage: $0 [options] <filename>

-php Output a PHP include file with data in arrays (default)
-simple Output simple tabled output for easy parsing.
-xml Output XML.

-o <filename> Set output filename. Default is to use stdout.

-dump Dump HTML tree to stdout and quit.

END_USAGE
}
229 | |
230 | |
# Read the whole input file into memory in one go
open(my $fh, '<', $opt_filename) or die("Error opening '$opt_filename': $!\n");
my $data = do {
    # No record separator within this block: <$fh> returns the entire file
    local $/;
    <$fh>;
};
close($fh);

if (!defined($data) || $data eq "") {
    die("No data in input.\n");
}
237 | |
238 | |
# Filter out certain unneeded elements: font and center tags are dropped
# (their content is kept), line breaks are removed in all their spellings
# (<br>, <br/>, <br />) and non-breaking space entities become spaces.
$data =~ s/<font[^>]*>//ig;
$data =~ s/<\/font>//ig;
$data =~ s/<\/?center>//ig;
$data =~ s/<br\s*\/?>//ig;
$data =~ s/&nbsp;/ /ig;

### Get some general information
my $otree = parse_html($data);
if ($opt_dump) {
    # Debugging aid: show the parsed tree from the html element down
    print Dumper(fnode($otree, "html"));
    exit;
}
252 | |
# Collect general class information from the direct children of the body
# element: plain text goes to "info", bold elements go to "data".
my %class = ();
my $body = fnode($otree, "body");
if (defined($body) && defined($body->{"nodes"})) {
    for my $child (@{ $body->{"nodes"} }) {
        my $kind = $child->{"name"};
        if ($kind eq "text") {
            push(@{ $class{"info"} }, $child->{"text"});
        }
        elsif ($kind eq "b") {
            push(@{ $class{"data"} }, $child);
        }
    }
}

# Filter out some more, for easier tree access during table parsing
$data =~ s/<\/?b>//ig;
my $tree = parse_html($data);

# The timetable is the first table element inside the body
my $node = fnode(fnode($tree, "body"), "table");
if (!defined($node)) {
    die("No table element found in document. Perhaps the format has changed? :(\n");
}
271 | |
272 ### Parse through the HTML document node tree to find the data we need | |
my $id = 0;                 # Running ID of the last non-empty entry found
my $q = $$node{"nodes"};    # Child nodes of the table element
my $tunnit = {};            # Entry ID => entry description hash
my $taulu = {};             # Slot index => day index => entry ID (0 = empty)
my $maxdays = 6;
my $maxhours = 0;           # Number of table rows processed so far

# A table without any children has nothing to iterate over
$q = [] unless defined($q);

# Skip zero position this way (can't use foreach here)
for (my $i = 1; $i < scalar(@{$q}); $i++) {
    my $d = $$q[$i]{"nodes"};
    if (defined($d)) {
        foreach my $n (@{$d}) {
            my $l = $$n{"nodes"}[0]{"nodes"};
            # Only cells with these exact spans are treated as entries;
            # the duration is half of the rowspan value.
            if (defined($l) && $$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/) {
                my $tuntia = $1 / 2;
                my $lines = [];
                my $grouped = 0;

                # Gather the text of the first child of every element
                # found two levels below the cell's inner node list.
                foreach my $h (@{$l}) {
                    next unless defined($$h{"nodes"});
                    foreach my $cell (@{$$h{"nodes"}}) {
                        next unless defined($$cell{"nodes"});
                        my $text = $$cell{"nodes"}[0]{"text"};
                        # The first child may be an element instead of
                        # text, in which case there is no "text" key.
                        $text = "" unless defined($text);
                        $text =~ s/\.$//;

                        $grouped = 1 if ($text =~ /vuorov/);

                        push(@$lines, $text);
                    }
                }

                # Cells without any text are empty slots with ID 0
                my $tid;
                if (scalar(@$lines) > 0) {
                    $id++;
                    $tid = $id;
                } else {
                    $tid = 0;
                }

                # The entry goes to the first day column that is still
                # free on the current row.
                my $tpd = 0;
                for (my $x = 0; $x < $maxdays; $x++) {
                    if (!defined($$taulu{$maxhours}{$x})) {
                        $tpd = $x;
                        last;
                    }
                }
                for (my $t = 0; $t < $tuntia; $t++) {
                    $$taulu{$maxhours + $t}{$tpd} = $tid;
                }

                if (scalar(@$lines) > 0) {
                    # Grouped, if there is another class ID in second slot
                    $grouped = 1 if (defined($$lines[1]) && $$lines[1] =~ /^[A-Z]\d{6}$/);
                    $$tunnit{$id} = { "grouped" => $grouped, "day" => $tpd, "start" => $maxhours, "hours" => $tuntia, "data" => $lines };
                }
            }
        }
        $maxhours++;
    }
}
334 | |
335 | |
8
a52a0bdb5ea1
Crop off empty hours (time slots) and days.
Matti Hamalainen <ccr@tnsp.org>
parents:
4
diff
changeset
|
336 ### Go through hour table, find last day and hour of the week, crop |
a52a0bdb5ea1
Crop off empty hours (time slots) and days.
Matti Hamalainen <ccr@tnsp.org>
parents:
4
diff
changeset
|
# Drop trailing rows from the end of the table for as long as the last
# row holds no non-zero entry on any day.
while ($maxhours > 0) {
    my $y = $maxhours - 1;
    last if grep { defined($$taulu{$y}{$_}) && $$taulu{$y}{$_} != 0 } (0 .. $maxdays - 1);
    $maxhours--;
}

# Likewise drop trailing day columns for as long as the last column holds
# no non-zero entry on any of the remaining rows.
while ($maxdays > 0) {
    my $x = $maxdays - 1;
    last if grep { defined($$taulu{$_}{$x}) && $$taulu{$_}{$x} != 0 } (0 .. $maxhours - 1);
    $maxdays--;
}
a52a0bdb5ea1
Crop off empty hours (time slots) and days.
Matti Hamalainen <ccr@tnsp.org>
parents:
4
diff
changeset
|
352 |
a52a0bdb5ea1
Crop off empty hours (time slots) and days.
Matti Hamalainen <ccr@tnsp.org>
parents:
4
diff
changeset
|
353 |
1 | 354 ### Open output file, if specified |
if (defined($opt_outfile)) {
    open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile': $!\n");
}

# Read one slot of the hour table; slots that were never assigned count
# as empty (0).
my $slot = sub {
    my ($hour, $day) = @_;
    my $value = $$taulu{$hour}{$day};
    return defined($value) ? $value : 0;
};

# The document may have neither text nor bold elements in its body, in
# which case the corresponding list does not exist at all.
my @class_info = @{ $class{"info"} || [] };
my @class_data = @{ $class{"data"} || [] };


### Output data in desired format
if ($opt_mode eq "php") {
    # Use the full opening tag, as the short form "<?" only works when
    # short_open_tag is enabled in the PHP configuration.
    print "<?php\n".
        "\$classInfo = array(\n".
        " \"general\" => array(".join(", ", map { "\"".escape($_)."\""; } @class_info)."),\n".
        " \"info\" => array(".join(", ", map { "\"".escape(html_collapse($_, 1))."\""; } @class_data)."),\n".
        " \"info_tags\" => array(".join(", ", map { "\"".escape(html_collapse($_, 0))."\""; } @class_data)."),\n".
        " \"maxdays\" => $maxdays,\n".
        " \"maxhours\" => $maxhours,\n".
        ");\n\n";

    print "\$classDefs = array(\n";
    foreach my $cid (sort { $a <=> $b } keys %{$tunnit}) {
        print " $cid => array(";
        # Sorted keys keep the generated file identical between runs
        foreach my $key (sort keys %{$$tunnit{$cid}}) {
            my $value = $$tunnit{$cid}{$key};
            print "\"$key\" => ";
            if (ref($value) eq "ARRAY") {
                print "array(".join(", ", map { "\"".escape($_)."\""; } @$value).")";
            }
            elsif ($value =~ /^\d+$/) {
                # Plain integers are emitted unquoted
                print $value;
            } else {
                print "\"".escape($value)."\"";
            }
            print ", ";
        }
        print "),\n";
    }

    print ");\n".
        "\n".
        "\$classHourTable = array(\n";
    for (my $y = 0; $y < $maxhours; $y++) {
        my $str = join(", ", map { sprintf("%3d", $slot->($y, $_)) } (0 .. $maxdays - 1));
        print " array(".$str."),\n";
    }
    print ");\n?>\n";
}

elsif ($opt_mode eq "xml") {
    # Only the general class information is written in this mode
    print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n".
        "<timetable>\n".
        " <class>\n".
        " <general>".join("", map { "<node>".encode_entities($_)."</node>"; } @class_info)."</general>\n".
        " <info>".join("", map { "<node>".encode_entities(html_collapse($_, 1))."</node>"; } @class_data)."</info>\n".
        " <maxdays>$maxdays</maxdays>\n".
        " <maxhours>$maxhours</maxhours>\n".
        " </class>\n";


    print "</timetable>\n";
}

elsif ($opt_mode eq "simple") {
    # One line per time slot, one column of entry IDs per day
    for (my $y = 0; $y < $maxhours; $y++) {
        for (my $x = 0; $x < $maxdays; $x++) {
            printf "%3d ", $slot->($y, $x);
        }
        print "\n";
    }
}

# Buffered write errors (e.g. a full disk) only surface on close
close(STDOUT) or die("Error closing output: $!\n");