Mercurial > hg > lukkari
comparison parsedata.pl @ 162:70f432e3d1dc
Some remodeling here and there.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Thu, 20 Aug 2015 06:42:32 +0300 |
parents | 62687fee6f1c |
children | 3790db4eb29b |
comparison
equal
deleted
inserted
replaced
161:62687fee6f1c | 162:70f432e3d1dc |
---|---|
4 # (C) Copyright 2010-2012 Matti Hämäläinen <ccr@tnsp.org> | 4 # (C) Copyright 2010-2012 Matti Hämäläinen <ccr@tnsp.org> |
5 # | 5 # |
6 use strict; | 6 use strict; |
7 use Data::Dumper; | 7 use Data::Dumper; |
8 use HTML::Entities; | 8 use HTML::Entities; |
9 | |
10 ### | |
11 ### Some globals | |
12 ### | |
13 my $modes = "php|xml"; | |
14 my $opt_mode = "php"; | |
15 my $opt_dump = 0; | |
16 my $opt_filename; | |
17 my $opt_outfile; | |
18 | |
19 my $cid = 0; | |
20 my $hourTimes = []; | |
21 my $hourDefs = {}; | |
22 my $hourTable = {}; | |
23 my $hourFillTable = {}; | |
24 my $maxDays = 0; | |
25 my $firstHour = 0; | |
26 my $lastHour = 0; | |
27 my $totalHours = 0; | |
9 | 28 |
10 | 29 |
11 sub urlencode($) | 30 sub urlencode($) |
12 { | 31 { |
13 my $value = $_[0]; | 32 my $value = $_[0]; |
187 | 206 |
188 return $str; | 207 return $str; |
189 } | 208 } |
190 | 209 |
191 | 210 |
192 ### | 211 sub parse_hour_data($$) |
193 ### Main program | |
194 ### | |
195 my $modes = "php|xml"; | |
196 my $opt_mode = "php"; | |
197 my $opt_dump = 0; | |
198 my $opt_filename; | |
199 my $opt_outfile; | |
200 | |
201 while (defined(my $arg = shift)) { | |
202 if (substr($arg, 0, 1) eq "-") { | |
203 if ($arg =~ /^-($modes)$/o) { | |
204 $opt_mode = $1; | |
205 } | |
206 elsif ($arg eq "-dump") { | |
207 $opt_dump = 1; | |
208 } | |
209 elsif ($arg eq "-o") { | |
210 $opt_outfile = shift or die("Output filename option -o requires an argument.\n"); | |
211 } else { | |
212 die("Invalid option '$arg'.\n"); | |
213 } | |
214 } else { | |
215 $opt_filename = $arg; | |
216 } | |
217 } | |
218 | |
219 die("Usage: $0 [options] <filename> | |
220 | |
221 -php Output a PHP include file with data in arrays (default) | |
222 -xml Output a simple XML file. | |
223 | |
224 -o <filename> Set output filename. Default is to use stdout. | |
225 | |
226 -dump Dump HTML tree to stdout and quit. | |
227 | |
228 ") unless defined($opt_filename); | |
229 | |
230 | |
231 my $data; | |
232 open(my $fh, '<:encoding(iso-8859-1)', $opt_filename) or die("Error opening '$opt_filename': $!\n"); | |
233 $data = do { local $/; <$fh> }; | |
234 close($fh); | |
235 | |
236 die("No data in input.\n") unless (defined($data) && $data ne ""); | |
237 | |
238 | |
239 # Filter out certain unneeded elements | |
240 $data =~ s/<font[^>]*>//ig; | |
241 $data =~ s/<\/font>//ig; | |
242 $data =~ s/<\/?center>//ig; | |
243 $data =~ s/<br>//ig; | |
244 $data =~ s/&nbsp;/ /ig; | |
245 | |
246 ### Get some general information | |
247 my $otree = parse_html($data); | |
248 if ($opt_dump) { | |
249 print Dumper(fnode($otree, "html")); | |
250 exit; | |
251 } | |
252 | |
253 my %class = (); | |
254 my $body = fnode($otree, "body"); | |
255 if (defined($body) && defined($$body{"nodes"})) { | |
256 foreach my $n (@{$$body{"nodes"}}) { | |
257 if ($$n{"name"} eq "text") { | |
258 push(@{$class{"info"}}, $$n{"text"}); | |
259 } | |
260 elsif ($$n{"name"} eq "b") { | |
261 push(@{$class{"data"}}, $n); | |
262 } | |
263 } | |
264 } | |
265 | |
266 # Filter out some more, for easier tree access during table parsing | |
267 $data =~ s/<\/?b>//ig; | |
268 my $tree = parse_html($data); | |
269 my $node = fnode(fnode($tree, "body"), "table"); | |
270 die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node); | |
271 | |
272 ### Parse through the HTML document node tree to find the data we need | |
273 my $cid = 0; | |
274 my $q = $$node{"nodes"}; | |
275 my $hourTimes = []; | |
276 my $hourDefs = {}; | |
277 my $hourTable = {}; | |
278 my $hourFillTable = {}; | |
279 my $maxDays = 0; | |
280 my $firstHour = 0; | |
281 my $lastHour = 0; | |
282 my $totalHours = 0; | |
283 | |
284 | |
285 sub parseHourData($$) | |
286 { | 212 { |
287 my ($l, $rowspan) = @_; | 213 my ($l, $rowspan) = @_; |
288 my $chours = $rowspan / 2; # The table is actually in half cells | 214 my $chours = $rowspan / 2; # The table is actually in half cells |
289 my $cdata = []; | 215 my $cdata = []; |
290 my $cturns = 0; | 216 my $cturns = 0; |
357 push(@{$$hourTable{$cday}}, $tid); | 283 push(@{$$hourTable{$cday}}, $tid); |
358 $totalHours += $chours; | 284 $totalHours += $chours; |
359 } | 285 } |
360 } | 286 } |
361 | 287 |
362 sub parseHour($) | 288 sub parse_hour_header($) |
363 { | 289 { |
364 if ($_[0] =~ /(\d+):(\d+)/) | 290 if ($_[0] =~ /(\d+):(\d+)/) |
365 { | 291 { |
366 return ((int($1) * 60 + int($2)) * 60); | 292 return ((int($1) * 60 + int($2)) * 60); |
367 } | 293 } |
427 } | 353 } |
428 | 354 |
429 return join(", ", @out) if ($opt_mode eq "php"); | 355 return join(", ", @out) if ($opt_mode eq "php"); |
430 return join("", @out); | 356 return join("", @out); |
431 } | 357 } |
358 | |
359 | |
360 ### | |
361 ### Main program | |
362 ### | |
363 | |
364 while (defined(my $arg = shift)) { | |
365 if (substr($arg, 0, 1) eq "-") { | |
366 if ($arg =~ /^-($modes)$/o) { | |
367 $opt_mode = $1; | |
368 } | |
369 elsif ($arg eq "-dump") { | |
370 $opt_dump = 1; | |
371 } | |
372 elsif ($arg eq "-o") { | |
373 $opt_outfile = shift or die("Output filename option -o requires an argument.\n"); | |
374 } else { | |
375 die("Invalid option '$arg'.\n"); | |
376 } | |
377 } else { | |
378 $opt_filename = $arg; | |
379 } | |
380 } | |
381 | |
382 die("Usage: $0 [options] <filename> | |
383 | |
384 -php Output a PHP include file with data in arrays (default) | |
385 -xml Output a simple XML file. | |
386 | |
387 -o <filename> Set output filename. Default is to use stdout. | |
388 | |
389 -dump Dump HTML tree to stdout and quit. | |
390 | |
391 ") unless defined($opt_filename); | |
392 | |
393 | |
394 my $data; | |
395 open(my $fh, '<:encoding(iso-8859-1)', $opt_filename) or die("Error opening '$opt_filename': $!\n"); | |
396 $data = do { local $/; <$fh> }; | |
397 close($fh); | |
398 | |
399 die("No data in input.\n") unless (defined($data) && $data ne ""); | |
400 | |
401 | |
402 # Filter out certain unneeded elements | |
403 $data =~ s/<font[^>]*>//ig; | |
404 $data =~ s/<\/font>//ig; | |
405 $data =~ s/<\/?center>//ig; | |
406 $data =~ s/<br>//ig; | |
407 $data =~ s/&nbsp;/ /ig; | |
408 | |
409 ### Get some general information | |
410 my $otree = parse_html($data); | |
411 if ($opt_dump) { | |
412 print Dumper(fnode($otree, "html")); | |
413 exit; | |
414 } | |
415 | |
416 my %class = (); | |
417 my $body = fnode($otree, "body"); | |
418 if (defined($body) && defined($$body{"nodes"})) { | |
419 foreach my $n (@{$$body{"nodes"}}) { | |
420 if ($$n{"name"} eq "text") { | |
421 push(@{$class{"info"}}, $$n{"text"}); | |
422 } | |
423 elsif ($$n{"name"} eq "b") { | |
424 push(@{$class{"data"}}, $n); | |
425 } | |
426 } | |
427 } | |
428 | |
429 # Filter out some more, for easier tree access during table parsing | |
430 $data =~ s/<\/?b>//ig; | |
431 my $tree = parse_html($data); | |
432 my $node = fnode(fnode($tree, "body"), "table"); | |
433 die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node); | |
434 | |
435 | |
436 ### Parse through the HTML document node tree to find the data we need | |
432 | 437 |
433 # Skip zero position this way (can't use foreach here) | 438 # Skip zero position this way (can't use foreach here) |
434 for (my $i = 1; $i < scalar(@{$q}); $i++) | 439 for (my $i = 1; $i < scalar(@{$q}); $i++) |
435 { | 440 { |
436 my $d = $$q[$i]{"nodes"}; | 441 my $d = $$q[$i]{"nodes"}; |
441 my $l = $$n{"nodes"}[0]{"nodes"}; | 446 my $l = $$n{"nodes"}[0]{"nodes"}; |
442 if (defined($l)) | 447 if (defined($l)) |
443 { | 448 { |
444 if ($$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/) | 449 if ($$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/) |
445 { | 450 { |
446 parseHourData($l, $1); | 451 parse_hour_data($l, $1); |
447 } | 452 } |
448 elsif ($$n{"args"} =~ /rowspan=2\s+align/) | 453 elsif ($$n{"args"} =~ /rowspan=2\s+align/) |
449 { | 454 { |
450 my $qstart = parseHour($$l[0]{"nodes"}[0]{"nodes"}[0]{"text"}); | 455 my $qstart = parse_hour_header($$l[0]{"nodes"}[0]{"nodes"}[0]{"text"}); |
451 my $qend = parseHour($$l[1]{"nodes"}[0]{"nodes"}[0]{"text"}); | 456 my $qend = parse_hour_header($$l[1]{"nodes"}[0]{"nodes"}[0]{"text"}); |
452 if (defined($qstart) && defined($qend)) | 457 if (defined($qstart) && defined($qend)) |
453 { | 458 { |
454 push(@$hourTimes, {"start" => $qstart, "end" => $qend}); | 459 push(@$hourTimes, {"start" => $qstart, "end" => $qend}); |
455 } | 460 } |
456 } | 461 } |