comparison parsedata.pl @ 162:70f432e3d1dc

Some remodeling here and there.
author Matti Hamalainen <ccr@tnsp.org>
date Thu, 20 Aug 2015 06:42:32 +0300
parents 62687fee6f1c
children 3790db4eb29b
comparison
equal deleted inserted replaced
161:62687fee6f1c 162:70f432e3d1dc
4 # (C) Copyright 2010-2012 Matti Hämäläinen <ccr@tnsp.org> 4 # (C) Copyright 2010-2012 Matti Hämäläinen <ccr@tnsp.org>
5 # 5 #
6 use strict; 6 use strict;
7 use Data::Dumper; 7 use Data::Dumper;
8 use HTML::Entities; 8 use HTML::Entities;
9
10 ###
11 ### Some globals
12 ###
13 my $modes = "php|xml";
14 my $opt_mode = "php";
15 my $opt_dump = 0;
16 my $opt_filename;
17 my $opt_outfile;
18
19 my $cid = 0;
20 my $hourTimes = [];
21 my $hourDefs = {};
22 my $hourTable = {};
23 my $hourFillTable = {};
24 my $maxDays = 0;
25 my $firstHour = 0;
26 my $lastHour = 0;
27 my $totalHours = 0;
9 28
10 29
11 sub urlencode($) 30 sub urlencode($)
12 { 31 {
13 my $value = $_[0]; 32 my $value = $_[0];
187 206
188 return $str; 207 return $str;
189 } 208 }
190 209
191 210
192 ### 211 sub parse_hour_data($$)
193 ### Main program
194 ###
195 my $modes = "php|xml";
196 my $opt_mode = "php";
197 my $opt_dump = 0;
198 my $opt_filename;
199 my $opt_outfile;
200
201 while (defined(my $arg = shift)) {
202 if (substr($arg, 0, 1) eq "-") {
203 if ($arg =~ /^-($modes)$/o) {
204 $opt_mode = $1;
205 }
206 elsif ($arg eq "-dump") {
207 $opt_dump = 1;
208 }
209 elsif ($arg eq "-o") {
210 $opt_outfile = shift or die("Output filename option -o requires an argument.\n");
211 } else {
212 die("Invalid option '$arg'.\n");
213 }
214 } else {
215 $opt_filename = $arg;
216 }
217 }
218
219 die("Usage: $0 [options] <filename>
220
221 -php Output a PHP include file with data in arrays (default)
222 -xml Output a simple XML file.
223
224 -o <filename> Set output filename. Default is to use stdout.
225
226 -dump Dump HTML tree to stdout and quit.
227
228 ") unless defined($opt_filename);
229
230
231 my $data;
232 open(my $fh, '<:encoding(iso-8859-1)', $opt_filename) or die("Error opening '$opt_filename': $!\n");
233 $data = do { local $/; <$fh> };
234 close($fh);
235
236 die("No data in input.\n") unless (defined($data) && $data ne "");
237
238
239 # Filter out certain unneeded elements
240 $data =~ s/<font[^>]*>//ig;
241 $data =~ s/<\/font>//ig;
242 $data =~ s/<\/?center>//ig;
243 $data =~ s/<br>//ig;
244 $data =~ s/&nbsp;/ /ig;
245
246 ### Get some general information
247 my $otree = parse_html($data);
248 if ($opt_dump) {
249 print Dumper(fnode($otree, "html"));
250 exit;
251 }
252
253 my %class = ();
254 my $body = fnode($otree, "body");
255 if (defined($body) && defined($$body{"nodes"})) {
256 foreach my $n (@{$$body{"nodes"}}) {
257 if ($$n{"name"} eq "text") {
258 push(@{$class{"info"}}, $$n{"text"});
259 }
260 elsif ($$n{"name"} eq "b") {
261 push(@{$class{"data"}}, $n);
262 }
263 }
264 }
265
266 # Filter out some more, for easier tree access during table parsing
267 $data =~ s/<\/?b>//ig;
268 my $tree = parse_html($data);
269 my $node = fnode(fnode($tree, "body"), "table");
270 die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node);
271
272 ### Parse through the HTML document node tree to find the data we need
273 my $cid = 0;
274 my $q = $$node{"nodes"};
275 my $hourTimes = [];
276 my $hourDefs = {};
277 my $hourTable = {};
278 my $hourFillTable = {};
279 my $maxDays = 0;
280 my $firstHour = 0;
281 my $lastHour = 0;
282 my $totalHours = 0;
283
284
285 sub parseHourData($$)
286 { 212 {
287 my ($l, $rowspan) = @_; 213 my ($l, $rowspan) = @_;
288 my $chours = $rowspan / 2; # The table is actually in half cells 214 my $chours = $rowspan / 2; # The table is actually in half cells
289 my $cdata = []; 215 my $cdata = [];
290 my $cturns = 0; 216 my $cturns = 0;
357 push(@{$$hourTable{$cday}}, $tid); 283 push(@{$$hourTable{$cday}}, $tid);
358 $totalHours += $chours; 284 $totalHours += $chours;
359 } 285 }
360 } 286 }
361 287
362 sub parseHour($) 288 sub parse_hour_header($)
363 { 289 {
364 if ($_[0] =~ /(\d+):(\d+)/) 290 if ($_[0] =~ /(\d+):(\d+)/)
365 { 291 {
366 return ((int($1) * 60 + int($2)) * 60); 292 return ((int($1) * 60 + int($2)) * 60);
367 } 293 }
427 } 353 }
428 354
429 return join(", ", @out) if ($opt_mode eq "php"); 355 return join(", ", @out) if ($opt_mode eq "php");
430 return join("", @out); 356 return join("", @out);
431 } 357 }
358
359
360 ###
361 ### Main program
362 ###
363
364 while (defined(my $arg = shift)) {
365 if (substr($arg, 0, 1) eq "-") {
366 if ($arg =~ /^-($modes)$/o) {
367 $opt_mode = $1;
368 }
369 elsif ($arg eq "-dump") {
370 $opt_dump = 1;
371 }
372 elsif ($arg eq "-o") {
373 $opt_outfile = shift or die("Output filename option -o requires an argument.\n");
374 } else {
375 die("Invalid option '$arg'.\n");
376 }
377 } else {
378 $opt_filename = $arg;
379 }
380 }
381
382 die("Usage: $0 [options] <filename>
383
384 -php Output a PHP include file with data in arrays (default)
385 -xml Output a simple XML file.
386
387 -o <filename> Set output filename. Default is to use stdout.
388
389 -dump Dump HTML tree to stdout and quit.
390
391 ") unless defined($opt_filename);
392
393
394 my $data;
395 open(my $fh, '<:encoding(iso-8859-1)', $opt_filename) or die("Error opening '$opt_filename': $!\n");
396 $data = do { local $/; <$fh> };
397 close($fh);
398
399 die("No data in input.\n") unless (defined($data) && $data ne "");
400
401
402 # Filter out certain unneeded elements
403 $data =~ s/<font[^>]*>//ig;
404 $data =~ s/<\/font>//ig;
405 $data =~ s/<\/?center>//ig;
406 $data =~ s/<br>//ig;
407 $data =~ s/&nbsp;/ /ig;
408
409 ### Get some general information
410 my $otree = parse_html($data);
411 if ($opt_dump) {
412 print Dumper(fnode($otree, "html"));
413 exit;
414 }
415
416 my %class = ();
417 my $body = fnode($otree, "body");
418 if (defined($body) && defined($$body{"nodes"})) {
419 foreach my $n (@{$$body{"nodes"}}) {
420 if ($$n{"name"} eq "text") {
421 push(@{$class{"info"}}, $$n{"text"});
422 }
423 elsif ($$n{"name"} eq "b") {
424 push(@{$class{"data"}}, $n);
425 }
426 }
427 }
428
429 # Filter out some more, for easier tree access during table parsing
430 $data =~ s/<\/?b>//ig;
431 my $tree = parse_html($data);
432 my $node = fnode(fnode($tree, "body"), "table");
433 die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node);
434
435
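436 ### Parse through the HTML document node tree to find the data we need
### NOTE(review): the old revision declared `my $q = $$node{"nodes"};` at this point (old line 274),
### but no such declaration appears among the inserted lines of this revision, while the loop below
### still reads @{$q}. Confirm $q is declared at file scope somewhere; otherwise `use strict` will
### reject the script with a "Global symbol "$q" requires explicit package name" error.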
432 437
433 # Skip zero position this way (can't use foreach here) 438 # Skip zero position this way (can't use foreach here)
434 for (my $i = 1; $i < scalar(@{$q}); $i++) 439 for (my $i = 1; $i < scalar(@{$q}); $i++)
435 { 440 {
436 my $d = $$q[$i]{"nodes"}; 441 my $d = $$q[$i]{"nodes"};
441 my $l = $$n{"nodes"}[0]{"nodes"}; 446 my $l = $$n{"nodes"}[0]{"nodes"};
442 if (defined($l)) 447 if (defined($l))
443 { 448 {
444 if ($$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/) 449 if ($$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/)
445 { 450 {
446 parseHourData($l, $1); 451 parse_hour_data($l, $1);
447 } 452 }
448 elsif ($$n{"args"} =~ /rowspan=2\s+align/) 453 elsif ($$n{"args"} =~ /rowspan=2\s+align/)
449 { 454 {
450 my $qstart = parseHour($$l[0]{"nodes"}[0]{"nodes"}[0]{"text"}); 455 my $qstart = parse_hour_header($$l[0]{"nodes"}[0]{"nodes"}[0]{"text"});
451 my $qend = parseHour($$l[1]{"nodes"}[0]{"nodes"}[0]{"text"}); 456 my $qend = parse_hour_header($$l[1]{"nodes"}[0]{"nodes"}[0]{"text"});
452 if (defined($qstart) && defined($qend)) 457 if (defined($qstart) && defined($qend))
453 { 458 {
454 push(@$hourTimes, {"start" => $qstart, "end" => $qend}); 459 push(@$hourTimes, {"start" => $qstart, "end" => $qend});
455 } 460 }
456 } 461 }