# HG changeset patch # User Matti Hamalainen # Date 1440042152 -10800 # Node ID 70f432e3d1dc72acf72f52408951a956d3d6e67b # Parent 62687fee6f1cf2e26b0baa792a939d29d0021e1f Some remodeling here and there. diff -r 62687fee6f1c -r 70f432e3d1dc parsedata.pl --- a/parsedata.pl Thu Aug 20 06:27:40 2015 +0300 +++ b/parsedata.pl Thu Aug 20 06:42:32 2015 +0300 @@ -7,6 +7,25 @@ use Data::Dumper; use HTML::Entities; +### +### Some globals +### +my $modes = "php|xml"; +my $opt_mode = "php"; +my $opt_dump = 0; +my $opt_filename; +my $opt_outfile; + +my $cid = 0; +my $hourTimes = []; +my $hourDefs = {}; +my $hourTable = {}; +my $hourFillTable = {}; +my $maxDays = 0; +my $firstHour = 0; +my $lastHour = 0; +my $totalHours = 0; + sub urlencode($) { @@ -189,100 +208,7 @@ } -### -### Main program -### -my $modes = "php|xml"; -my $opt_mode = "php"; -my $opt_dump = 0; -my $opt_filename; -my $opt_outfile; - -while (defined(my $arg = shift)) { - if (substr($arg, 0, 1) eq "-") { - if ($arg =~ /^-($modes)$/o) { - $opt_mode = $1; - } - elsif ($arg eq "-dump") { - $opt_dump = 1; - } - elsif ($arg eq "-o") { - $opt_outfile = shift or die("Output filename option -o requires an argument.\n"); - } else { - die("Invalid option '$arg'.\n"); - } - } else { - $opt_filename = $arg; - } -} - -die("Usage: $0 [options] - - -php Output a PHP include file with data in arrays (default) - -xml Output a simple XML file. - - -o Set output filename. Default is to use stdout. - - -dump Dump HTML tree to stdout and quit. - -") unless defined($opt_filename); - - -my $data; -open(my $fh, '<:encoding(iso-8859-1)', $opt_filename) or die("Error opening '$opt_filename': $!\n"); -$data = do { local $/; <$fh> }; -close($fh); - -die("No data in input.\n") unless (defined($data) && $data ne ""); - - -# Filter out certain unneeded elements -$data =~ s/]*>//ig; -$data =~ s/<\/font>//ig; -$data =~ s/<\/?center>//ig; -$data =~ s/
//ig; -$data =~ s/ / /ig; - -### Get some general information -my $otree = parse_html($data); -if ($opt_dump) { - print Dumper(fnode($otree, "html")); - exit; -} - -my %class = (); -my $body = fnode($otree, "body"); -if (defined($body) && defined($$body{"nodes"})) { - foreach my $n (@{$$body{"nodes"}}) { - if ($$n{"name"} eq "text") { - push(@{$class{"info"}}, $$n{"text"}); - } - elsif ($$n{"name"} eq "b") { - push(@{$class{"data"}}, $n); - } - } -} - -# Filter out some more, for easier tree access during table parsing -$data =~ s/<\/?b>//ig; -my $tree = parse_html($data); -my $node = fnode(fnode($tree, "body"), "table"); -die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node); - -### Parse through the HTML document node tree to find the data we need -my $cid = 0; -my $q = $$node{"nodes"}; -my $hourTimes = []; -my $hourDefs = {}; -my $hourTable = {}; -my $hourFillTable = {}; -my $maxDays = 0; -my $firstHour = 0; -my $lastHour = 0; -my $totalHours = 0; - - -sub parseHourData($$) +sub parse_hour_data($$) { my ($l, $rowspan) = @_; my $chours = $rowspan / 2; # The table is actually in half cells @@ -359,7 +285,7 @@ } } -sub parseHour($) +sub parse_hour_header($) { if ($_[0] =~ /(\d+):(\d+)/) { @@ -430,6 +356,85 @@ return join("", @out); } + +### +### Main program +### + +while (defined(my $arg = shift)) { + if (substr($arg, 0, 1) eq "-") { + if ($arg =~ /^-($modes)$/o) { + $opt_mode = $1; + } + elsif ($arg eq "-dump") { + $opt_dump = 1; + } + elsif ($arg eq "-o") { + $opt_outfile = shift or die("Output filename option -o requires an argument.\n"); + } else { + die("Invalid option '$arg'.\n"); + } + } else { + $opt_filename = $arg; + } +} + +die("Usage: $0 [options] + + -php Output a PHP include file with data in arrays (default) + -xml Output a simple XML file. + + -o Set output filename. Default is to use stdout. + + -dump Dump HTML tree to stdout and quit. + +") unless defined($opt_filename); + + +my $data; +open(my $fh, '<:encoding(iso-8859-1)', $opt_filename) or die("Error opening '$opt_filename': $!\n"); +$data = do { local $/; <$fh> }; +close($fh); + +die("No data in input.\n") unless (defined($data) && $data ne ""); + + +# Filter out certain unneeded elements +$data =~ s/]*>//ig; +$data =~ s/<\/font>//ig; +$data =~ s/<\/?center>//ig; +$data =~ s/
//ig; +$data =~ s/ / /ig; + +### Get some general information +my $otree = parse_html($data); +if ($opt_dump) { + print Dumper(fnode($otree, "html")); + exit; +} + +my %class = (); +my $body = fnode($otree, "body"); +if (defined($body) && defined($$body{"nodes"})) { + foreach my $n (@{$$body{"nodes"}}) { + if ($$n{"name"} eq "text") { + push(@{$class{"info"}}, $$n{"text"}); + } + elsif ($$n{"name"} eq "b") { + push(@{$class{"data"}}, $n); + } + } +} + +# Filter out some more, for easier tree access during table parsing +$data =~ s/<\/?b>//ig; +my $tree = parse_html($data); +my $node = fnode(fnode($tree, "body"), "table"); +die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node); + + +### Parse through the HTML document node tree to find the data we need + # Skip zero position this way (can't use foreach here) for (my $i = 1; $i < scalar(@{$q}); $i++) { @@ -443,12 +448,12 @@ { if ($$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/) { - parseHourData($l, $1); + parse_hour_data($l, $1); } elsif ($$n{"args"} =~ /rowspan=2\s+align/) { - my $qstart = parseHour($$l[0]{"nodes"}[0]{"nodes"}[0]{"text"}); - my $qend = parseHour($$l[1]{"nodes"}[0]{"nodes"}[0]{"text"}); + my $qstart = parse_hour_header($$l[0]{"nodes"}[0]{"nodes"}[0]{"text"}); + my $qend = parse_hour_header($$l[1]{"nodes"}[0]{"nodes"}[0]{"text"}); if (defined($qstart) && defined($qend)) { push(@$hourTimes, {"start" => $qstart, "end" => $qend});