Mercurial > hg > lukkari
view parsedata.pl @ 187:3fca160af8a5
Update site framework include.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Wed, 30 Dec 2015 11:01:56 +0200 |
parents | 9c3100ab29cc |
children | 9f7eb4db99b4 |
line wrap: on
line source
#!/usr/bin/perl -w
#
# Fetch and parse HTML format class timetable into more sane formats
# (C) Copyright 2010-2015 Matti Hämäläinen <ccr@tnsp.org>
#
# The script reads one HTML file, builds a simple node tree out of it,
# locates the timetable <table> and emits the collected data either as
# a PHP include file or as a simple XML document.
#
use strict;
use warnings;
use Data::Dumper;
use HTML::Entities;


###
### Some globals
###
my $modes = "php|xml";      # Alternation of valid output modes (used in a regex)
my $opt_mode = "php";       # Selected output mode
my $opt_dump = 0;           # If set, dump the parsed HTML tree and exit
my $opt_filename;           # Input file name
my $opt_outfile;            # Output file name (stdout if undefined)

my $cid = 0;                # Running ID of the last class/hour cell with data
my $hourTimes = [];         # List of { "start" => secs, "end" => secs }
my $hourDefs = {};          # Class ID => definition hash
my $hourTable = {};         # Day index => list of class IDs
my $hourFillTable = {};     # Hour row => day index => class ID (0 = empty)
my $maxDays = 0;
my $firstHour = 0;
my $lastHour = 0;
my $totalHours = 0;


# Strip leading and trailing whitespace from a string.
# Returns undef if the argument is undef.
sub str_trim($)
{
    my $str = $_[0];
    if (defined($str)) {
        $str =~ s/^\s*//;
        $str =~ s/\s*$//;
    }
    return $str;
}


# Remove and return the first token of the referenced token list, as is.
sub pop_token_a($)
{
    my $tokens = $_[0];
    return shift(@$tokens);
}


# Remove and return the first token of the referenced token list,
# with surrounding whitespace trimmed. Returns undef when the list is empty.
sub pop_token($)
{
    return str_trim(pop_token_a($_[0]));
}


# Collect a double-quoted string that may span several tokens.
# Returns the string without the quotes, or undef if the next token does
# not start with a quote or no closing quote is found.
# NOTE: not called anywhere in this file; kept for compatibility.
sub parse_html_str($)
{
    my $tokens = $_[0];
    my $token = pop_token($tokens);
    my $str = "";

    # Nothing left to parse
    return undef unless defined($token);

    $token =~ s/^\s*//;
    return undef unless (substr($token, 0, 1) eq '"');
    $token = substr($token, 1);

    while (defined($token)) {
        my $tmp = $token;
        $tmp =~ s/\s*$//;
        if ($tmp ne "" && substr($tmp, -1) eq '"') {
            $str .= substr($tmp, 0, -1);
            return $str;
        } else {
            $str .= $token;
        }
        $token = shift(@$tokens);
    }
    return undef;
}


# Recursively consume tokens and attach the resulting nodes under $tree.
#
#   $tokens - reference to the list of remaining tokens (consumed in place)
#   $tree   - hash reference of the node that receives the children
#
# Tag tokens are lowercased as a whole (attribute values included).
# Returns $tree when a closing tag is met or the tokens run out.
# Dies on a tag-like token that matches none of the known forms.
sub parse_html_tree($$);
sub parse_html_tree($$)
{
    my ($tokens, $tree) = @_;

    # Test for definedness, so that a text token "0" does not end parsing
    while (defined(my $token = pop_token($tokens))) {
        if ($token =~ /^<[!\/]?[a-zA-Z]+/) {
            $token = lc($token);
            if ($token =~ /^<\!.*>$/) {
                # Ignore comments etc.
            }
            elsif ($token =~ /^<([a-z]+)(.*)>$/) {
                my ($name, $args) = ($1, $2);

                # These elements have no closing tag and are dropped;
                # simply carry on with the same parent node.
                next if ($name eq "meta" || $name eq "img");

                my $tmp = { "name" => $name, "args" => str_trim($args) };
                parse_html_tree($tokens, $tmp);
                push(@{$$tree{"nodes"}}, $tmp);
            }
            elsif ($token =~ /^<\/([a-z]+)>$/) {
                # Any closing tag ends the current node
                return $tree;
            }
            else {
                die("HORROR TERROR ELITE: $token\n");
            }
        }
        else {
            $token = str_trim(decode_entities($token));
            push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token })
                if length($token) > 0;
        }
    }

    return $tree;
}


# Parse a HTML document string into a node tree.
# Returns the root node (a hash with empty "name" and "args"),
# or undef if the argument is undef.
sub parse_html($)
{
    return undef unless defined($_[0]);
    my $str = $_[0];
    my $res = { "name" => "", "args" => "" };

    # Line breaks carry no meaning here
    $str =~ tr/\r/ /;
    $str =~ tr/\n/ /;

    # Split into tags and the text between them, dropping blank pieces
    my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str);
    parse_html_tree(\@tokens, $res);

    return $res;
}


# Depth-first search for the first node called $name.
#
#   $node - a node hash, a reference to a list of nodes, or undef
#   $name - element name to look for
#   $args - if not empty, a regular expression the node's "args" must match
#
# Returns the node hash, or undef if nothing matches.
sub html_find_node($$$);
sub html_find_node($$$)
{
    my ($node, $name, $args) = @_;

    return undef unless defined($node);

    if (ref($node) eq "ARRAY") {
        foreach my $n (@$node) {
            my $tmp = html_find_node($n, $name, $args);
            # Must do it like this, in order not to break the loop
            return $tmp if defined($tmp);
        }
    }
    elsif (ref($node) eq "HASH" && defined($$node{"name"})) {
        if ($$node{"name"} eq $name &&
            ($args eq "" || (defined($$node{"args"}) && $$node{"args"} =~ /$args/))) {
            return $node;
        }
        # No match here, descend into the children
        return html_find_node($$node{"nodes"}, $name, $args);
    }

    return undef;
}


# Find the first node with the given name, regardless of its arguments.
sub fnode($$)
{
    return html_find_node($_[0], $_[1], "");
}


# Find the first node with the given name and matching arguments.
sub fnodea($$$)
{
    return html_find_node($_[0], $_[1], $_[2]);
}


# Escape a string for use inside a double-quoted PHP string literal.
# Backslash, double quote and dollar sign are the characters that need it;
# a single quote must NOT be escaped, as PHP would keep the backslash.
sub escape($)
{
    my $s = $_[0];
    $s = "" unless defined($s);
    $s =~ s/([\\"\$])/\\$1/g;
    return $s;
}


# Escape the characters that are special in XML character data.
# Only <, >, & and " are replaced, so that no HTML-only named
# entities end up in the XML output.
sub xml_escape($)
{
    my $s = $_[0];
    return "" unless defined($s);
    return encode_entities($s, '<>&"');
}


# Flatten a node and its children back into a string.
#
#   $node  - node hash
#   $strip - if true, only the text is returned; otherwise the element
#            tags are reproduced as well (without their arguments)
sub html_collapse($$);
sub html_collapse($$)
{
    my ($node, $strip) = @_;
    my $str = "";

    if ($$node{"name"} eq "text") {
        $str .= $$node{"text"};
    }
    else {
        $str .= "<".$$node{"name"}.">" unless ($strip);
        foreach my $n (@{$$node{"nodes"}}) {
            $str .= html_collapse($n, $strip);
        }
        $str .= "</".$$node{"name"}.">" unless ($strip);
    }

    return $str;
}


# Process one class/hour cell of the timetable.
#
#   $l       - reference to the list of nodes inside the cell
#   $rowspan - value of the cell's rowspan attribute
#
# Updates the global tables ($hourFillTable, $hourDefs, $hourTable) and
# counters ($cid, $maxDays, $totalHours). The cell is placed on the first
# day column that is still free on the current hour row ($lastHour).
sub parse_hour_data($$)
{
    my ($l, $rowspan) = @_;
    my $chours = $rowspan / 2; # The table is actually in half cells
    my $cdata = [];
    my $cturns = 0;
    my $cgrouped = 0;

    # Pull in data for the class/hour cell
    foreach my $h (@{$l}) {
        next unless defined($$h{"nodes"});
        foreach my $cell (@{$$h{"nodes"}}) {
            next unless defined($$cell{"nodes"});

            # Only the first child is looked at; if it is not a text node
            # an empty string is stored in its place.
            my $text = $$cell{"nodes"}[0]{"text"};
            $text = "" unless defined($text);
            $text =~ s/\.$//;

            if ($text =~ /^vuorov/i) {
                $cturns = 1;
            } else {
                push(@$cdata, $text);
            }
        }
    }

    # Increased ID if there is data in this class/hour cell
    my $tid;
    if (scalar(@$cdata) > 0) {
        $cid++;
        $tid = $cid;
    } else {
        $tid = 0;
    }

    # Determine current day
    my $cday = 0;
    for (my $x = 0; $x < 7; $x++) {
        if (!defined($$hourFillTable{$lastHour}{$x})) {
            $cday = $x;
            last;
        }
    }

    # Mark the hours this cell covers as taken
    for (my $t = 0; $t < $chours; $t++) {
        $$hourFillTable{$lastHour + $t}{$cday} = $tid;
    }

    if ($tid) {
        $maxDays = $cday + 1 if ($cday + 1 > $maxDays);

        # Grouped, if there is another class ID in second slot
        $cgrouped = 1 if (defined($$cdata[1]) &&
            $$cdata[1] =~ /^([A-Z]\d{5,6}[A-Z]*|[A-Z0-9]{6,8})$/);

        my $groups;
        if ($cgrouped) {
            # Entries alternate between the two groups
            my $cdata1 = [];
            my $cdata2 = [];
            for (my $i = 0; $i < scalar(@$cdata); $i += 2) {
                push(@$cdata1, $$cdata[$i]) if defined($$cdata[$i]);
                push(@$cdata2, $$cdata[$i+1]) if defined($$cdata[$i+1]);
            }
            $groups = [ $cdata1, $cdata2 ];
        } else {
            $groups = [ $cdata ];
        }

        $$hourDefs{$cid} = {
            "turns" => $cturns,
            "grouped" => $cgrouped,
            "start" => $lastHour,
            "hours" => $chours,
            "data" => $groups
        };

        push(@{$$hourTable{$cday}}, $tid);
        $totalHours += $chours;
    }
}


# Convert the first "HH:MM" found in the string to seconds.
# Returns undef if the argument is undef or contains no such time.
sub parse_hour_header($)
{
    if (defined($_[0]) && $_[0] =~ /(\d+):(\d+)/) {
        return ((int($1) * 60 + int($2)) * 60);
    }
    return undef;
}


# Serialize a nested data structure according to the output mode.
#
#   $tmp   - scalar, array reference or hash reference
#   $first - for scalars in XML mode: if true the bare value is returned,
#            otherwise it is wrapped in <item>
#
# PHP mode: arrays become array(...), hashes become a comma-separated
# list of "key" => value pairs (the caller supplies the enclosing array()),
# plain non-negative integers are emitted bare and anything else as an
# escaped double-quoted string.
# XML mode: arrays become <group>, hash entries become <key>value</key>.
# Hash keys are emitted in sorted order to keep the output reproducible.
sub get_hour_data_struct($$);
sub get_hour_data_struct($$)
{
    my ($tmp, $first) = @_;
    my $is_php = ($opt_mode eq "php");
    my @out = ();

    if (ref($tmp) eq "ARRAY") {
        my @str = map { get_hour_data_struct($_, 0) } @{$tmp};
        if (scalar(@str) > 0) {
            push(@out, $is_php
                ? "array(".join(", ", @str).")"
                : "<group>".join("", @str)."</group>");
        }
    }
    elsif (ref($tmp) eq "HASH") {
        foreach my $key (sort keys %{$tmp}) {
            my $value = get_hour_data_struct($$tmp{$key}, 1);
            push(@out, $is_php
                ? "\"".$key."\" => ".$value
                : "<".$key.">".$value."</".$key.">");
        }
    }
    else {
        $tmp = "" unless defined($tmp);
        if ($is_php) {
            # Digits with a leading zero must stay strings, PHP would
            # otherwise read them as octal literals.
            push(@out, ($tmp =~ /\A(?:0|[1-9]\d*)\z/)
                ? $tmp
                : "\"".escape($tmp)."\"");
        } else {
            my $text = xml_escape($tmp);
            push(@out, $first ? $text : "<item>".$text."</item>");
        }
    }

    return join($is_php ? ", " : "", @out);
}


###
### Main program
###
while (defined(my $arg = shift)) {
    if (substr($arg, 0, 1) eq "-") {
        if ($arg =~ /^-($modes)$/o) {
            $opt_mode = $1;
        }
        elsif ($arg eq "-dump") {
            $opt_dump = 1;
        }
        elsif ($arg eq "-o") {
            # Check definedness, so that a file called "0" is accepted
            $opt_outfile = shift;
            die("Output filename option -o requires an argument.\n")
                unless (defined($opt_outfile) && $opt_outfile ne "");
        }
        else {
            die("Invalid option '$arg'.\n");
        }
    }
    else {
        $opt_filename = $arg;
    }
}

die(
    "Usage: $0 [options] <filename>\n".
    "\n".
    "  -php            Output a PHP include file with data in arrays (default)\n".
    "  -xml            Output a simple XML file.\n".
    "  -o <filename>   Set output filename. Default is to use stdout.\n".
    "  -dump           Dump HTML tree to stdout and quit.\n"
) unless defined($opt_filename);

# Read the whole input file
my $data;
open(my $fh, '<:encoding(iso-8859-1)', $opt_filename) or die("Error opening '$opt_filename': $!\n");
$data = do { local $/; <$fh> };
close($fh);

die("No data in input.\n") unless (defined($data) && $data ne "");

# Filter out certain unneeded elements
$data =~ s/<font[^>]*>//ig;
$data =~ s/<\/font>//ig;
$data =~ s/<\/?center>//ig;
$data =~ s/<br>//ig;
# Non-breaking spaces, both as entity and as raw character, become spaces.
# NOTE(review): the copy of the source this was restored from showed the
# pattern as a plain space; presumably it was the entity -- confirm.
$data =~ s/(?:&nbsp;|\x{A0})/ /ig;


### Get some general information
my $otree = parse_html($data);
if ($opt_dump) {
    print Dumper(fnode($otree, "html"));
    exit;
}

# Text directly under <body> is general info, <b> elements are class data
my %class = ("info" => [], "data" => []);
my $body = fnode($otree, "body");
if (defined($body) && defined($$body{"nodes"})) {
    foreach my $n (@{$$body{"nodes"}}) {
        if ($$n{"name"} eq "text") {
            push(@{$class{"info"}}, $$n{"text"});
        }
        elsif ($$n{"name"} eq "b") {
            push(@{$class{"data"}}, $n);
        }
    }
}

# Filter out some more, for easier tree access during table parsing
$data =~ s/<\/?b>//ig;
my $tree = parse_html($data);

# Attribute values are lowercased by the parser, as is this pattern
my $node = fnodea(fnode($tree, "body"), "table",
    "border=\"3\" rules=\"all\" cellpadding=\"1\" cellspacing=\"1\"");
die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node);


### Parse through the HTML document node tree to find the data we need
# Skip zero position this way (can't use foreach here)
my $q = $$node{"nodes"} || [];
for (my $i = 1; $i < scalar(@{$q}); $i++) {
    my $d = $$q[$i]{"nodes"};
    next unless defined($d);

    foreach my $n (@{$d}) {
        my $l = $$n{"nodes"}[0]{"nodes"};
        next unless defined($l);

        if ($$n{"args"} =~ /colspan=12\s+rowspan=(\d+)/) {
            # A class/hour cell
            parse_hour_data($l, $1);
        }
        elsif ($$n{"args"} =~ /rowspan=2\s+align/) {
            # An hour header cell, with start and end times
            my $qstart = parse_hour_header($$l[0]{"nodes"}[0]{"nodes"}[0]{"text"});
            my $qend = parse_hour_header($$l[1]{"nodes"}[0]{"nodes"}[0]{"text"});
            if (defined($qstart) && defined($qend)) {
                push(@$hourTimes, {"start" => $qstart, "end" => $qend});
            }
        }
    }

    # Each row that has cells advances the hour position
    $lastHour++;
}


### Go through hour table, find last day and hour of the week, crop
# Count empty rows from the top ...
my $flag = 1;
for (my $y = 0; $y < $lastHour && $flag; $y++) {
    for (my $x = 0; $x < $maxDays && $flag; $x++) {
        $flag = 0 if (defined($$hourFillTable{$y}{$x}) && $$hourFillTable{$y}{$x} != 0);
    }
    $firstHour++ if ($flag);
}

# ... and drop empty rows from the bottom
$flag = 1;
for (my $y = $lastHour - 1; $y >= 0 && $flag; $y--) {
    for (my $x = 0; $x < $maxDays && $flag; $x++) {
        $flag = 0 if (defined($$hourFillTable{$y}{$x}) && $$hourFillTable{$y}{$x} != 0);
    }
    $lastHour-- if ($flag);
}


### Open output file, if specified
if (defined($opt_outfile)) {
    open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile': $!\n");
}

binmode STDOUT, ':encoding(utf-8)';


### Output data in desired format
if ($opt_mode eq "php") {
    # The full open tag works regardless of the short_open_tag setting
    print "<?php\n".
        "\$classInfo = array(\n".
        "  \"general\" => array(".join(", ", map { "\"".escape($_)."\"" } @{$class{"info"}})."),\n".
        "  \"info\" => array(".join(", ", map { "\"".escape(html_collapse($_, 1))."\"" } @{$class{"data"}})."),\n".
        "  \"tags\" => array(".join(", ", map { "\"".escape(html_collapse($_, 0))."\"" } @{$class{"data"}})."),\n".
        "  \"maxDays\" => $maxDays,\n".
        "  \"firstHour\" => $firstHour,\n".
        "  \"lastHour\" => $lastHour,\n".
        "  \"totalHours\" => $totalHours\n".
        ");\n\n";

    print "\$classHourTimes = array(\n";
    foreach my $chour (@$hourTimes) {
        print "  array(\"start\" => ".$$chour{"start"}.", \"end\" => ".$$chour{"end"}."),\n";
    }
    print ");\n\n";

    print "\$classHourDefs = array(\n";
    foreach my $classId (sort { $a <=> $b } keys %{$hourDefs}) {
        print "  $classId => array(".get_hour_data_struct($$hourDefs{$classId}, 0)."),\n";
    }
    print ");\n\n";

    print "\$classDayTable = array(\n";
    for (my $y = 0; $y < $maxDays; $y++) {
        if (defined($$hourTable{$y})) {
            print "  $y => array(".join(", ", @{$$hourTable{$y}})."),\n";
        }
    }
    print ");\n?>\n";
}
elsif ($opt_mode eq "xml") {
    print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n".
        "<timetable>\n".
        " <class>\n".
        "  <general>".join("", map { "<item>".xml_escape($_)."</item>" } @{$class{"info"}})."</general>\n".
        "  <info>".join("", map { "<item>".xml_escape(html_collapse($_, 1))."</item>" } @{$class{"data"}})."</info>\n".
        "  <maxdays>$maxDays</maxdays>\n".
        "  <firsthour>$firstHour</firsthour>\n".
        "  <lasthour>$lastHour</lasthour>\n".
        "  <totalhours>$totalHours</totalhours>\n".
        " </class>\n";

    # Hours are numbered in the order they were found
    print " <hours>\n";
    my $hourId = 0;
    foreach my $chour (@$hourTimes) {
        print "  <hour id=\"".$hourId."\"><start>".$$chour{"start"}."</start><end>".$$chour{"end"}."</end></hour>\n";
        $hourId++;
    }
    print " </hours>\n\n";

    print " <classes>\n";
    foreach my $classId (sort { $a <=> $b } keys %{$hourDefs}) {
        print "  <class id=\"$classId\">".get_hour_data_struct($$hourDefs{$classId}, 0)."</class>\n";
    }
    print " </classes>\n\n";

    print " <days>\n";
    for (my $y = 0; $y < $maxDays; $y++) {
        if (defined($$hourTable{$y})) {
            print "  <day id=\"$y\">".join("", map { "<class>".$_."</class>" } @{$$hourTable{$y}})."</day>\n";
        }
    }
    print " </days>\n";

    print "</timetable>\n";
}

# Buffered write errors only show up when the handle is closed
close(STDOUT) or die("Error closing output: $!\n");