Mercurial > hg > lukkari
view parsedata.pl @ 72:6fd715063abc
Clean up some parsing operations.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Wed, 24 Oct 2012 06:31:32 +0300 |
parents | 119f0cef6498 |
children | b51ad733b624 |
line wrap: on
line source
#!/usr/bin/perl -w
#
# Fetch and parse HTML format class timetable into more sane formats
# (C) Copyright 2010-2012 Matti Hämäläinen <ccr@tnsp.org>
#
use strict;
use Data::Dumper;
use HTML::Entities;


###
### Generic helpers
###

# URL-encode a string: every character except ASCII letters, digits,
# underscore and space becomes %XX (uppercase hex), then spaces become '+'.
sub urlencode($)
{
  my $value = $_[0];
  # The hex value must be zero-padded to two digits ("%0A", never "%A").
  $value =~ s/([^a-zA-Z_0-9 ])/sprintf("%%%02X", unpack("C", $1))/eg;
  $value =~ tr/ /+/;
  return $value;
}


# Strip leading and trailing whitespace. Undef is passed through as undef.
sub str_trim($)
{
  my $str = $_[0];
  if (defined($str))
  {
    $str =~ s/^\s*//;
    $str =~ s/\s*$//;
  }
  return $str;
}


# Remove and return the next raw token from the token list (array ref).
sub pop_token_a($)
{
  my $tokens = $_[0];
  return shift(@$tokens);
}


# Remove and return the next token, whitespace-trimmed (undef at end of list).
sub pop_token($)
{
  return str_trim(pop_token_a($_[0]));
}


# Read a double-quoted string that may span several tokens.
# Returns the string contents without the quotes, or undef if the token
# list is exhausted, the next token does not start with '"', or no closing
# quote is found.
sub parse_html_str($)
{
  my $tokens = $_[0];
  my $token = pop_token($tokens);
  my $str = "";

  # Nothing left to read.
  return undef unless defined($token);

  $token =~ s/^\s*//;
  return undef unless (substr($token, 0, 1) eq '"');
  $token = substr($token, 1);

  while (defined($token))
  {
    my $tmp = $token;
    $tmp =~ s/\s*$//;
    if (substr($tmp, -1) eq '"')
    {
      $str .= substr($tmp, 0, -1);
      return $str;
    }
    else
    {
      $str .= $token;
    }
    $token = shift(@$tokens);
  }
  return undef;
}


###
### Minimal HTML parser
###

# Consume tokens and attach the resulting nodes under $tree->{"nodes"}.
# Element nodes are { name, args, nodes }, text nodes are
# { name => "text", args => "", text }. Returns $tree when a closing tag
# is met or when the tokens run out. Dies on a tag it cannot classify.
sub parse_html_tree($$);
sub parse_html_tree($$)
{
  my ($tokens, $tree) = @_;

  # Test definedness, not truth: a text token of "0" must not end parsing.
  while (defined(my $token = pop_token($tokens)))
  {
    if ($token =~ /^<[!\/]?[a-zA-Z]+/)
    {
      $token = lc($token);
      if ($token =~ /^<\!.*>$/)
      {
        # Ignore comments etc.
      }
      elsif ($token =~ /^<([a-z]+)(.*)>$/)
      {
        my ($name, $args) = ($1, $2);
        if ($name eq "meta" || $name eq "img")
        {
          # These elements have no closing tag and are not stored;
          # simply carry on with the following tokens in the same tree.
          next;
        }
        else
        {
          my $tmp = { "name" => $name, "args" => str_trim($args) };
          parse_html_tree($tokens, $tmp);
          push(@{$$tree{"nodes"}}, $tmp);
        }
      }
      elsif ($token =~ /^<\/([a-z]+)>$/)
      {
        # Closing tag ends the current element.
        return $tree;
      }
      else
      {
        die("HORROR TERROR ELITE: $token\n");
      }
    }
    else
    {
      $token = str_trim(decode_entities($token));
      push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token })
        if length($token) > 0;
    }
  }

  return $tree;
}


# Parse an HTML string into a node tree rooted at an unnamed node.
# Returns undef when given undef.
sub parse_html($)
{
  return undef unless defined($_[0]);
  my $str = $_[0];
  my $res = { "name" => "", "args" => "" };

  # Line breaks carry no meaning here; flatten them to spaces.
  $str =~ tr/\r/ /;
  $str =~ tr/\n/ /;

  # Split into tags and text runs, dropping whitespace-only pieces.
  my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str);
  parse_html_tree(\@tokens, $res);
  return $res;
}


# Depth-first search for the first node called $name whose "args" string
# matches the regular expression $args (an empty $args matches any node).
# $node may be a node hash or an array ref of nodes. Returns the node or undef.
sub html_find_node($$$);
sub html_find_node($$$)
{
  my ($node, $name, $args) = @_;
  return undef unless defined($node);

  if (ref($node) eq "ARRAY")
  {
    foreach my $n (@$node)
    {
      my $tmp = html_find_node($n, $name, $args);
      # Must do it like this, in order not to break the loop
      return $tmp if defined($tmp);
    }
  }
  elsif (ref($node) eq "HASH" && defined($$node{"name"}))
  {
    my $found = ($$node{"name"} eq $name) &&
      ($args eq "" || (defined($$node{"args"}) && $$node{"args"} =~ /$args/));
    return $node if $found;

    # No match on this node, descend into its children.
    return html_find_node($$node{"nodes"}, $name, $args);
  }

  return undef;
}


# Find the first node with the given name.
sub fnode($$)
{
  return html_find_node($_[0], $_[1], "");
}


# Find the first node with the given name and matching arguments.
sub fnodea($$$)
{
  return html_find_node($_[0], $_[1], $_[2]);
}


# Backslash-escape single and double quotes.
sub escape($)
{
  my $s = $_[0];
  $s =~ s/(['"])/\\$1/g;
  return $s;
}


# Escape only the characters that are special in XML. The default
# character set of encode_entities() would also emit HTML-only named
# entities (e.g. &auml;), which are undefined in a plain XML document.
sub xml_escape($)
{
  my $s = defined($_[0]) ? $_[0] : "";
  return encode_entities($s, '<>&"');
}


# Render an array ref as XML: plain values become <node> elements,
# nested array refs become <group> elements containing their own nodes.
sub xml_nodes($);
sub xml_nodes($)
{
  my $items = $_[0];
  return join("", map {
    (ref($_) eq "ARRAY")
      ? "<group>".xml_nodes($_)."</group>"
      : "<node>".xml_escape($_)."</node>"
  } @$items);
}


# Flatten a node tree back into a string. With $strip true only the text
# is kept; otherwise elements are wrapped in <name>...</name> (attributes
# are not reproduced).
sub html_collapse($$);
sub html_collapse($$)
{
  my ($node, $strip) = @_;
  my $str = "";

  if ($$node{"name"} eq "text")
  {
    $str .= $$node{"text"};
  }
  else
  {
    $str .= "<".$$node{"name"}.">" unless ($strip);
    foreach my $n (@{$$node{"nodes"}})
    {
      $str .= html_collapse($n, $strip);
    }
    $str .= "</".$$node{"name"}.">" unless ($strip);
  }

  return $str;
}


###
### Main program
###
my $modes = "php|xml";
my $opt_mode = "php";
my $opt_dump = 0;
my $opt_filename;
my $opt_outfile;

while (defined(my $arg = shift))
{
  if (substr($arg, 0, 1) eq "-")
  {
    if ($arg =~ /^-($modes)$/o)
    {
      $opt_mode = $1;
    }
    elsif ($arg eq "-dump")
    {
      $opt_dump = 1;
    }
    elsif ($arg eq "-o")
    {
      $opt_outfile = shift or die("Output filename option -o requires an argument.\n");
    }
    else
    {
      die("Invalid option '$arg'.\n");
    }
  }
  else
  {
    $opt_filename = $arg;
  }
}

die("Usage: $0 [options] <filename>

  -php           Output a PHP include file with data in arrays (default)
  -xml           Output a simple XML file.

  -o <filename>  Set output filename. Default is to use stdout.

  -dump          Dump HTML tree to stdout and quit.

") unless defined($opt_filename);

my $data;
open(my $fh, '<:encoding(iso-8859-1)', $opt_filename) or die("Error opening '$opt_filename': $!\n");
$data = do { local $/; <$fh> };
close($fh);

die("No data in input.\n") unless (defined($data) && $data ne "");


# Filter out certain unneeded elements
$data =~ s/<font[^>]*>//ig;
$data =~ s/<\/font>//ig;
$data =~ s/<\/?center>//ig;
$data =~ s/<br>//ig;
# NOTE(review): the listing this was recovered from showed a space-to-space
# substitution here, which does nothing; the pattern is presumably the
# non-breaking space entity, decoded by the web view -- confirm against the
# repository copy.
$data =~ s/&nbsp;/ /ig;


### Get some general information
my $otree = parse_html($data);
if ($opt_dump)
{
  print Dumper(fnode($otree, "html"));
  exit;
}

# Top-level text of <body> is general info, top-level <b> elements are
# kept as whole nodes for the "info"/"tags" output.
my %class = ();
my $body = fnode($otree, "body");
if (defined($body) && defined($$body{"nodes"}))
{
  foreach my $n (@{$$body{"nodes"}})
  {
    if ($$n{"name"} eq "text")
    {
      push(@{$class{"info"}}, $$n{"text"});
    }
    elsif ($$n{"name"} eq "b")
    {
      push(@{$class{"data"}}, $n);
    }
  }
}


# Filter out some more, for easier tree access during table parsing
$data =~ s/<\/?b>//ig;
my $tree = parse_html($data);
my $node = fnode(fnode($tree, "body"), "table");
die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node);


### Parse through the HTML document node tree to find the data we need
my $cid = 0;              # Running ID of the last non-empty class cell
my $q = $$node{"nodes"};  # Child nodes of the timetable <table>
my $hourTimes = [];       # List of { start, end } in seconds since midnight
my $hourDefs = {};        # Class ID -> { grouped, start, hours, data }
my $hourTable = {};       # Day index -> list of class IDs
my $hourFillTable = {};   # Hour row -> day index -> class ID (0 = empty cell)
my $maxDays = 0;
my $firstHour = 0;
my $lastHour = 0;         # Current hour row while parsing, last used row after cropping
my $totalHours = 0;


# Handle one class/hour table cell.
#   $l       - array ref of nodes found inside the cell
#   $rowspan - the cell's rowspan attribute value
# Updates $cid, $hourFillTable, $hourDefs, $hourTable, $maxDays and
# $totalHours; relies on $lastHour being the row currently parsed.
sub parseHourData($$)
{
  my ($l, $rowspan) = @_;
  my $chours = $rowspan / 2; # The table is actually in half cells
  my $cdata = [];
  my $cgrouped = 0;

  # Pull in data for the class/hour cell
  foreach my $h (@{$l})
  {
    if (defined($$h{"nodes"}))
    {
      foreach my $cell (@{$$h{"nodes"}})
      {
        if (defined($$cell{"nodes"}))
        {
          my $text = $$cell{"nodes"}[0]{"text"};
          $text =~ s/\.$//;
          $cgrouped = 1 if ($text =~ /vuorov/);
          push(@$cdata, $text);
        }
      }
    }
  }

  # Increased ID if there is data in this class/hour cell
  my $tid;
  if (scalar(@$cdata) > 0)
  {
    $cid++;
    $tid = $cid;
  }
  else
  {
    $tid = 0;
  }

  # Determine current day: the first day column not yet filled on this row
  my $cday = 0;
  for (my $x = 0; $x < 7; $x++)
  {
    if (!defined($$hourFillTable{$lastHour}{$x}))
    {
      $cday = $x;
      last;
    }
  }

  # Mark every row covered by this cell as taken for that day
  for (my $t = 0; $t < $chours; $t++)
  {
    $$hourFillTable{$lastHour + $t}{$cday} = $tid;
  }

  if ($tid)
  {
    $maxDays = $cday + 1 if ($cday + 1 > $maxDays);

    # Grouped, if there is another class ID in second slot
    $cgrouped = 1 if (defined($$cdata[1]) && $$cdata[1] =~ /^[A-Z]\d{6}$/);

    if ($cgrouped)
    {
      # Entries alternate between the two groups: even indices belong to
      # the first group, odd indices to the second.
      my $cdata1 = [];
      my $cdata2 = [];
      # Iterate over the element count; length() of the reference would
      # measure its stringified form instead.
      for (my $i = 0; $i < scalar(@$cdata); $i += 2)
      {
        push(@$cdata1, $$cdata[$i]) if defined($$cdata[$i]);
        push(@$cdata2, $$cdata[$i+1]) if defined($$cdata[$i+1]);
      }
      $$hourDefs{$cid} = { "grouped" => $cgrouped, "start" => $lastHour, "hours" => $chours, "data" => [ $cdata1, $cdata2 ] };
    }
    else
    {
      $$hourDefs{$cid} = { "grouped" => $cgrouped, "start" => $lastHour, "hours" => $chours, "data" => [ $cdata ] };
    }

    push(@{$$hourTable{$cday}}, $tid);
    $totalHours += $chours;
  }
}


# Convert the first "HH:MM" found in the string to seconds since midnight.
# Returns undef if no such time is present.
sub parseHour($)
{
  if ($_[0] =~ /(\d+):(\d+)/)
  {
    return ((int($1) * 60 + int($2)) * 60);
  }
  return undef;
}


# Render a Perl data structure as a PHP expression: array refs and hash
# refs become array(...), all-digit scalars are emitted bare and anything
# else as a double-quoted string. Hash keys are sorted so that the output
# is stable between runs.
sub getDataStruct($);
sub getDataStruct($)
{
  my $tmp = $_[0];

  if (ref($tmp) eq "ARRAY")
  {
    return "array(".join(", ", map { getDataStruct($_) } @{$tmp}).")";
  }
  elsif (ref($tmp) eq "HASH")
  {
    # The key/value pairs belong inside the array() wrapper.
    my @str = map { "\"".$_."\" => ".getDataStruct($$tmp{$_}) } sort keys %{$tmp};
    return "array(".join(", ", @str).")";
  }
  elsif ($tmp =~ /^\d+$/)
  {
    return $tmp;
  }
  else
  {
    return "\"".$tmp."\"";
  }
}


# Walk the table rows. Skip zero position this way (can't use foreach here)
for (my $i = 1; $i < scalar(@{$q}); $i++)
{
  my $d = $$q[$i]{"nodes"};
  if (defined($d))
  {
    foreach my $n (@{$d})
    {
      my $l = $$n{"nodes"}[0]{"nodes"};
      if (defined($l))
      {
        if ($$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/)
        {
          # A class/hour cell
          parseHourData($l, $1);
        }
        elsif ($$n{"args"} =~ /rowspan=2\s+align/)
        {
          # A cell holding the start and end time of the hour
          my $qstart = parseHour($$l[0]{"nodes"}[0]{"nodes"}[0]{"text"});
          my $qend = parseHour($$l[1]{"nodes"}[0]{"nodes"}[0]{"text"});
          if (defined($qstart) && defined($qend))
          {
            push(@$hourTimes, {"start" => $qstart, "end" => $qend});
          }
        }
      }
    }
    $lastHour++;
  }
}


### Go through hour table, find last day and hour of the week, crop
# Count empty rows from the top ...
my $flag = 1;
for (my $y = 0; $y < $lastHour && $flag; $y++)
{
  for (my $x = 0; $x < $maxDays && $flag; $x++)
  {
    $flag = 0 if (defined($$hourFillTable{$y}{$x}) && $$hourFillTable{$y}{$x} != 0);
  }
  $firstHour++ if ($flag);
}

# ... and drop empty rows from the bottom.
$flag = 1;
for (my $y = $lastHour - 1; $y >= 0 && $flag; $y--)
{
  for (my $x = 0; $x < $maxDays && $flag; $x++)
  {
    $flag = 0 if (defined($$hourFillTable{$y}{$x}) && $$hourFillTable{$y}{$x} != 0);
  }
  $lastHour-- if ($flag);
}


### Open output file, if specified
if (defined($opt_outfile))
{
  open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile'.\n");
}

binmode STDOUT, ':encoding(utf-8)';


### Output data in desired format
if ($opt_mode eq "php")
{
  print "<?\n".
    "\$classInfo = array(\n".
    "  \"general\" => array(".join(", ", map { "\"".escape($_)."\""; } @{$class{"info"}})."),\n".
    "  \"info\" => array(".join(", ", map { "\"".escape(html_collapse($_, 1))."\""; } @{$class{"data"}})."),\n".
    "  \"tags\" => array(".join(", ", map { "\"".escape(html_collapse($_, 0))."\""; } @{$class{"data"}})."),\n".
    "  \"maxDays\" => $maxDays,\n".
    "  \"firstHour\" => $firstHour,\n".
    "  \"lastHour\" => $lastHour,\n".
    "  \"totalHours\" => $totalHours\n".
    ");\n\n";

  print "\$classHourTimes = array(\n";
  foreach my $chour (@$hourTimes)
  {
    print "  array(\"start\" => ".$$chour{"start"}.", \"end\" => ".$$chour{"end"}."),\n";
  }
  print ");\n\n";

  print "\$classHourDefs = array(\n";
  foreach my $id (sort { $a <=> $b } keys %{$hourDefs})
  {
    # getDataStruct() already wraps the hash in array(...)
    print "  $id => ".getDataStruct($$hourDefs{$id}).",\n";
  }
  print ");\n\n";

  print "\$classDayTable = array(\n";
  for (my $y = 0; $y < $maxDays; $y++)
  {
    if (defined($$hourTable{$y}))
    {
      print "  $y => array(".join(", ", @{$$hourTable{$y}})."),\n";
    }
  }
  print ");\n?>\n";
}
elsif ($opt_mode eq "xml")
{
  print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n".
    "<timetable>\n".
    "  <class>\n".
    "    <general>".join("", map { "<node>".xml_escape($_)."</node>"; } @{$class{"info"}})."</general>\n".
    "    <info>".join("", map { "<node>".xml_escape(html_collapse($_, 1))."</node>"; } @{$class{"data"}})."</info>\n".
    "    <maxdays>$maxDays</maxdays>\n".
    "    <firsthour>$firstHour</firsthour>\n".
    "    <lasthour>$lastHour</lasthour>\n".
    "    <totalhours>$totalHours</totalhours>\n".
    "  </class>\n";

  print "  <hours>\n";
  foreach my $chour (@$hourTimes)
  {
    print "    <hour><start>".$$chour{"start"}."</start><end>".$$chour{"end"}."</end></hour>\n";
  }
  print "  </hours>\n\n";

  print "  <classes>\n";
  foreach my $id (sort { $a <=> $b } keys %{$hourDefs})
  {
    my $def = $$hourDefs{$id};

    # Scalar values become attributes of <class>, array values become
    # child elements. Attributes must all be written before the start
    # tag is closed, so collect both kinds first.
    my @attrs = ("id=\"$id\"");
    my @elems = ();
    foreach my $key (sort keys %{$def})
    {
      my $val = $$def{$key};
      if (ref($val) eq "ARRAY")
      {
        push(@elems, "<$key>".xml_nodes($val)."</$key>");
      }
      else
      {
        push(@attrs, "$key=\"".xml_escape($val)."\"");
      }
    }
    print "    <class ".join(" ", @attrs).">".join("", @elems)."</class>\n";
  }
  print "  </classes>\n\n";

  print "  <days>\n";
  for (my $y = 0; $y < $maxDays; $y++)
  {
    if (defined($$hourTable{$y}))
    {
      print "    <day id=\"$y\">".join("", map { "<class>".$_."</class>" } @{$$hourTable{$y}})."</day>\n";
    }
  }
  print "  </days>\n";

  print "</timetable>\n";
}

close (STDOUT);