Mercurial > hg > lukkari
diff fetchdata.pl @ 1:21fde93375e9
Add beta code.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Tue, 11 Jan 2011 20:43:12 +0200 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fetchdata.pl Tue Jan 11 20:43:12 2011 +0200 @@ -0,0 +1,438 @@ +#!/usr/bin/perl -w +# +# Fetch and parse HTML format class timetable into more sane formats +# (C) Copyright 2010-2010 Matti Hämäläinen <ccr@tnsp.org> +# +use strict; +use Data::Dumper; +use LWP::UserAgent; +use HTML::Entities; + +my $userAgent = "Lukkari/0.7"; + + +sub urlencode($) +{ + my $value = $_[0]; + $value =~ s/([^a-zA-Z_0-9 ])/"%" . uc(sprintf "%lx" , unpack("C", $1))/eg; + $value =~ tr/ /+/; + return $value; +} + + +sub str_trim($) +{ + my $str = $_[0]; + if (defined($str)) { + $str =~ s/^\s*//; + $str =~ s/\s*$//; + } + return $str; +} + + +sub pop_token_a($) +{ + my $tokens = $_[0]; + return shift(@$tokens); +} + + +sub pop_token($) +{ + return str_trim(pop_token_a($_[0])); +} + + +sub parse_html_str($) +{ + my $tokens = $_[0]; + my $token = pop_token($tokens); + my $str = ""; + $token =~ s/^\s*//; + + return undef unless (substr($token, 0, 1) eq '"'); + $token = substr($token, 1); + + while (defined($token)) { + if (substr($token, -1) eq '"') { + $str .= substr($token, 0, -1); + return $str; + } else { + $str .= $token; + } + $token = shift(@$tokens); + } + return undef; +} + + +sub parse_html_tree($$); + +sub parse_html_tree($$) +{ + my ($tokens, $tree) = @_; + + while (my $token = pop_token($tokens)) { + if ($token =~ /^<[!\/]?[a-zA-Z]+/) { + $token = lc($token); + if ($token =~ /^<\!.*>$/) { + # Ignore comments etc. + } elsif ($token =~ /^<([a-z]+)(.*)>$/) { + my ($name, $args) = ($1, $2); + if ($name eq "meta" || $name eq "img") { + my $tmp = {}; + parse_html_tree($tokens, $tree); + return $tree; + } else { + my $tmp = { "name" => $name, "args" => str_trim($args) }; + parse_html_tree($tokens, $tmp); + push(@{$$tree{"nodes"}}, $tmp); + } + } elsif ($token =~ /^<\/([a-z]+)>$/) { + return $tree; + } else { + die("HORROR TERROR ELITE: $token\n"); + } + } else { + $token = str_trim(decode_entities($token)); + push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0; + } + } + + return $tree; +} + + +sub parse_html($) +{ + return undef unless defined($_[0]); + my $str = $_[0]; + my $res = { "name" => "", "args" => "" }; + $str =~ tr/\r/ /; + $str =~ tr/\n/ /; + my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str); + parse_html_tree(\@tokens, $res); + return $res; +} + + +sub http_fetch($) +{ + my $agent = LWP::UserAgent->new; + $agent->agent($userAgent); + $agent->timeout(10); + + my $req = HTTP::Request->new(GET => $_[0]); + $req->user_agent($userAgent); + my $res = $agent->request($req); + + if ($res->is_success) { + return $res->content; + } else { + print STDERR "HTTP request failed: [".$res->code."] ".$res->message."\n"; + return undef; + } +} + + +sub html_find_node($$$); + +sub html_find_node($$$) +{ + my ($node, $name, $args) = @_; + + if (defined($node)) { + if (ref($node) eq "ARRAY") { + foreach my $n (@$node) { + my $tmp = html_find_node($n, $name, $args); + # Must do it like this, in order not to break the loop + return $tmp if defined($tmp); + } + } elsif (ref($node) eq "HASH") { + if (defined($$node{"name"})) { + if ($$node{"name"} eq $name) { + if ($args ne "") { + if (defined($$node{"args"}) && $$node{"args"} =~ /$args/) { + } else { + return undef; + } + } + return $node; + } else { + return html_find_node($$node{"nodes"}, $name, $args); + } + } + } + } + return undef; +} + + +sub fnode($$) +{ + return html_find_node($_[0], $_[1], ""); +} + + +sub fnodea($$$) +{ + return html_find_node($_[0], $_[1], $_[2]); +} + + +sub escape($) +{ + my $s = $_[0]; + $s =~ s/(['"])/\\$1/g; + return $s; +} + + +sub html_collapse($$); + +sub html_collapse($$) +{ + my ($node, $strip) = @_; + my $str = ""; + + if ($$node{"name"} eq "text") { + $str .= $$node{"text"}; + } else { + $str .= "<".$$node{"name"}.">" unless ($strip); + foreach my $n (@{$$node{"nodes"}}) { + $str .= html_collapse($n, $strip); + } + $str .= "</".$$node{"name"}.">" unless ($strip); + } + + return $str; +} + + +### +### Main program +### +my $modes = "simple|php|xml"; +my $opt_mode = "php"; +my $opt_dump = 0; +my $opt_filename; +my $opt_outfile; + +while (defined(my $arg = shift)) { + if (substr($arg, 0, 1) eq "-") { + if ($arg =~ /^-($modes)$/o) { + $opt_mode = $1; + } + elsif ($arg eq "-dump") { + $opt_dump = 1; + } + elsif ($arg eq "-o") { + $opt_outfile = shift or die("Output filename option -o requires an argument.\n"); + } else { + die("Invalid option '$arg'.\n"); + } + } else { + $opt_filename = $arg; + } +} + +die("Usage: $0 [options] <filename|URI> + + -php Output a PHP include file with data in arrays (default) + -simple Output simple tabled output for easy parsing. + -xml Output XML. + + -o <filename> Set output filename. Default is to use stdout. + + -dump Dump HTML tree to stdout and quit. + +") unless defined($opt_filename); + + +my $data; +if ($opt_filename =~ /^(http|https):/) { + $data = http_fetch($opt_filename) or die("Could not fetch: $opt_filename\n"); +} else { + open(my $fh, '<', $opt_filename) or die("Error opening '$opt_filename': $!\n"); + $data = do { local $/; <$fh> }; + close($fh); +} + +die("No data in input.\n") unless (defined($data) && $data ne ""); + + +# Filter out certain unneeded elements +$data =~ s/<font[^>]*>//ig; +$data =~ s/<\/font>//ig; +$data =~ s/<\/?center>//ig; +$data =~ s/<br>//ig; +$data =~ s/ / /ig; + +### Get some general information +my $otree = parse_html($data); +if ($opt_dump) { + print Dumper(fnode($otree, "html")); + exit; +} + +my %class = (); +my $body = fnode($otree, "body"); +if (defined($body) && defined($$body{"nodes"})) { + foreach my $n (@{$$body{"nodes"}}) { + if ($$n{"name"} eq "text") { + push(@{$class{"info"}}, $$n{"text"}); + } + elsif ($$n{"name"} eq "b") { + push(@{$class{"data"}}, $n); + } + } +} + +# Filter out some more, for easier tree access during table parsing +$data =~ s/<\/?b>//ig; +my $tree = parse_html($data); +my $node = fnode(fnode($tree, "body"), "table"); +die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node); + + +### Parse through the HTML document node tree to find the data we need +my $id = 0; +my $q = $$node{"nodes"}; +my $tunnit = {}; +my $taulu = {}; +my $maxdays = 6; +my $maxhours = 0; + +# Skip zero this way +for (my $i = 1; $i < scalar(@{$q}); $i++) { + my $d = $$q[$i]{"nodes"}; + if (defined($d)) { + foreach my $n (@{$d}) { + my $l = $$n{"nodes"}[0]{"nodes"}; + if (defined($l) && $$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/) { + my $tuntia = $1 / 2; + my $data = []; + my $grouped = 0; + foreach my $h (@{$l}) { + if (defined($$h{"nodes"})) { + foreach my $b (@{$$h{"nodes"}}) { + if (defined($$b{"nodes"})) { + my $text = $$b{"nodes"}[0]{"text"}; + $text =~ s/\.$//; + + $grouped = 1 if ($text =~ /vuorov/); + + push(@$data, $text); + } + } + } + } + + my $tid; + if (scalar(@$data) > 0) { + $id++; + $tid = $id; + } else { + $tid = 0; + } + + my $tpd = 0; + for (my $x = 0; $x < $maxdays; $x++) { + if (!defined($$taulu{$maxhours}{$x})) { + $tpd = $x; + last; + } + } + for (my $t = 0; $t < $tuntia; $t++) { + $$taulu{$maxhours + $t}{$tpd} = $tid; + } + + if (scalar(@$data) > 0) { + # Grouped, if there is another class ID in second slot + $grouped = 1 if ($$data[1] =~ /^[A-Z]\d{6}$/); + $$tunnit{$id} = { "grouped" => $grouped, "day" => $tpd, "start" => $maxhours, "hours" => $tuntia, "data" => $data }; + } + } + } + $maxhours++; + } +} + + +### Open output file, if specified +if (defined($opt_outfile)) { + open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile'.\n"); +} + + +### Output data in desired format +if ($opt_mode eq "php") { + print "<?\n". + "\$classInfo = array(\n". + " \"general\" => array(".join(", ", map { "\"".escape($_)."\""; } @{$class{"info"}})."),\n". + " \"info\" => array(".join(", ", map { "\"".escape(html_collapse($_, 1))."\""; } @{$class{"data"}})."),\n". + " \"info_tags\" => array(".join(", ", map { "\"".escape(html_collapse($_, 0))."\""; } @{$class{"data"}})."),\n". + " \"maxdays\" => $maxdays,\n". + " \"maxhours\" => $maxhours,\n". + ");\n\n"; + + print "\$classDefs = array(\n"; + foreach my $id (sort { $a <=> $b } keys %{$tunnit}) { + print " $id => array("; + foreach my $key (keys %{$$tunnit{$id}}) { + my $a = $$tunnit{$id}{$key}; + print "\"$key\" => "; + if (ref($a) eq "ARRAY") { + print "array(".join(", ", map { "\"".escape($_)."\""; } @$a).")"; + } + elsif ($a =~ /^\d+$/) { + print $a; + } else { + print "\"".escape($a)."\""; + } + print ", "; + } + print "),\n"; + } + + print ");\n". + "\n". + "\$classHourTable = array(\n"; + for (my $y = 0; $y < $maxhours; $y++) { + my $str = ""; + for (my $x = 0; $x < $maxdays; $x++) { + $str .= ", " unless ($str eq ""); + $str .= sprintf "%3d", $$taulu{$y}{$x}; + } + print " array(".$str."),\n"; + } + print ");\n?>\n"; +} + +elsif ($opt_mode eq "xml") { + print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n". + "<timetable>\n". + " <class>\n". + " <general>".join("", map { "<node>".encode_entities($_)."</node>"; } @{$class{"info"}})."</general>\n". + " <info>".join("", map { "<node>".encode_entities(html_collapse($_, 1))."</node>"; } @{$class{"data"}})."</info>\n". + " <maxdays>$maxdays</maxdays>\n". + " <maxhours>$maxhours</maxhours>\n". + " </class>\n"; + + + print "</timetable>\n"; +} + + + +elsif ($opt_mode eq "simple") { + for (my $y = 0; $y < $maxhours; $y++) { + for (my $x = 0; $x < $maxdays; $x++) { + printf "%3d ", $$taulu{$y}{$x}; + } + print "\n"; + } +} + +close (STDOUT);