# HG changeset patch # User Matti Hamalainen # Date 1294772042 -7200 # Node ID 3d0904d29591e0a9bfeb361aff59c99f43aa3df9 # Parent 2471bb8912997ca288129d2b5e05079719c6f356 Renamed fetchdata utility to parsedata. diff -r 2471bb891299 -r 3d0904d29591 fetchdata.pl --- a/fetchdata.pl Tue Jan 11 20:53:50 2011 +0200 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,438 +0,0 @@ -#!/usr/bin/perl -w -# -# Fetch and parse HTML format class timetable into more sane formats -# (C) Copyright 2010-2010 Matti Hämäläinen -# -use strict; -use Data::Dumper; -use LWP::UserAgent; -use HTML::Entities; - -my $userAgent = "Lukkari/0.7"; - - -sub urlencode($) -{ - my $value = $_[0]; - $value =~ s/([^a-zA-Z_0-9 ])/"%" . uc(sprintf "%lx" , unpack("C", $1))/eg; - $value =~ tr/ /+/; - return $value; -} - - -sub str_trim($) -{ - my $str = $_[0]; - if (defined($str)) { - $str =~ s/^\s*//; - $str =~ s/\s*$//; - } - return $str; -} - - -sub pop_token_a($) -{ - my $tokens = $_[0]; - return shift(@$tokens); -} - - -sub pop_token($) -{ - return str_trim(pop_token_a($_[0])); -} - - -sub parse_html_str($) -{ - my $tokens = $_[0]; - my $token = pop_token($tokens); - my $str = ""; - $token =~ s/^\s*//; - - return undef unless (substr($token, 0, 1) eq '"'); - $token = substr($token, 1); - - while (defined($token)) { - if (substr($token, -1) eq '"') { - $str .= substr($token, 0, -1); - return $str; - } else { - $str .= $token; - } - $token = shift(@$tokens); - } - return undef; -} - - -sub parse_html_tree($$); - -sub parse_html_tree($$) -{ - my ($tokens, $tree) = @_; - - while (my $token = pop_token($tokens)) { - if ($token =~ /^<[!\/]?[a-zA-Z]+/) { - $token = lc($token); - if ($token =~ /^<\!.*>$/) { - # Ignore comments etc. - } elsif ($token =~ /^<([a-z]+)(.*)>$/) { - my ($name, $args) = ($1, $2); - if ($name eq "meta" || $name eq "img") { - my $tmp = {}; - parse_html_tree($tokens, $tree); - return $tree; - } else { - my $tmp = { "name" => $name, "args" => str_trim($args) }; - parse_html_tree($tokens, $tmp); - push(@{$$tree{"nodes"}}, $tmp); - } - } elsif ($token =~ /^<\/([a-z]+)>$/) { - return $tree; - } else { - die("HORROR TERROR ELITE: $token\n"); - } - } else { - $token = str_trim(decode_entities($token)); - push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0; - } - } - - return $tree; -} - - -sub parse_html($) -{ - return undef unless defined($_[0]); - my $str = $_[0]; - my $res = { "name" => "", "args" => "" }; - $str =~ tr/\r/ /; - $str =~ tr/\n/ /; - my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str); - parse_html_tree(\@tokens, $res); - return $res; -} - - -sub http_fetch($) -{ - my $agent = LWP::UserAgent->new; - $agent->agent($userAgent); - $agent->timeout(10); - - my $req = HTTP::Request->new(GET => $_[0]); - $req->user_agent($userAgent); - my $res = $agent->request($req); - - if ($res->is_success) { - return $res->content; - } else { - print STDERR "HTTP request failed: [".$res->code."] ".$res->message."\n"; - return undef; - } -} - - -sub html_find_node($$$); - -sub html_find_node($$$) -{ - my ($node, $name, $args) = @_; - - if (defined($node)) { - if (ref($node) eq "ARRAY") { - foreach my $n (@$node) { - my $tmp = html_find_node($n, $name, $args); - # Must do it like this, in order not to break the loop - return $tmp if defined($tmp); - } - } elsif (ref($node) eq "HASH") { - if (defined($$node{"name"})) { - if ($$node{"name"} eq $name) { - if ($args ne "") { - if (defined($$node{"args"}) && $$node{"args"} =~ /$args/) { - } else { - return undef; - } - } - return $node; - } else { - return html_find_node($$node{"nodes"}, $name, $args); - } - } - } - } - return undef; -} - - -sub fnode($$) -{ - return html_find_node($_[0], $_[1], ""); -} - - -sub fnodea($$$) -{ - return html_find_node($_[0], $_[1], $_[2]); -} - - -sub escape($) -{ - my $s = $_[0]; - $s =~ s/(['"])/\\$1/g; - return $s; -} - - -sub html_collapse($$); - -sub html_collapse($$) -{ - my ($node, $strip) = @_; - my $str = ""; - - if ($$node{"name"} eq "text") { - $str .= $$node{"text"}; - } else { - $str .= "<".$$node{"name"}.">" unless ($strip); - foreach my $n (@{$$node{"nodes"}}) { - $str .= html_collapse($n, $strip); - } - $str .= "" unless ($strip); - } - - return $str; -} - - -### -### Main program -### -my $modes = "simple|php|xml"; -my $opt_mode = "php"; -my $opt_dump = 0; -my $opt_filename; -my $opt_outfile; - -while (defined(my $arg = shift)) { - if (substr($arg, 0, 1) eq "-") { - if ($arg =~ /^-($modes)$/o) { - $opt_mode = $1; - } - elsif ($arg eq "-dump") { - $opt_dump = 1; - } - elsif ($arg eq "-o") { - $opt_outfile = shift or die("Output filename option -o requires an argument.\n"); - } else { - die("Invalid option '$arg'.\n"); - } - } else { - $opt_filename = $arg; - } -} - -die("Usage: $0 [options] - - -php Output a PHP include file with data in arrays (default) - -simple Output simple tabled output for easy parsing. - -xml Output XML. - - -o Set output filename. Default is to use stdout. - - -dump Dump HTML tree to stdout and quit. - -") unless defined($opt_filename); - - -my $data; -if ($opt_filename =~ /^(http|https):/) { - $data = http_fetch($opt_filename) or die("Could not fetch: $opt_filename\n"); -} else { - open(my $fh, '<', $opt_filename) or die("Error opening '$opt_filename': $!\n"); - $data = do { local $/; <$fh> }; - close($fh); -} - -die("No data in input.\n") unless (defined($data) && $data ne ""); - - -# Filter out certain unneeded elements -$data =~ s/]*>//ig; -$data =~ s/<\/font>//ig; -$data =~ s/<\/?center>//ig; -$data =~ s/
//ig; -$data =~ s/ / /ig; - -### Get some general information -my $otree = parse_html($data); -if ($opt_dump) { - print Dumper(fnode($otree, "html")); - exit; -} - -my %class = (); -my $body = fnode($otree, "body"); -if (defined($body) && defined($$body{"nodes"})) { - foreach my $n (@{$$body{"nodes"}}) { - if ($$n{"name"} eq "text") { - push(@{$class{"info"}}, $$n{"text"}); - } - elsif ($$n{"name"} eq "b") { - push(@{$class{"data"}}, $n); - } - } -} - -# Filter out some more, for easier tree access during table parsing -$data =~ s/<\/?b>//ig; -my $tree = parse_html($data); -my $node = fnode(fnode($tree, "body"), "table"); -die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node); - - -### Parse through the HTML document node tree to find the data we need -my $id = 0; -my $q = $$node{"nodes"}; -my $tunnit = {}; -my $taulu = {}; -my $maxdays = 6; -my $maxhours = 0; - -# Skip zero this way -for (my $i = 1; $i < scalar(@{$q}); $i++) { - my $d = $$q[$i]{"nodes"}; - if (defined($d)) { - foreach my $n (@{$d}) { - my $l = $$n{"nodes"}[0]{"nodes"}; - if (defined($l) && $$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/) { - my $tuntia = $1 / 2; - my $data = []; - my $grouped = 0; - foreach my $h (@{$l}) { - if (defined($$h{"nodes"})) { - foreach my $b (@{$$h{"nodes"}}) { - if (defined($$b{"nodes"})) { - my $text = $$b{"nodes"}[0]{"text"}; - $text =~ s/\.$//; - - $grouped = 1 if ($text =~ /vuorov/); - - push(@$data, $text); - } - } - } - } - - my $tid; - if (scalar(@$data) > 0) { - $id++; - $tid = $id; - } else { - $tid = 0; - } - - my $tpd = 0; - for (my $x = 0; $x < $maxdays; $x++) { - if (!defined($$taulu{$maxhours}{$x})) { - $tpd = $x; - last; - } - } - for (my $t = 0; $t < $tuntia; $t++) { - $$taulu{$maxhours + $t}{$tpd} = $tid; - } - - if (scalar(@$data) > 0) { - # Grouped, if there is another class ID in second slot - $grouped = 1 if ($$data[1] =~ /^[A-Z]\d{6}$/); - $$tunnit{$id} = { "grouped" => $grouped, "day" => $tpd, "start" => $maxhours, "hours" => $tuntia, "data" => $data }; - } - } - } - $maxhours++; - } -} - - -### Open output file, if specified -if (defined($opt_outfile)) { - open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile'.\n"); -} - - -### Output data in desired format -if ($opt_mode eq "php") { - print " array(".join(", ", map { "\"".escape($_)."\""; } @{$class{"info"}})."),\n". - " \"info\" => array(".join(", ", map { "\"".escape(html_collapse($_, 1))."\""; } @{$class{"data"}})."),\n". - " \"info_tags\" => array(".join(", ", map { "\"".escape(html_collapse($_, 0))."\""; } @{$class{"data"}})."),\n". - " \"maxdays\" => $maxdays,\n". - " \"maxhours\" => $maxhours,\n". - ");\n\n"; - - print "\$classDefs = array(\n"; - foreach my $id (sort { $a <=> $b } keys %{$tunnit}) { - print " $id => array("; - foreach my $key (keys %{$$tunnit{$id}}) { - my $a = $$tunnit{$id}{$key}; - print "\"$key\" => "; - if (ref($a) eq "ARRAY") { - print "array(".join(", ", map { "\"".escape($_)."\""; } @$a).")"; - } - elsif ($a =~ /^\d+$/) { - print $a; - } else { - print "\"".escape($a)."\""; - } - print ", "; - } - print "),\n"; - } - - print ");\n". - "\n". - "\$classHourTable = array(\n"; - for (my $y = 0; $y < $maxhours; $y++) { - my $str = ""; - for (my $x = 0; $x < $maxdays; $x++) { - $str .= ", " unless ($str eq ""); - $str .= sprintf "%3d", $$taulu{$y}{$x}; - } - print " array(".$str."),\n"; - } - print ");\n?>\n"; -} - -elsif ($opt_mode eq "xml") { - print "\n". - "\n". - " \n". - " ".join("", map { "".encode_entities($_).""; } @{$class{"info"}})."\n". - " ".join("", map { "".encode_entities(html_collapse($_, 1)).""; } @{$class{"data"}})."\n". - " $maxdays\n". - " $maxhours\n". - " \n"; - - - print "\n"; -} - - - -elsif ($opt_mode eq "simple") { - for (my $y = 0; $y < $maxhours; $y++) { - for (my $x = 0; $x < $maxdays; $x++) { - printf "%3d ", $$taulu{$y}{$x}; - } - print "\n"; - } -} - -close (STDOUT); diff -r 2471bb891299 -r 3d0904d29591 parsedata.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/parsedata.pl Tue Jan 11 20:54:02 2011 +0200 @@ -0,0 +1,438 @@ +#!/usr/bin/perl -w +# +# Fetch and parse HTML format class timetable into more sane formats +# (C) Copyright 2010-2010 Matti Hämäläinen +# +use strict; +use Data::Dumper; +use LWP::UserAgent; +use HTML::Entities; + +my $userAgent = "Lukkari/0.7"; + + +sub urlencode($) +{ + my $value = $_[0]; + $value =~ s/([^a-zA-Z_0-9 ])/"%" . uc(sprintf "%lx" , unpack("C", $1))/eg; + $value =~ tr/ /+/; + return $value; +} + + +sub str_trim($) +{ + my $str = $_[0]; + if (defined($str)) { + $str =~ s/^\s*//; + $str =~ s/\s*$//; + } + return $str; +} + + +sub pop_token_a($) +{ + my $tokens = $_[0]; + return shift(@$tokens); +} + + +sub pop_token($) +{ + return str_trim(pop_token_a($_[0])); +} + + +sub parse_html_str($) +{ + my $tokens = $_[0]; + my $token = pop_token($tokens); + my $str = ""; + $token =~ s/^\s*//; + + return undef unless (substr($token, 0, 1) eq '"'); + $token = substr($token, 1); + + while (defined($token)) { + if (substr($token, -1) eq '"') { + $str .= substr($token, 0, -1); + return $str; + } else { + $str .= $token; + } + $token = shift(@$tokens); + } + return undef; +} + + +sub parse_html_tree($$); + +sub parse_html_tree($$) +{ + my ($tokens, $tree) = @_; + + while (my $token = pop_token($tokens)) { + if ($token =~ /^<[!\/]?[a-zA-Z]+/) { + $token = lc($token); + if ($token =~ /^<\!.*>$/) { + # Ignore comments etc. + } elsif ($token =~ /^<([a-z]+)(.*)>$/) { + my ($name, $args) = ($1, $2); + if ($name eq "meta" || $name eq "img") { + my $tmp = {}; + parse_html_tree($tokens, $tree); + return $tree; + } else { + my $tmp = { "name" => $name, "args" => str_trim($args) }; + parse_html_tree($tokens, $tmp); + push(@{$$tree{"nodes"}}, $tmp); + } + } elsif ($token =~ /^<\/([a-z]+)>$/) { + return $tree; + } else { + die("HORROR TERROR ELITE: $token\n"); + } + } else { + $token = str_trim(decode_entities($token)); + push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0; + } + } + + return $tree; +} + + +sub parse_html($) +{ + return undef unless defined($_[0]); + my $str = $_[0]; + my $res = { "name" => "", "args" => "" }; + $str =~ tr/\r/ /; + $str =~ tr/\n/ /; + my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str); + parse_html_tree(\@tokens, $res); + return $res; +} + + +sub http_fetch($) +{ + my $agent = LWP::UserAgent->new; + $agent->agent($userAgent); + $agent->timeout(10); + + my $req = HTTP::Request->new(GET => $_[0]); + $req->user_agent($userAgent); + my $res = $agent->request($req); + + if ($res->is_success) { + return $res->content; + } else { + print STDERR "HTTP request failed: [".$res->code."] ".$res->message."\n"; + return undef; + } +} + + +sub html_find_node($$$); + +sub html_find_node($$$) +{ + my ($node, $name, $args) = @_; + + if (defined($node)) { + if (ref($node) eq "ARRAY") { + foreach my $n (@$node) { + my $tmp = html_find_node($n, $name, $args); + # Must do it like this, in order not to break the loop + return $tmp if defined($tmp); + } + } elsif (ref($node) eq "HASH") { + if (defined($$node{"name"})) { + if ($$node{"name"} eq $name) { + if ($args ne "") { + if (defined($$node{"args"}) && $$node{"args"} =~ /$args/) { + } else { + return undef; + } + } + return $node; + } else { + return html_find_node($$node{"nodes"}, $name, $args); + } + } + } + } + return undef; +} + + +sub fnode($$) +{ + return html_find_node($_[0], $_[1], ""); +} + + +sub fnodea($$$) +{ + return html_find_node($_[0], $_[1], $_[2]); +} + + +sub escape($) +{ + my $s = $_[0]; + $s =~ s/(['"])/\\$1/g; + return $s; +} + + +sub html_collapse($$); + +sub html_collapse($$) +{ + my ($node, $strip) = @_; + my $str = ""; + + if ($$node{"name"} eq "text") { + $str .= $$node{"text"}; + } else { + $str .= "<".$$node{"name"}.">" unless ($strip); + foreach my $n (@{$$node{"nodes"}}) { + $str .= html_collapse($n, $strip); + } + $str .= "" unless ($strip); + } + + return $str; +} + + +### +### Main program +### +my $modes = "simple|php|xml"; +my $opt_mode = "php"; +my $opt_dump = 0; +my $opt_filename; +my $opt_outfile; + +while (defined(my $arg = shift)) { + if (substr($arg, 0, 1) eq "-") { + if ($arg =~ /^-($modes)$/o) { + $opt_mode = $1; + } + elsif ($arg eq "-dump") { + $opt_dump = 1; + } + elsif ($arg eq "-o") { + $opt_outfile = shift or die("Output filename option -o requires an argument.\n"); + } else { + die("Invalid option '$arg'.\n"); + } + } else { + $opt_filename = $arg; + } +} + +die("Usage: $0 [options] + + -php Output a PHP include file with data in arrays (default) + -simple Output simple tabled output for easy parsing. + -xml Output XML. + + -o Set output filename. Default is to use stdout. + + -dump Dump HTML tree to stdout and quit. + +") unless defined($opt_filename); + + +my $data; +if ($opt_filename =~ /^(http|https):/) { + $data = http_fetch($opt_filename) or die("Could not fetch: $opt_filename\n"); +} else { + open(my $fh, '<', $opt_filename) or die("Error opening '$opt_filename': $!\n"); + $data = do { local $/; <$fh> }; + close($fh); +} + +die("No data in input.\n") unless (defined($data) && $data ne ""); + + +# Filter out certain unneeded elements +$data =~ s/]*>//ig; +$data =~ s/<\/font>//ig; +$data =~ s/<\/?center>//ig; +$data =~ s/
//ig; +$data =~ s/ / /ig; + +### Get some general information +my $otree = parse_html($data); +if ($opt_dump) { + print Dumper(fnode($otree, "html")); + exit; +} + +my %class = (); +my $body = fnode($otree, "body"); +if (defined($body) && defined($$body{"nodes"})) { + foreach my $n (@{$$body{"nodes"}}) { + if ($$n{"name"} eq "text") { + push(@{$class{"info"}}, $$n{"text"}); + } + elsif ($$n{"name"} eq "b") { + push(@{$class{"data"}}, $n); + } + } +} + +# Filter out some more, for easier tree access during table parsing +$data =~ s/<\/?b>//ig; +my $tree = parse_html($data); +my $node = fnode(fnode($tree, "body"), "table"); +die("No table element found in document. Perhaps the format has changed? :(\n") unless defined($node); + + +### Parse through the HTML document node tree to find the data we need +my $id = 0; +my $q = $$node{"nodes"}; +my $tunnit = {}; +my $taulu = {}; +my $maxdays = 6; +my $maxhours = 0; + +# Skip zero this way +for (my $i = 1; $i < scalar(@{$q}); $i++) { + my $d = $$q[$i]{"nodes"}; + if (defined($d)) { + foreach my $n (@{$d}) { + my $l = $$n{"nodes"}[0]{"nodes"}; + if (defined($l) && $$n{"args"} =~ /colspan=6\s+rowspan=(\d+)/) { + my $tuntia = $1 / 2; + my $data = []; + my $grouped = 0; + foreach my $h (@{$l}) { + if (defined($$h{"nodes"})) { + foreach my $b (@{$$h{"nodes"}}) { + if (defined($$b{"nodes"})) { + my $text = $$b{"nodes"}[0]{"text"}; + $text =~ s/\.$//; + + $grouped = 1 if ($text =~ /vuorov/); + + push(@$data, $text); + } + } + } + } + + my $tid; + if (scalar(@$data) > 0) { + $id++; + $tid = $id; + } else { + $tid = 0; + } + + my $tpd = 0; + for (my $x = 0; $x < $maxdays; $x++) { + if (!defined($$taulu{$maxhours}{$x})) { + $tpd = $x; + last; + } + } + for (my $t = 0; $t < $tuntia; $t++) { + $$taulu{$maxhours + $t}{$tpd} = $tid; + } + + if (scalar(@$data) > 0) { + # Grouped, if there is another class ID in second slot + $grouped = 1 if ($$data[1] =~ /^[A-Z]\d{6}$/); + $$tunnit{$id} = { "grouped" => $grouped, "day" => $tpd, "start" => $maxhours, "hours" => $tuntia, "data" => $data }; + } + } + } + $maxhours++; + } +} + + +### Open output file, if specified +if (defined($opt_outfile)) { + open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile'.\n"); +} + + +### Output data in desired format +if ($opt_mode eq "php") { + print " array(".join(", ", map { "\"".escape($_)."\""; } @{$class{"info"}})."),\n". + " \"info\" => array(".join(", ", map { "\"".escape(html_collapse($_, 1))."\""; } @{$class{"data"}})."),\n". + " \"info_tags\" => array(".join(", ", map { "\"".escape(html_collapse($_, 0))."\""; } @{$class{"data"}})."),\n". + " \"maxdays\" => $maxdays,\n". + " \"maxhours\" => $maxhours,\n". + ");\n\n"; + + print "\$classDefs = array(\n"; + foreach my $id (sort { $a <=> $b } keys %{$tunnit}) { + print " $id => array("; + foreach my $key (keys %{$$tunnit{$id}}) { + my $a = $$tunnit{$id}{$key}; + print "\"$key\" => "; + if (ref($a) eq "ARRAY") { + print "array(".join(", ", map { "\"".escape($_)."\""; } @$a).")"; + } + elsif ($a =~ /^\d+$/) { + print $a; + } else { + print "\"".escape($a)."\""; + } + print ", "; + } + print "),\n"; + } + + print ");\n". + "\n". + "\$classHourTable = array(\n"; + for (my $y = 0; $y < $maxhours; $y++) { + my $str = ""; + for (my $x = 0; $x < $maxdays; $x++) { + $str .= ", " unless ($str eq ""); + $str .= sprintf "%3d", $$taulu{$y}{$x}; + } + print " array(".$str."),\n"; + } + print ");\n?>\n"; +} + +elsif ($opt_mode eq "xml") { + print "\n". + "\n". + " \n". + " ".join("", map { "".encode_entities($_).""; } @{$class{"info"}})."\n". + " ".join("", map { "".encode_entities(html_collapse($_, 1)).""; } @{$class{"data"}})."\n". + " $maxdays\n". + " $maxhours\n". + " \n"; + + + print "\n"; +} + + + +elsif ($opt_mode eq "simple") { + for (my $y = 0; $y < $maxhours; $y++) { + for (my $x = 0; $x < $maxdays; $x++) { + printf "%3d ", $$taulu{$y}{$x}; + } + print "\n"; + } +} + +close (STDOUT);