# HG changeset patch # User Matti Hamalainen # Date 1401701351 -10800 # Node ID db2b1f74d9949cb77fc5e2d3a00b6374c475d86e # Parent c8fa73ba67da8a86a9fbb0024da897a90f89f89c Initial import of weather data fetching backend, written in Perl. diff -r c8fa73ba67da -r db2b1f74d994 fetch_weather.pl --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/fetch_weather.pl Mon Jun 02 12:29:11 2014 +0300 @@ -0,0 +1,269 @@ +#!/usr/bin/perl -w +use strict; +use LWP::UserAgent; +use HTML::Entities; +use XML::Simple; +use Text::Iconv; +use Date::Parse; +use Data::Dumper; + +sub str_trim($) +{ + my $str = $_[0]; + if (defined($str)) { + $str =~ s/^\s*//; + $str =~ s/\s*$//; + } + return $str; +} + + +sub pop_token_a($) +{ + my $tokens = $_[0]; + return shift(@$tokens); +} + + +sub pop_token($) +{ + return str_trim(pop_token_a($_[0])); +} + + +sub parse_html_str($) +{ + my $tokens = $_[0]; + my $token = pop_token($tokens); + my $str = ""; + $token =~ s/^\s*//; + + return undef unless (substr($token, 0, 1) eq '"'); + $token = substr($token, 1); + + while (defined($token)) { + my $tmp = $token; + $tmp =~ s/\s*$//; + if (substr($tmp, -1) eq '"') { + $str .= substr($tmp, 0, -1); + return $str; + } else { + $str .= $token; + } + $token = shift(@$tokens); + } + return undef; +} + + +sub parse_html_tree($$); + +sub parse_html_tree($$) +{ + my ($tokens, $tree) = @_; + + while (my $token = pop_token($tokens)) { + if ($token =~ /^<[!\/]?[a-zA-Z]+/) { + $token = lc($token); + if ($token =~ /^<\!.*>$/) { + # Ignore comments etc. + } elsif ($token =~ /^<([a-z]+[1-9]*)(.*)>$/) { + my ($name, $args) = ($1, $2); + if ($name eq "meta" || $name eq "img") { + my $tmp = {}; + parse_html_tree($tokens, $tree); + return $tree; + } else { + my $tmp = { "name" => $name, "args" => str_trim($args) }; + parse_html_tree($tokens, $tmp); + push(@{$$tree{"nodes"}}, $tmp); + } + } elsif ($token =~ /^<\/([a-z]+[1-9]*)>$/) { + return $tree; + } else { + die("HORROR TERROR ELITE: '$token'\n"); + } + } else { + $token = str_trim(decode_entities($token)); + push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0; + } + } + + return $tree; +} + + +sub parse_html($) +{ + return undef unless defined($_[0]); + my $str = $_[0]; + my $res = { "name" => "", "args" => "" }; + $str =~ tr/\r/ /; + $str =~ tr/\n/ /; + my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str); + parse_html_tree(\@tokens, $res); + return $res; +} + +sub html_find_node($$$); + +sub html_find_node($$$) +{ + my ($node, $name, $args) = @_; + + if (defined($node)) { + if (ref($node) eq "ARRAY") { + foreach my $n (@$node) { + my $tmp = html_find_node($n, $name, $args); + # Must do it like this, in order not to break the loop + return $tmp if defined($tmp); + } + } elsif (ref($node) eq "HASH") { + if (defined($$node{"name"})) { + if ($$node{"name"} eq $name) { + if ($args ne "") { + if (defined($$node{"args"}) && $$node{"args"} =~ /$args/) { + } else { + return html_find_node($$node{"nodes"}, $name, $args); + } + } + return $node; + } else { + return html_find_node($$node{"nodes"}, $name, $args); + } + } + } + } + return undef; +} + + +sub fnode($$) +{ + return html_find_node($_[0], $_[1], ""); +} + + +sub fnodea($$$) +{ + return html_find_node($_[0], $_[1], $_[2]); +} + + +sub fetch_http($) +{ + my $tmpAgent = LWP::UserAgent->new; + $tmpAgent->agent("Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 6.0) Opera 10.63 [en]"); + $tmpAgent->timeout(10); + + my $tmpRequest = HTTP::Request->new(GET => $_[0]); + + return $tmpAgent->request($tmpRequest); +} + + +sub get_node($$$) +{ + return defined($_[0]->[$_[2]]{"nodes"}[0]{$_[1]}) ? $_[0]->[$_[2]]{"nodes"}[0]{$_[1]} : ""; +} + + +sub parse_timestamp($$) +{ + return $_[1] + str2time($_[0]) - time(); +} + + +### +### Main program begins +### +die( +"Weather Fetch v0.1 by ccr/TNSP \n". +"Usage: $0 \n" +) unless scalar(@ARGV) >= 2; + +my $opt_outfile = shift; +my $opt_api_key = shift; +my $weatherdata = {}; + +### Fetch tiehallinto data +for (my $i = 1; $i <= 22; $i++) +{ + my $res = fetch_http("http://alk.tiehallinto.fi/alk/tiesaa/tiesaa_maak_".$i.".html"); + if ($res->code >= 200 && $res->code <= 201) + { + my $data = $res->decoded_content; + + # Filter out crap tags we don't want or need + $data =~ s/\n/§/g; + $data =~ s///ig; + $data =~ s/]*>.*?<\/map>//ig; + $data =~ s/]*>.*?<\/form>//ig; + $data =~ s/]*>.*?<\/script>//ig; + $data =~ s/]*>//ig; + $data =~ s/]*>//ig; + $data =~ s/<\/font>//ig; + $data =~ s/]*>//ig; + $data =~ s/<\/span>//ig; + $data =~ s/<\/?b>//ig; + + $data =~ s/
//ig; + $data =~ s/ / /ig; + $data =~ s/§/\n/g; + + # Parse the HTML mess + my $otree = parse_html($data); + + # Find our desired element nodes + my $odata = fnodea(fnode($otree, "body"), "div", "class=elementc"); + my $oupdate = fnode($odata, "p"); + my $time_base = time(); + if ($oupdate) { + my $tmp = $oupdate->{"nodes"}[0]{"text"}; + if ($tmp =~ /:\s+(\d\d\.\d\d\.\d\d\d\d)\s+(\d\d:\d\d)/) { + $time_base = str2time($1); + } + } + + my $oelems = fnode($odata, "table")->{"nodes"}; + for (my $n = 1; $n < scalar(@$oelems); $n++) + { + my $fdata = @$oelems[$n]->{"nodes"}; + $weatherdata->{$fdata->[0]{"nodes"}[0]{"text"}} = + [ + parse_timestamp(get_node($fdata, "text", 1), $time_base), + get_node($fdata, "text", 2), + get_node($fdata, "text", 3), + get_node($fdata, "text", 4), + get_node($fdata, "text", 5), + ]; + } + print STDERR "."; + } +} + +### Fetch FMI data +my $res = fetch_http("http://data.fmi.fi/fmi-apikey/".$opt_api_key. + "/wfs?request=getFeature&storedquery_id=fmi::observations::weather::cities::multipointcoverage". + "¶meters=temperature,humidity"); + +if ($res->code >= 200 && $res->code <= 201) +{ + my $xml = XMLin($res->decoded_content, + KeyAttr => { server => 'name' }, + ForceArray => [ 'server', 'address' ]); +} + +### Output +if (defined($opt_outfile)) { + open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile'.\n"); +} + +binmode STDOUT, ':encoding(utf-8)'; + +foreach my $key (sort { $a cmp $b } keys %$weatherdata) +{ + print STDOUT $key."|".join("|", @{$weatherdata->{$key}})."\n"; +} + +close(STDOUT);