Mercurial > hg > egg-tcls
view fetch_weather.pl @ 201:92f4a489b7ef
Add data type index.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Wed, 18 Jun 2014 20:46:15 +0300 |
parents | bfa9bbcad2e5 |
children | 2ac661d551b3 |
line wrap: on
line source
#!/usr/bin/perl -w ########################################################################## # # Fetch Weather v0.1 by Matti 'ccr' Hamalainen <ccr@tnsp.org> # (C) Copyright 2014 Tecnic Software productions (TNSP) # # Should be ran as a cronjob, and configured properly. # 15 * * * * /absolute/path/to/fetch_weather.pl </path/to/configfile> # # Configuration file example is in fetch_weather.config # # This script is freely distributable under GNU GPL (version 2) license. # ########################################################################## use strict; use LWP::UserAgent; use HTML::Entities; use XML::Simple; use Text::Iconv; use Date::Format; use Date::Parse; use Data::Dumper; ### ### Configuration settings ### my %settings = ( "debug" => 0, "opt_fmi" => 0, "opt_tiehallinto" => 0, "fmi_api_key" => "", "outfile" => "", "http_user_agent" => "Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 6.0) Opera 10.63 [en]", ); ### ### Helper functions ### sub mlog($) { print STDERR $_[0]; } sub fetch_http($) { my $agent = LWP::UserAgent->new; $agent->agent($settings{"http_user_agent"}); $agent->timeout(10); my $req = HTTP::Request->new(GET => $_[0]); return $agent->request($req); } sub parse_timestamp($$) { my ($str, $offs) = @_; if ($str =~ /^(\d+):(\d+)$/) { return $offs + (60 * 60 * $1) + ($2 * 60); } else { return $offs; } } sub format_time_gmt($) { # 2012-02-27T00:00:00Z return time2str("%Y-%m-%dT%TZ", $_[0], "UTC"); } sub str_trim($) { my $str = $_[0]; if (defined($str)) { $str =~ s/^\s*//; $str =~ s/\s*$//; } return $str; } ### ### Loose HTML parser ### sub pop_token_a($) { my $tokens = $_[0]; return shift(@$tokens); } sub pop_token($) { return str_trim(pop_token_a($_[0])); } sub parse_html_str($) { my $tokens = $_[0]; my $token = pop_token($tokens); my $str = ""; $token =~ s/^\s*//; return undef unless (substr($token, 0, 1) eq '"'); $token = substr($token, 1); while (defined($token)) { my $tmp = $token; $tmp =~ s/\s*$//; if (substr($tmp, -1) eq '"') { $str .= substr($tmp, 0, -1); return $str; } else { $str .= $token; } $token = shift(@$tokens); } return undef; } sub parse_html_tree($$); sub parse_html_tree($$) { my ($tokens, $tree) = @_; while (my $token = pop_token($tokens)) { if ($token =~ /^<[!\/]?[a-zA-Z]+/) { $token = lc($token); if ($token =~ /^<\!.*>$/) { # Ignore comments etc. } elsif ($token =~ /^<([a-z]+[1-9]*)(.*)>$/) { my ($name, $args) = ($1, $2); if ($name eq "meta" || $name eq "img") { my $tmp = {}; parse_html_tree($tokens, $tree); return $tree; } else { my $tmp = { "name" => $name, "args" => str_trim($args) }; parse_html_tree($tokens, $tmp); push(@{$$tree{"nodes"}}, $tmp); } } elsif ($token =~ /^<\/([a-z]+[1-9]*)>$/) { return $tree; } else { die("HORROR TERROR ELITE: '$token'\n"); } } else { $token = str_trim(decode_entities($token)); push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0; } } return $tree; } sub parse_html($) { return undef unless defined($_[0]); my $str = $_[0]; my $res = { "name" => "", "args" => "" }; $str =~ tr/\r/ /; $str =~ tr/\n/ /; my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str); parse_html_tree(\@tokens, $res); return $res; } sub html_find_node($$$); sub html_find_node($$$) { my ($node, $name, $args) = @_; if (defined($node)) { if (ref($node) eq "ARRAY") { foreach my $n (@$node) { my $tmp = html_find_node($n, $name, $args); # Must do it like this, in order not to break the loop return $tmp if defined($tmp); } } elsif (ref($node) eq "HASH") { if (defined($$node{"name"})) { if ($$node{"name"} eq $name) { if ($args ne "") { if (defined($$node{"args"}) && $$node{"args"} =~ /$args/) { } else { return html_find_node($$node{"nodes"}, $name, $args); } } return $node; } else { return html_find_node($$node{"nodes"}, $name, $args); } } } } return undef; } ### ### Helper functions for locating/traversing nodes ### in the parsed data tree structure. ### sub fnode($$) { return html_find_node($_[0], $_[1], ""); } sub fnodea($$$) { return html_find_node($_[0], $_[1], $_[2]); } sub get_node($$$) { return defined($_[0]->[$_[2]]{"nodes"}[0]{$_[1]}) ? $_[0]->[$_[2]]{"nodes"}[0]{$_[1]} : ""; } ### ### Configuration handling ### sub opt_chk_bool($) { if (defined($settings{$_[0]})) { my $val = $settings{$_[0]}; return ($val == 1 || $val eq "true" || $val eq "on" || $val eq "1"); } else { return 0; } } sub opt_chk_valid($$) { if (defined($settings{$_[0]})) { my $val = $settings{$_[0]}; return length($val) >= $_[1]; } else { return 0; } } sub opt_get($) { if (defined($settings{$_[0]})) { return $settings{$_[0]}; } else { return undef; } } sub opt_read_config($) { my $filename = $_[0]; my $errors = 0; my $line = 0; open(CONFFILE, "<", $filename) or die("Could not open configuration '".$filename."'!\n"); while (<CONFFILE>) { $line++; chomp; if (/(^\s*#|^\s*$)/) { # Ignore comments and empty lines } elsif (/^\s*\"?([a-zA-Z0-9_]+)\"?\s*=>?\s*(\d+),?\s*$/) { my $key = lc($1); my $value = $2; if (defined($settings{$key})) { $settings{$key} = $value; } else { mlog("[$filename:$line] Unknown setting '$key' = $value\n"); $errors = 1; } } elsif (/^\s*\"?([a-zA-Z0-9_]+)\"?\s*=>?\s*\"(.*?)\",?\s*$/) { my $key = lc($1); my $value = $2; if (defined($settings{$key})) { $settings{$key} = $value; } else { mlog("[$filename:$line] Unknown setting '$key' = '$value'\n"); $errors = 1; } } else { mlog("[$filename:$line] Syntax error: $_\n"); $errors = 1; } } close(CONFFILE); return $errors; } ### ### Main program begins ### my $weatherdata = {}; die( "Weather Fetch v0.1 by ccr/TNSP <ccr\@tnsp.org>\n". "Usage: $0 <config file>\n" ) unless scalar(@ARGV) >= 1; my $cfgfile = shift; opt_read_config($cfgfile) == 0 or die("Errors while parsing configuration file '".$cfgfile."'.\n"); ### ### Fetch tiehallinto road weather measurement data ### if (opt_chk_bool("opt_tiehallinto")) { for (my $i = 1; $i <= 22; $i++) { my $res = fetch_http("http://alk.tiehallinto.fi/alk/tiesaa/tiesaa_maak_".$i.".html"); if ($res->code >= 200 && $res->code <= 201) { my $data = $res->decoded_content; # Filter out crap tags we don't want or need $data =~ s/\n/§/g; $data =~ s/<!--.*?-->//ig; $data =~ s/<map[^>]*>.*?<\/map>//ig; $data =~ s/<form[^>]*>.*?<\/form>//ig; $data =~ s/<script[^>]*>.*?<\/script>//ig; $data =~ s/<meta[^>]*>//ig; $data =~ s/<font[^>]*>//ig; $data =~ s/<\/font>//ig; $data =~ s/<span[^>]*>//ig; $data =~ s/<\/span>//ig; $data =~ s/<\/?b>//ig; $data =~ s/<br>//ig; $data =~ s/ / /ig; $data =~ s/§/\n/g; # Parse the HTML mess my $otree = parse_html($data); # Find our desired element nodes my $odata = fnodea(fnode($otree, "body"), "div", "class=elementc"); my $oupdate = fnode($odata, "p"); my $time_base = str2time("00:00"); if ($oupdate) { my $tmp = $oupdate->{"nodes"}[0]{"text"}; if ($tmp =~ /:\s+(\d\d)\.(\d\d)\.(\d\d\d\d)\s+(\d\d:\d\d)/) { $time_base = str2time($3."-".$2."-".$1); } } my $oelems = fnode($odata, "table")->{"nodes"}; for (my $n = 1; $n < scalar(@$oelems); $n++) { my $fdata = @$oelems[$n]->{"nodes"}; $weatherdata->{$fdata->[0]{"nodes"}[0]{"text"}} = [ 0, parse_timestamp(get_node($fdata, "text", 1), $time_base), get_node($fdata, "text", 2), get_node($fdata, "text", 3), get_node($fdata, "text", 4), ]; } } } } ### ### Fetch FMI data ### if (opt_chk_bool("opt_fmi")) { die("FMI data scrape enabled, but no API key set.\n") unless opt_chk_valid("fmi_api_key", 10); my $uri = "http://data.fmi.fi/fmi-apikey/".opt_get("fmi_api_key"). "/wfs?request=getFeature&storedquery_id=fmi::observations::weather::". "multipointcoverage". # "timevaluepair". "&starttime=".format_time_gmt(time() - 10*60)."&endtime=".format_time_gmt(time()). "¶meters=temperature,humidity&maxlocations=50&bbox=19,59,32,70"; print STDERR "FMI URI: ".$uri."\n" if opt_chk_bool("debug"); my $res = fetch_http($uri); if ($res->code >= 200 && $res->code <= 201) { my $xml = XMLin($res->decoded_content); my $time_base = time(); my $fdata = $xml->{"wfs:member"}{"omso:GridSeriesObservation"}; my $fshit = $fdata->{"om:result"}{"gmlcov:MultiPointCoverage"}; my @farray = (); foreach my $fline (split(/\n/, $fshit->{"gml:domainSet"}{"gmlcov:SimpleMultiPoint"}{"gmlcov:positions"})) { if ($fline =~ /^\s*([\+\-]?\d+\.\d*)\s+([\+\-]?\d+\.\d*)\s+(\d+)\s*$/) { push(@farray, {"lat" => $1, "long" => $2, "time" => $3}); } } my $findex = 0; foreach my $fline (split(/\n/, $fshit->{"gml:rangeSet"}{"gml:DataBlock"}{"gml:doubleOrNilReasonTupleList"})) { if ($fline =~ /^\s*([\+\-]?\d+\.\d*|NaN)\s+([\+\-]?\d+\.\d*|NaN)\s*$/) { $farray[$findex]{"temp"} = $1 if ($1 ne "NaN"); $farray[$findex]{"humidity"} = $2 if ($2 ne "NaN"); $findex++; } } # XXX Hashify the array into lat/long keys # This is horrible :S my $fcrap = $fdata->{"om:featureOfInterest"}{"sams:SF_SpatialSamplingFeature"}{"sams:shape"}{"gml:MultiPoint"}{"gml:pointMember"}; foreach my $fnode (@{$fcrap}) { my $floc = $fnode->{"gml:Point"}; if ($floc->{"gml:pos"} =~ /^\s*([\+\-]?\d+\.\d*)\s+([\+\-]?\d+\.\d*)\s*$/) { my ($flat, $flong) = ($1, $2); # Should use a hash - foreach my $flol (@farray) { if ($flol->{"lat"} == $flat && $flol->{"long"} == $flong) { $weatherdata->{$floc->{"gml:name"}} = [ 1, $flol->{"time"}, $flol->{"temp"}, "", "", defined($flol->{"humidity"}) ? $flol->{"humidity"} : "", ]; } } } } } } ### ### Output ### if (opt_chk_valid("outfile", 1)) { open(STDOUT, '>', opt_get("outfile")) or die("Could not open output file '".opt_get("outfile")."'.\n"); } binmode STDOUT, ':encoding(utf-8)'; foreach my $key (sort { $a cmp $b } keys %$weatherdata) { print STDOUT $key."|".join("|", @{$weatherdata->{$key}})."\n"; } close(STDOUT);