view fetch_weather.pl @ 325:fe12434d6cbd

weather: Bump version.
author Matti Hamalainen <ccr@tnsp.org>
date Thu, 21 May 2015 10:00:59 +0300
parents 19c64798de91
children b8ae5e99341b
line wrap: on
line source

#!/usr/bin/perl -w
##########################################################################
#
# Fetch Weather v0.3 by Matti 'ccr' Hamalainen <ccr@tnsp.org>
# (C) Copyright 2014-2015 Tecnic Software productions (TNSP)
# This script is freely distributable under GNU GPL (version 2) license.
#
# Should be run as a cronjob, and configured properly.
# */10 * * * *     perl -w /absolute/path/to/fetch_weather.pl /path/to/configfile
#
# Configuration file example is in fetch_weather.config
#
# Requires various Perl modules, in Debian the packages should be:
# libwww-perl libxml-simple-perl libtimedate-perl
#
#
##########################################################################
use strict;
use utf8;
use LWP::UserAgent;
use HTML::Entities;
use XML::Simple;
use Date::Format;
use Date::Parse;
use Data::Dumper;


###
### Configuration settings
###
# Default values for every recognised setting. opt_read_config() only
# accepts keys that already exist in this hash, so a new option must be
# added here before it can be used in a configuration file.
my %settings = (
  # Diagnostic verbosity on STDERR: values above 0 print the FMI URI and
  # tiehallinto fetch failures, above 1 also the fetched URIs and the
  # collected tiehallinto data, above 2 also a dump of each parsed HTML tree.
  "debug" => 0,
  # Enables the FMI observation fetch when true (see opt_chk_bool()).
  "opt_fmi" => 0,
  # Enables the tiehallinto road weather fetch when true.
  "opt_tiehallinto" => 0,
  # API key inserted into the FMI request URI; must be at least 10
  # characters long when "opt_fmi" is enabled.
  "fmi_api_key" => "",
  # Output file name; when empty the results are printed to STDOUT.
  "outfile" => "",
  # User-Agent string sent by fetch_http().
  "http_user_agent" => "Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 6.0) Opera 10.63  [en]",
);


###
### Helper functions
###
# Write a diagnostic message to STDERR exactly as given; the caller is
# responsible for any trailing newline.
sub mlog($)
{
  my ($message) = @_;
  print STDERR $message;
}


# Perform an HTTP GET request for the given URI and return the response
# object produced by LWP::UserAgent. The User-Agent string is taken from
# the "http_user_agent" setting and the request timeout is 20 seconds.
sub fetch_http($)
{
  my ($uri) = @_;

  my $client = LWP::UserAgent->new;
  $client->timeout(20);
  $client->agent($settings{"http_user_agent"});

  return $client->request(HTTP::Request->new(GET => $uri));
}


# Convert a "HH:MM" clock string into seconds and add the result to the
# given base offset. A string that is not of the form digits:digits
# yields the base offset unchanged.
sub parse_timestamp($$)
{
  my ($clock, $base) = @_;

  return $base unless ($clock =~ /^(\d+):(\d+)$/);

  my ($hours, $minutes) = ($1, $2);
  return $base + (60 * 60 * $hours) + ($minutes * 60);
}


# Format an epoch timestamp as a UTC date-time string of the form
# 2012-02-27T00:00:00Z, using Date::Format's time2str().
sub format_time_gmt($)
{
  my ($epoch) = @_;
  my $layout = "%Y-%m-%dT%TZ";
  return time2str($layout, $epoch, "UTC");
}


# Return a copy of the argument with leading and trailing whitespace
# removed. An undefined argument is returned as undef.
sub str_trim($)
{
  my ($text) = @_;
  return $text unless defined($text);

  for ($text)
  {
    s/^\s*//;
    s/\s*$//;
  }
  return $text;
}


###
### Loose HTML parser
###
# Remove and return the first token of the referenced token array,
# without any trimming. Returns undef when the array is empty.
sub pop_token_a($)
{
  my ($queue) = @_;
  my $head = shift(@{$queue});
  return $head;
}


# Remove the first token of the referenced token array and return it
# with surrounding whitespace trimmed (undef when the array is empty).
sub pop_token($)
{
  my ($queue) = @_;
  my $raw = pop_token_a($queue);
  return str_trim($raw);
}


# Parse a double-quoted string that may span several tokens.
#
# The first token (trimmed) must begin with a double quote; tokens are
# then consumed from the referenced array until one ends with a double
# quote (ignoring trailing whitespace). Returns the text between the
# quotes, or undef when the token list is exhausted, the first token does
# not start with a quote, or no closing quote is found.
sub parse_html_str($)
{
  my $tokens = $_[0];
  my $token = pop_token($tokens);
  my $str = "";

  # An exhausted token list means there is nothing to parse; bail out
  # before touching the undefined token.
  return undef unless defined($token);
  $token =~ s/^\s*//;

  return undef unless (substr($token, 0, 1) eq '"');
  $token = substr($token, 1);
  
  while (defined($token)) {
    # Look for the closing quote with trailing whitespace ignored, but
    # append the untouched token when the string continues.
    my $tmp = $token;
    $tmp =~ s/\s*$//;
    if (substr($tmp, -1) eq '"') {
      $str .= substr($tmp, 0, -1);
      return $str;
    } else {
      $str .= $token;
    }
    $token = shift(@$tokens);
  }
  return undef;
}


sub parse_html_tree($$);

# Recursively build a loose element tree from a list of HTML tokens.
#
# Parameters:
#   $tokens - reference to the token array; tokens are consumed from it
#   $tree   - hash reference of the node whose children are being parsed;
#             child nodes are appended to its "nodes" array
#
# Element nodes are hashes with "name" and "args" (the trimmed attribute
# text); text nodes have the name "text" and carry the entity-decoded
# content in "text". Tag names and attributes are lower-cased.
#
# Returns $tree when a closing tag or the end of the token list is
# reached, or undef when a tag-like token cannot be parsed at this level.
sub parse_html_tree($$)
{
  my ($tokens, $tree) = @_;

  # Test for definedness rather than truth: a text token of "0" is valid
  # data and must not end the parse of this level.
  while (defined(my $token = pop_token($tokens))) {
    if ($token =~ /^<[!\/]?[a-zA-Z]+/) {
      $token = lc($token);
      if ($token =~ /^<\!.*>$/) {
        # Ignore comments etc.
      } elsif ($token =~ /^<([a-z]+[1-9]*)(.*)>$/) {
        my ($name, $args) = ($1, $2);
        if ($name eq "meta" || $name eq "img") {
          # These elements have no closing tag and are not stored: keep
          # parsing the following tokens as siblings in the current node.
          parse_html_tree($tokens, $tree);
          return $tree;
        } else {
          my $tmp = { "name" => $name, "args" => str_trim($args) };
          parse_html_tree($tokens, $tmp);
          push(@{$$tree{"nodes"}}, $tmp);
        }
      } elsif ($token =~ /^<\/([a-z]+[1-9]*)>$/) {
        # Any closing tag ends the current level; names are not matched.
        return $tree;
      } else {
        print STDERR "ERROR: Failed to parse '$token'\n";
        return undef;
      }
    } else {
      $token = str_trim(decode_entities($token));
      push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0;
    }
  }
  
  return $tree;
}


# Parse an HTML string into a loose node tree.
#
# Line breaks are flattened to spaces, the text is split into tag and
# text tokens (whitespace-only tokens are dropped) and the tokens are
# handed to parse_html_tree(). Returns the root node (a hash with empty
# "name" and "args"), or undef for undefined input or a parse failure.
sub parse_html($)
{
  my ($html) = @_;
  return undef unless defined($html);

  my $root = { "name" => "", "args" => "" };

  # Carriage returns and newlines both become single spaces.
  $html =~ tr/\r\n/  /;

  my @tokens = grep { $_ !~ /^\s*$/ } split(/(<\/?[a-zA-Z]+.*?>)/, $html);

  return defined(parse_html_tree(\@tokens, $root)) ? $root : undef;
}


sub html_find_node($$$);

# Depth-first search for the first node with the given name.
#
# Parameters:
#   $node - a node hash, an array reference of nodes, or undef
#   $name - element name to look for
#   $args - regular expression the node's "args" must match; an empty
#           string accepts any arguments
#
# A node whose name matches but whose arguments do not is not returned;
# the search continues among its children instead. Returns the matching
# node hash, or undef when nothing matches.
sub html_find_node($$$)
{
  my ($node, $name, $args) = @_;
  return undef unless defined($node);

  my $kind = ref($node);

  if ($kind eq "ARRAY") {
    # Return the first hit among the list members, in order.
    foreach my $child (@$node) {
      my $hit = html_find_node($child, $name, $args);
      return $hit if defined($hit);
    }
    return undef;
  }

  return undef unless ($kind eq "HASH" && defined($$node{"name"}));

  my $accepted = $$node{"name"} eq $name &&
    ($args eq "" || (defined($$node{"args"}) && $$node{"args"} =~ /$args/));

  return $accepted ? $node : html_find_node($$node{"nodes"}, $name, $args);
}


###
### Helper functions for locating/traversing nodes
### in the parsed data tree structure.
###

# Find the first node with the given name (any arguments) below the
# given node. With a third argument, return that field of the found node
# instead of the node itself. Returns undef when no node is found.
sub fnodep(@)
{
  my ($node, $name, $field) = @_;
  my $found = html_find_node($node, $name, "");

  return $found unless defined($field);
  return defined($found) ? $found->{$field} : undef;
}


# Find the first node with the given name whose arguments match the
# given pattern; a thin wrapper around html_find_node().
sub fnodea($$$)
{
  my ($node, $name, $args) = @_;
  return html_find_node($node, $name, $args);
}


# Return the named field of the first child node of the element at the
# given index in a node list, or an empty string when it is not defined.
#
# Arguments: node list (array reference), field name, index.
sub get_node($$$)
{
  my ($field, $index) = @_[1, 2];

  # The list is accessed through $_[0] so the lookup works on the caller's
  # own variable, exactly as a direct access would.
  my $value = $_[0]->[$index]{"nodes"}[0]{$field};
  return defined($value) ? $value : "";
}

# Same lookup as get_node(), with the result converted to lower case.
sub get_node_lc($$$)
{
  my $value = get_node($_[0], $_[1], $_[2]);
  return lc($value);
}


# Rain state words and the phrase each one is replaced with.
my %th_rain_states =
(
  "Pouta" => "poutaa",
  "Heikko" => "heikkoa sadetta",
  "Kohtalainen" => "kohtalaista sadetta",
  "Voimakas" => "voimakasta sadetta",
);

# The same table keyed by lower-cased word, so that a case-insensitive
# match can always be resolved to its replacement.
my %th_rain_states_lc = map { (lc($_) => $th_rain_states{$_}) } keys %th_rain_states;

# Alternation of all known words, longest first.
my $th_rain_states_k = join("|", map {quotemeta} sort { length($b)<=>length($a) } keys %th_rain_states);

# Replace every known rain state word in the argument with its phrase
# from %th_rain_states, ignoring letter case, and return the result.
# Text that contains no known word is returned unchanged.
sub translate_rain($)
{
  my $tmp = $_[0];
  # The match is case-insensitive, so the lookup key must be normalised;
  # using the matched text directly would miss e.g. "pouta" or "HEIKKO".
  $tmp =~ s/($th_rain_states_k)/$th_rain_states_lc{lc($1)}/ige;
  return $tmp;
}

# Descriptions for each cloud cover value from 0 to 8.
my %th_cloud_states = (
  0 => "selkeää",
  1 => "melkein selkeää",
  2 => "verrattain selkeää",
  3 => "verrattain selkeää",
  4 => "puolipilvistä",
  5 => "verrattain pilvistä",
  6 => "verrattain pilvistä",
  7 => "melkein pilvistä",
  8 => "pilvistä",
);

# Translate a cloud cover value into "<description> (<n>/8)".
#
# An empty string or "NaN" yields an empty string. The value is
# truncated to an integer; when that integer is not a key of
# %th_cloud_states the truncated integer itself is returned.
sub translate_clouds($)
{
  my ($raw) = @_;
  return "" if ($raw eq "" || $raw eq "NaN");

  my $amount = int($raw);

  # Keys are distinct integers, so at most one of them can compare equal.
  my ($known) = grep { $amount == $_ } keys %th_cloud_states;

  return defined($known) ? $th_cloud_states{$known}." (".$known."/8)" : $amount;
}


# Return the argument unchanged, or an empty string when it is undefined.
sub plonk_data($)
{
  my ($value) = @_;
  return "" unless defined($value);
  return $value;
}


# Return the argument converted to lower case, or an empty string when
# it is undefined.
sub plonk_data_lc($)
{
  my ($value) = @_;
  return "" unless defined($value);
  return lc($value);
}


###
### Configuration handling
###
# Return 1 when the named setting holds a true value, otherwise 0.
#
# Accepted true values are the strings "true", "on" and "1", and any
# value that compares numerically equal to 1. An undefined or unknown
# setting is false.
sub opt_chk_bool($)
{
  my $val = $settings{$_[0]};
  return 0 unless defined($val);

  # Check the textual spellings first so that they never reach the
  # numeric comparison below.
  return 1 if ($val eq "true" || $val eq "on" || $val eq "1");

  # Other strings (e.g. "off") are still compared as numbers, as before,
  # but without emitting an "isn't numeric" warning on every run.
  no warnings 'numeric';
  return ($val == 1) ? 1 : 0;
}


# Check that the named setting is defined and that its value is at least
# the given number of characters long. Returns 0 for an undefined
# setting, otherwise the result of the length comparison.
sub opt_chk_valid($$)
{
  my ($key, $minlen) = @_;
  my $val = $settings{$key};

  return 0 unless defined($val);
  return length($val) >= $minlen;
}


# Return the named setting truncated to an integer, or -1 when the
# setting is not defined.
sub opt_get_int($)
{
  my ($key) = @_;
  return defined($settings{$key}) ? int($settings{$key}) : -1;
}


# Return the value of the named setting, or undef when it is not defined.
sub opt_get($)
{
  my ($key) = @_;
  my $val = $settings{$key};
  return defined($val) ? $val : undef;
}


# Read settings from a configuration file into %settings.
#
# Each line is either blank, a comment starting with "#", or an assignment
# of the form
#   key = 123        key => 123,
#   key = "text"     "key" => "text",
# Keys are lower-cased and must already exist in %settings. Unknown keys
# and malformed lines are reported through mlog() with file name and line
# number.
#
# Returns 0 when the whole file was accepted and 1 when any line was
# rejected. Dies when the file cannot be opened.
sub opt_read_config($)
{
  my $filename = $_[0];
  my $errors = 0;
  my $line = 0;

  # Use a lexical handle and a named line variable so that no global
  # filehandle or the caller's $_ is clobbered.
  open(my $fh, "<", $filename) or die("Could not open configuration '".$filename."': $!\n");
  while (my $text = <$fh>)
  {
    $line++;
    chomp($text);

    # Ignore comments and empty lines
    next if ($text =~ /(^\s*#|^\s*$)/);

    # $shown is the value as it appears in the "unknown setting" message:
    # bare for numeric values, quoted for string values.
    my ($key, $value, $shown);
    if ($text =~ /^\s*\"?([a-zA-Z0-9_]+)\"?\s*=>?\s*(\d+),?\s*$/) {
      ($key, $value) = (lc($1), $2);
      $shown = $value;
    } elsif ($text =~ /^\s*\"?([a-zA-Z0-9_]+)\"?\s*=>?\s*\"(.*?)\",?\s*$/) {
      ($key, $value) = (lc($1), $2);
      $shown = "'".$value."'";
    } else {
      mlog("[$filename:$line] Syntax error: $text\n");
      $errors = 1;
      next;
    }

    if (defined($settings{$key})) {
      $settings{$key} = $value;
    } else {
      mlog("[$filename:$line] Unknown setting '$key' = $shown\n");
      $errors = 1;
    }
  }
  close($fh);
  return $errors;
}


###
### Main program begins
###
# Collected results: location name => array of output fields.
my $weatherdata = {};

# A configuration file argument is mandatory; print the usage otherwise.
if (scalar(@ARGV) < 1)
{
  die(
    "Weather Fetch v0.3 by ccr/TNSP <ccr\@tnsp.org>\n".
    "Usage: $0 <config file>\n"
  );
}

my $cfgfile = shift(@ARGV);
if (opt_read_config($cfgfile) != 0)
{
  die("Errors while parsing configuration file '".$cfgfile."'.\n");
}


###
### Fetch tiehallinto road weather measurement data
###
# Fetch and parse the tiehallinto road weather pages when enabled. Every
# table row after the first on each page is stored in $weatherdata under
# the text of its first cell.
if (opt_chk_bool("opt_tiehallinto"))
{
  # The pages are numbered 1..22 (tiesaa_maak_<n>.html).
  for (my $i = 1; $i <= 22; $i++)
  {
    my $uri = "http://alk.tiehallinto.fi/alk/tiesaa/tiesaa_maak_".$i.".html";
    print STDERR "Fetching ".$uri." ...\n" if (opt_get_int("debug") > 1);
    my $res = fetch_http($uri);
    if ($res->code >= 200 && $res->code <= 201)
    {
      my $data = $res->decoded_content;
      if (!defined($data))
      {
        # decoded_content can fail; there is nothing to parse then.
        print STDERR "ERROR: Failed to decode content of '".$uri."'.\n";
        next;
      }

      # Filter out tags we don't want or need. The /s modifier lets "."
      # span line breaks, so multi-line comments and elements are removed
      # without rewriting any character of the page text.
      $data =~ s/<!--.*?-->//igs;
      $data =~ s/<map[^>]*>.*?<\/map>//igs;
      $data =~ s/<form[^>]*>.*?<\/form>//igs;
      $data =~ s/<script[^>]*>.*?<\/script>//igs;
      $data =~ s/<meta[^>]*>//ig;
      $data =~ s/<font[^>]*>//ig;
      $data =~ s/<\/font>//ig;
      $data =~ s/<span[^>]*>//ig;
      $data =~ s/<\/span>//ig;
      $data =~ s/<\/?b>//ig;

      $data =~ s/<br>//ig;
      $data =~ s/&nbsp;/ /ig;

      # Parse the HTML mess
      my $otree = parse_html($data);
      if (!defined($otree))
      {
        print STDERR "ERROR: Failed to parse file '".$uri."'.\n";
        next;
      }

      print STDERR "Parsed : ".$uri." as:\n".Dumper($otree)."\n--\n" if (opt_get_int("debug") > 2);

      # Find our desired element nodes
      my $odata = fnodea(fnodep($otree, "body"), "div", "class=elementc");
      my $oupdate = fnodep($odata, "p");

      # Row times are given as HH:MM only. They are added to the date
      # found in the first paragraph of the data block, or to
      # str2time("00:00") when no such date is present.
      my $time_base = str2time("00:00");
      if ($oupdate) {
        my $tmp = $oupdate->{"nodes"}[0]{"text"};
        if (defined($tmp) && $tmp =~ /:\s+(\d\d)\.(\d\d)\.(\d\d\d\d)\s+(\d\d:\d\d)/) {
          $time_base = str2time($3."-".$2."-".$1);
        }
      }
      
      my $oelems = fnodep($odata, "table", "nodes");
      if (defined($oelems))
      {
        # Index 0 is skipped.
        for (my $n = 1; $n < scalar(@$oelems); $n++)
        {
          my $fdata = $oelems->[$n]{"nodes"};
          $weatherdata->{get_node($fdata, "text", 0)} =
          [
            # type, timestamp, temperature
            0,
            parse_timestamp(get_node($fdata, "text", 1), $time_base),
            get_node_lc($fdata, "text", 2),
            # and the rest
            get_node_lc($fdata, "text", 3),
            translate_rain(get_node($fdata, "text", 4)),
            get_node_lc($fdata, "text", 5),
          ];
        }
      }
    }
    else
    {
      # Include the HTTP status so that the failure can be diagnosed.
      print STDERR "Failed to fetch ".$uri." (".$res->status_line.")\n" if (opt_get_int("debug") > 0);
    }
  }

  print STDERR "Tiehallinto data blob:\n".Dumper($weatherdata)."\n--\n" if (opt_get_int("debug") > 1);
}


###
### Fetch FMI data
###
# Fetch FMI weather observations for the last 10 minutes when enabled.
# Each named observation point that has matching coordinates in the
# coverage data is stored in $weatherdata under its name.
if (opt_chk_bool("opt_fmi"))
{
  die("FMI data scrape enabled, but no API key set.\n") unless opt_chk_valid("fmi_api_key", 10);
  my @fmitems = ("temperature", "humidity", "windspeedms", "totalcloudcover");

  # Read the clock once so that the requested window is exactly 10 minutes.
  my $now = time();
  my $uri = "http://data.fmi.fi/fmi-apikey/".opt_get("fmi_api_key").
    "/wfs?request=getFeature&storedquery_id=fmi::observations::weather::".
    "multipointcoverage".
#    "timevaluepair".
    "&starttime=".format_time_gmt($now - 10*60)."&endtime=".format_time_gmt($now).
    "&parameters=".join(",", @fmitems)."&maxlocations=100&bbox=19,59,32,75";

  print STDERR "FMI URI: ".$uri."\n" if (opt_get_int("debug") > 0);

  my $res = fetch_http($uri);
  if ($res->code >= 200 && $res->code <= 201)
  {
    # XMLin() dies on malformed input; trap that so the data gathered so
    # far is still written out and the problem is reported below.
    my $content = $res->decoded_content;
    my $xml = undef;
    my $xml_error = "";
    if (defined($content))
    {
      $xml = eval { XMLin($content) };
      $xml_error = $@ unless defined($xml);
      chomp($xml_error);
    }

    my $fmember = (ref($xml) eq "HASH") ? $xml->{"wfs:member"} : undef;
    my $fdata = (ref($fmember) eq "HASH") ? $fmember->{"omso:GridSeriesObservation"} : undef;

    if (ref($fdata) eq "HASH")
    {
      my $fcoverage = $fdata->{"om:result"}{"gmlcov:MultiPointCoverage"};
      my @farray = ();

      # One "lat long time" triple per line.
      foreach my $fline (split(/\n/, plonk_data($fcoverage->{"gml:domainSet"}{"gmlcov:SimpleMultiPoint"}{"gmlcov:positions"})))
      {
        if ($fline =~ /^\s*([\+\-]?\d+\.\d*)\s+([\+\-]?\d+\.\d*)\s+(\d+)\s*$/)
        {
          push(@farray, {"lat" => $1, "long" => $2, "time" => $3});
        }
      }

      # One tuple of measurements per line, in the same order as the
      # positions above; NaN values are left unset.
      my $findex = 0;
      foreach my $fline (split(/\n/, plonk_data($fcoverage->{"gml:rangeSet"}{"gml:DataBlock"}{"gml:doubleOrNilReasonTupleList"})))
      {
        my @fmatches = ($fline =~ /\s*([\+\-]?\d+\.\d*|NaN)\s*/ig);
        if (scalar(@fmatches) > 0)
        {
          die("Not enough items in scalar line (".scalar(@fmatches). " vs ".scalar(@fmitems).
            "): ".$fline."\n") if (scalar(@fmatches) != scalar(@fmitems));
          for (my $fni = 0; $fni < scalar(@fmitems); $fni++)
          {
            $farray[$findex]{$fmitems[$fni]} = $fmatches[$fni] if (lc($fmatches[$fni]) ne "nan");
          }
          $findex++;
        }
      }

      # Index the observations by numeric position. Adding 0 normalises
      # the textual form, so the lookup agrees with a numeric comparison.
      # A later entry for the same position replaces an earlier one.
      my %fbypos = ();
      foreach my $fitem (@farray)
      {
        next unless (defined($fitem->{"lat"}) && defined($fitem->{"long"}));
        $fbypos{($fitem->{"lat"} + 0)."|".($fitem->{"long"} + 0)} = $fitem;
      }

      # A single pointMember is not folded into an array by XML::Simple.
      my $fpoints = $fdata->{"om:featureOfInterest"}{"sams:SF_SpatialSamplingFeature"}{"sams:shape"}{"gml:MultiPoint"}{"gml:pointMember"};
      my @fpointlist = (ref($fpoints) eq "ARRAY") ? @{$fpoints} : (defined($fpoints) ? ($fpoints) : ());

      foreach my $xnode (@fpointlist)
      {
        my $floc = $xnode->{"gml:Point"};
        if (defined($floc->{"gml:pos"}) &&
            $floc->{"gml:pos"} =~ /^\s*([\+\-]?\d+\.\d*)\s+([\+\-]?\d+\.\d*)\s*$/)
        {
          my $fobs = $fbypos{($1 + 0)."|".($2 + 0)};
          if (defined($fobs))
          {
            $weatherdata->{$floc->{"gml:name"}} =
            [
              # type, timestamp, temperature
              1,
              plonk_data($fobs->{"time"}),
              plonk_data($fobs->{"temperature"}),
              # and the rest
              plonk_data($fobs->{"humidity"}),
              plonk_data($fobs->{"windspeedms"}),
              translate_clouds(plonk_data($fobs->{"totalcloudcover"})),
            ];
          }
        }
      }
    } else {
      # Unparseable document or unexpected structure
      print STDERR "Invalid XML received:\n";
      print STDERR "XML parser error: ".$xml_error."\n" if ($xml_error ne "");
      print STDERR plonk_data($content)."\n\n";
    }
  } else {
    print STDERR "Error fetching FMI XML: ".$res->status_line."\n";
  }
}


###
### Output
###
# Write one line per location, sorted by name, in the form
# "name|field|field|...". Output goes to the configured "outfile" when
# one is set, otherwise to the inherited STDOUT.
if (opt_chk_valid("outfile", 1)) {
  open(STDOUT, '>', opt_get("outfile")) or die("Could not open output file '".opt_get("outfile")."': $!\n");
}

binmode STDOUT, ':encoding(utf-8)';

foreach my $key (sort { $a cmp $b } keys %$weatherdata)
{
  print STDOUT $key."|".join("|", @{$weatherdata->{$key}})."\n";
}

# Buffered write errors (e.g. a full disk) only surface at close time.
close(STDOUT) or die("Could not finish writing output: $!\n");