view fetch_weather.pl @ 698:6ba9f961e463 default tip

quotedb: Bump version and copyright.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 18 Sep 2023 11:38:41 +0300
parents fd45c52d4297
children
line wrap: on
line source

#!/usr/bin/perl -w
##########################################################################
#
# Fetch Weather v1.3.0 by Matti 'ccr' Hamalainen <ccr@tnsp.org>
# (C) Copyright 2014-2023 Tecnic Software productions (TNSP)
# This script is freely distributable under GNU GPL (version 2) license.
#
# Should be ran as a cronjob, and configured properly.
# */10 * * * *     perl -w /absolute/path/to/fetch_weather.pl /path/to/configfile
#
# Configuration file example is in "config.fetch_weather.example"
# For debugging/testing, try ./fetch_weather.pl to see commandline options.
#
# Requires various Perl modules, in Debian the packages should be:
# libwww-perl libxml-simple-perl libtimedate-perl libjson-perl
# libfile-slurper-perl
#
##########################################################################
use 5.18.0;
use strict;
use warnings;
use utf8;
use Encode;
use LWP::UserAgent;
use HTTP::Message;
use HTML::Entities;
use Compress::Zlib;
use XML::Simple;
use Date::Format;
use Date::Parse;
use Data::Dumper;
use File::Slurper qw(read_text write_binary);
use JSON;


###
### Configuration settings
###
my %settings = (
  "force_update" => 0,
  "debug" => 0,
  "dump" => 0,
  "opt_fmi" => 0,
  "opt_tiehallinto" => 0,
  "purge_threshold" => 60,
  "outfile" => "",
  "http_user_agent" => "Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 6.0) Opera 10.63  [en]",
  "tiehallinto_meta" => "tiehallinto.meta",
  "tiehallinto_meta_period" => 7,
  "tiehallinto_rw_url" => "https://tie.digitraffic.fi/api/weather/v1/stations/data",
  "tiehallinto_meta_url" => "https://tie.digitraffic.fi/api/weather/v1/stations",
  "fmi_weather_base_url" => "https://opendata.fmi.fi/wfs",
  "fmi_weather_extra_params" => "&maxlocations=300&bbox=19,59,32,75",
);


###
### Helper functions
###
sub mlog($)
{
  print STDERR $_[0];
}


sub fetch_http($)
{
  my $agent = LWP::UserAgent->new;
  $agent->agent(opt_get("http_user_agent"));
  $agent->timeout(20);

  my $req = HTTP::Request->new(GET => $_[0]);
  $req->header('Accept-Encoding' => scalar HTTP::Message::decodable());

  print STDERR "# FETCHING URL: ".$_[0]."\n" if (opt_get_int("debug") > 0);

  my $res = $agent->request($req);

  if (opt_get_int("debug") > 0)
  {
    print STDERR "# Response: ".$res->code.": ".$res->message."\n";
    if ($res->code >= 200 && $res->code <= 201)
    {
      print STDERR
        "# Content-charset: ".$res->content_charset."\n".
        "# Content-encoding: ".$res->content_encoding."\n".
        "# Is decoded_content UTF8? ".(utf8::is_utf8($res->decoded_content) ? "yes" : "NO!")."\n";
    }
  }

  return $res;
}


sub str_trim($)
{
  my $tmp = $_[0];
  $tmp =~ s/^\s*//;
  $tmp =~ s/\s*$//;
  return $tmp;
}


sub format_time_gmt($)
{
  # 2012-02-27T00:00:00Z
  return time2str("%Y-%m-%dT%TZ", $_[0], "UTC");
}


sub force_decode_utf8($)
{
  if (!utf8::is_utf8($_[0]))
  {
    return decode("utf8", $_[0]);
  }
  else
  {
    return $_[0];
  }
}


sub plonk_data_var($$$)
{
  return defined($_[0]->{$_[1]}) ? $_[0]->{$_[1]} : $_[2];
}


sub plonk_data_value($$)
{
  if (defined($_[0]) && defined($_[0]{$_[1]}) && defined($_[0]{$_[1]}{"value"}))
  {
    return $_[0]{$_[1]}{"value"};
  }
  else
  {
    return "";
  }
}


###
### Configuration handling
###
sub opt_chk_bool($)
{
  if (defined($settings{$_[0]}))
  {
    my $val = $settings{$_[0]};
    return ($val == 1 || $val eq "true" || $val eq "on" || $val eq "1");
  }
  else
  {
    return 0;
  }
}


sub opt_chk_valid($$)
{
  if (defined($settings{$_[0]}))
  {
    my $val = $settings{$_[0]};
    return length($val) >= $_[1];
  }
  else
  {
    return 0;
  }
}


sub opt_get_int($)
{
  if (defined($settings{$_[0]}))
  {
    return int($settings{$_[0]});
  }
  else
  {
    return -1;
  }
}


sub opt_get($)
{
  if (defined($settings{$_[0]}))
  {
    return $settings{$_[0]};
  }
  else
  {
    return undef;
  }
}


sub opt_read_config($)
{
  my $filename = $_[0];
  my $errors = 0;
  my $line = 0;

  open(CONFFILE, "<", $filename) or die("Could not open configuration '".$filename."'!\n");
  while (<CONFFILE>)
  {
    $line++;
    chomp;
    if (/(^\s*#|^\s*$)/)
    {
      # Ignore comments and empty lines
    }
    elsif (/^\s*\"?([a-zA-Z0-9_]+)\"?\s*=>?\s*(\d+),?\s*$/)
    {
      my $key = lc($1);
      my $value = $2;
      if (defined($settings{$key}))
      {
        $settings{$key} = $value;
      }
      else
      {
        mlog("[$filename:$line] Unknown setting '$key' = $value\n");
        $errors = 1;
      }
    }
    elsif (/^\s*\"?([a-zA-Z0-9_]+)\"?\s*=>?\s*\"(.*?)\",?\s*$/)
    {
      my $key = lc($1);
      my $value = $2;
      if (defined($settings{$key}))
      {
        $settings{$key} = $value;
      }
      else
      {
        mlog("[$filename:$line] Unknown setting '$key' = '$value'\n");
        $errors = 1;
      }
    }
    else
    {
      mlog("[$filename:$line] Syntax error: $_\n");
      $errors = 1;
    }
  }
  close(CONFFILE);
  return $errors;
}


###
### Main program begins
###
my $weatherdata = {};
my $opt_cfgfile;

while (scalar(@ARGV) > 0)
{
  my $arg = shift;
  if ($arg eq "-force")
  {
    $settings{"force_update"} = 1;
  }
  elsif ($arg eq "-debug")
  {
    $settings{"debug"} = 1;
  }
  elsif ($arg eq "-dump")
  {
    $settings{"dump"} = 1;
  }
  else
  {
    die("Configuration file already specified!\n") if defined($opt_cfgfile);
    $opt_cfgfile = $arg;
  }
}

if (!defined($opt_cfgfile))
{
  die(
    "Weather Fetch v1.3.0 by ccr/TNSP <ccr\@tnsp.org>\n".
    "Usage: $0 <config file> [options]\n".
    "\n".
    " -force      : Force updating of all data\n".
    " -debug      : Enable debug\n".
    " -dump       : Dump received raw data\n".
    "\n"
  );
}

opt_read_config($opt_cfgfile) == 0 or die("Errors while parsing configuration file '".$opt_cfgfile."'.\n");
print STDERR "Forcing update of all data.\n" if opt_chk_bool("force_update");


###
### Load already cached data
###
if (opt_chk_valid("outfile", 1) && !opt_chk_bool("force_update"))
{
  my $filename = opt_get("outfile");
  if (-e "$filename")
  {
    my $str = force_decode_utf8(read_text($filename));
    if (defined($str))
    {
      foreach my $line (split(/\s*\n\s*/, $str))
      {
        my @mtmp = split(/\|/, $line, -1);
        if (scalar(\@mtmp) >= 3)
        {
          $weatherdata->{shift @mtmp} = \@mtmp;
        }
      }
      print STDERR scalar(keys %$weatherdata)." old records reloaded.\n" if (opt_get_int("debug") > 0);
    }
  }
}


###
### Fetch Tiehallinto data
###
if (opt_chk_bool("opt_tiehallinto"))
{
  my $uri = opt_get("tiehallinto_rw_url");
  print STDERR "Fetching Tiehallinto road weather data from ".$uri."\n" if (opt_get_int("debug") > 0);
  my $res = fetch_http($uri);
  if ($res->code >= 200 && $res->code <= 201)
  {
    my $data_json = JSON->new->decode(force_decode_utf8($res->decoded_content));

    print Dumper($data_json) if opt_chk_bool("dump");

    if (!defined($data_json->{"dataUpdatedTime"}) || !defined($data_json->{"stations"}))
    {
      print STDERR "ERROR: REST/JSON call result did not contain required data.\n";
    }
    else
    {
      # Check if we need to update the static meta data
      my $meta_file = opt_get("tiehallinto_meta");
      my $meta_stamp = (-e $meta_file) ? (stat($meta_file))[9] : -1;
      my $fetch_meta = ($meta_stamp + 60*60*24 * opt_get_int("tiehallinto_meta_period")) < time();

      # Fetch or read the cache
      my $meta_str;
      if ($fetch_meta || opt_chk_bool("force_update"))
      {
        my $uri = opt_get("tiehallinto_meta_url");
        print STDERR "Fetching Tiehallinto static meta data from ".$uri."\n" if (opt_get_int("debug") > 1);
        my $res = fetch_http($uri);
        die("Failed to fetch $uri data.\n") unless ($res->code <= 200 && $res->code <= 201);

        $meta_str = force_decode_utf8($res->decoded_content);
        $fetch_meta = 1;
      }
      else
      {
        print STDERR "Using CACHED Tiehallinto static meta data from '".$meta_file."'.\n" if (opt_get_int("debug") > 0);
        $meta_str = force_decode_utf8(read_text($meta_file));
      }

      print STDERR "Is meta_str UTF8? ".(utf8::is_utf8($meta_str) ? "yes" : "NO!")."\n" if (opt_get_int("debug") > 0);

      # Parse the data ..
      my $meta_data = {};
      my $meta_json = JSON->new->decode($meta_str);

      print Dumper($meta_json) if opt_chk_bool("dump");

      if ($fetch_meta)
      {
        # Save new cache, in more optimal form, if needed.
        print STDERR "Storing to cache '".$meta_file."'.\n" if (opt_get_int("debug") > 0);
        write_binary($meta_file, JSON->new->utf8->encode($meta_json));
      }

      foreach my $ms (@{$meta_json->{"features"}})
      {
        # Filter functional stations?
        if (defined($ms->{"properties"}) &&
            defined($ms->{"geometry"}{"coordinates"}) &&
            $ms->{"properties"}{"collectionStatus"} eq "GATHERING" &&
            $ms->{"properties"}{"name"} !~ /^TEST_/
            )
        {
          $meta_data->{$ms->{"id"}} = $ms;
        }
      }

      my $nrecords = 0;
      foreach my $wdata (@{$data_json->{"stations"}})
      {
        my $wid = $wdata->{"id"};
        if (defined($meta_data->{$wid}) &&
            defined($wdata->{"sensorValues"}))
        {
          my $sensors = {};
          foreach my $sensor (@{$wdata->{"sensorValues"}})
          {
            $sensors->{$sensor->{"name"}} = $sensor;
          }

          my $wqname = $meta_data->{$wid}{"properties"}{"name"};
          $wqname =~ s#_# #g;
          $wqname =~ s/(^|\s)(vt|yt|st|kt)(\d+)/"Tie ".$3/ge;
          $wqname .= " (TH)";

          $nrecords++;
          $weatherdata->{$wqname} =
          [
            # Basic data
            $meta_data->{$wid}{"geometry"}{"coordinates"}[1],
            $meta_data->{$wid}{"geometry"}{"coordinates"}[0],

            # This is kind of wrong, because the actual measuredTime exists
            # for each sensor separately ... but .. whatever.
            str2time(plonk_data_var($wdata, "dataUpdatedTime", "1970-01-01T14:15:16")),

            plonk_data_value($sensors, "ILMA"),
            plonk_data_value($sensors, "ILMAN_KOSTEUS"),
            plonk_data_value($sensors, "KESKITUULI"),
            plonk_data_value($sensors, "TUULENSUUNTA"),

            "", # total cloud cover
            plonk_data_value($sensors, "TIE_1"),
            plonk_data_value($sensors, "SADE"),
            plonk_data_value($sensors, "NÄKYVYYS_KM"),
            plonk_data_value($sensors, "SADE_INTENSITEETTI"),
          ];
        }
        else
        {
          print STDERR "Station ID #".$wid." not defined?\n" if (opt_get_int("debug") > 0);
          #.Dumper($meta_data->{$wid});
        }
      }
      print STDERR $nrecords." records from Tiehallinto.\n" if (opt_get_int("debug") > 0);
    }
  }
}


###
### Fetch FMI data
###
if (opt_chk_bool("opt_fmi"))
{
  my @fmitems = (
    "temperature", "humidity", "windspeedms",
    "winddirection", "totalcloudcover",
    );

  my $uri = opt_get("fmi_weather_base_url").
    "?request=getFeature".
    "&storedquery_id=fmi::observations::weather::multipointcoverage".
    "&starttime=".format_time_gmt(time() - 10*60)."&endtime=".format_time_gmt(time()).
    "&parameters=".join(",", @fmitems).
    opt_get("fmi_weather_extra_params");

  print STDERR "FMI URI: ".$uri."\n" if (opt_get_int("debug") > 0);

  my $res = fetch_http($uri);
  if ($res->code >= 200 && $res->code <= 201)
  {
    my $xml = XMLin(force_decode_utf8($res->decoded_content));
    my $time_base = time();

    print Dumper($xml) if opt_chk_bool("dump");

    if (defined($xml->{"wfs:member"}{"omso:GridSeriesObservation"}))
    {
      my $fdata = $xml->{"wfs:member"}{"omso:GridSeriesObservation"};
      my $fshit = $fdata->{"om:result"}{"gmlcov:MultiPointCoverage"};

      my @position_lines = split(/\n/, $fshit->{"gml:domainSet"}{"gmlcov:SimpleMultiPoint"}{"gmlcov:positions"});
      my @data_lines = split(/\n/, $fshit->{"gml:rangeSet"}{"gml:DataBlock"}{"gml:doubleOrNilReasonTupleList"});
      my @farray = ();

      if (scalar(@position_lines) == scalar(@data_lines))
      {
        for (my $nline = 0; $nline < scalar(@position_lines); $nline++)
        {
          my $dline = str_trim($data_lines[$nline]);
          my $pline = str_trim($position_lines[$nline]);

          my @fmatches = ($dline =~ /\s*([\+\-]?\d+\.\d*|NaN)\s*/ig);
          if (scalar(@fmatches) != scalar(@fmitems))
          {
            print STDERR "Not enough items in scalar line #".$nline." (".
              scalar(@fmatches). " vs ".scalar(@fmitems)."): ".$dline."\n"
              if (opt_get_int("debug") > 0);
          }
          else
          {
            my $vtmp = {};
            for (my $fni = 0; $fni < scalar(@fmitems); $fni++)
            {
              $$vtmp{$fmitems[$fni]} = $fmatches[$fni] if (lc($fmatches[$fni]) ne "nan");
            }
            if ($pline =~ /^\s*([\+\-]?\d+\.\d*)\s+([\+\-]?\d+\.\d*)\s+(\d+)\s*$/)
            {
              $$vtmp{"lat"} = $1;
              $$vtmp{"long"} = $2;
              $$vtmp{"time"} = $3;
              push(@farray, $vtmp);
            }
            else
            {
              print STDERR "Data mismatch #".$nline.": ".$pline."\n";
            }
          }
        }
      }
      else
      {
        print STDERR "Position and data line counts do not match ".
          scalar(@position_lines)." <> ".scalar(@data_lines)."\n";
        goto skip_it;
      }
      # XXX Hashify the array into lat/long keys

      # This is horrible :S
      my $nrecords = 0;
      foreach my $xnode (@{$fdata->{"om:featureOfInterest"}{"sams:SF_SpatialSamplingFeature"}{"sams:shape"}{"gml:MultiPoint"}{"gml:pointMember"}})
      {
        my $floc = $xnode->{"gml:Point"};
        if ($floc->{"gml:name"} ne "" &&
            $floc->{"gml:pos"} =~ /^\s*([\+\-]?\d+\.\d*)\s+([\+\-]?\d+\.\d*)\s*$/)
        {
          my ($flat, $flong) = ($1, $2);

          # Should use a hash -
          foreach my $frec (@farray)
          {
            # If lat/long matches, and location is not yet defined, or
            # if timestamp is newer, store to location data
            if ($frec->{"lat"} == $flat && $frec->{"long"} == $flong &&
                 (!defined($weatherdata->{$floc->{"gml:name"}}) || 
                 $frec->{"time"} >= $weatherdata->{$floc->{"gml:name"}}[2])
               )
            {
              $nrecords++ unless defined($weatherdata->{$floc->{"gml:name"}});

              $weatherdata->{$floc->{"gml:name"}} =
              [
                # Basic data
                plonk_data_var($frec, "lat", 0),
                plonk_data_var($frec, "long", 0),

                plonk_data_var($frec, "time", ""),
                plonk_data_var($frec, "temperature", ""),
                plonk_data_var($frec, "humidity", ""),
                plonk_data_var($frec, "windspeedms", ""),
                plonk_data_var($frec, "winddirection", ""),

                plonk_data_var($frec, "totalcloudcover", ""),
                
              ];
            }
          }
        }
      }
      print STDERR $nrecords." records from FMI.\n" if (opt_get_int("debug") > 0);
    }
    else
    {
      # defined
      print STDERR "Invalid XML received:\n";
      print STDERR $res->decoded_content."\n\n";
    }
  }
  else
  {
    print STDERR "Error fetching FMI XML: ".$res->status_line."\n";
  }
}


### Skip here if the FMI shit fails due to broken data
skip_it:


###
### Purge too old entries
###
if (opt_chk_valid("purge_threshold", 1))
{
  my $purge = opt_get_int("purge_threshold");
  if ($purge > 0)
  {
    my $wqtime = time();
    my $nold = scalar(keys %$weatherdata);

    foreach my $key (keys %$weatherdata)
    {
      if ($wqtime - $weatherdata->{$key}[2] > (60 * $purge))
      {
        delete $$weatherdata{$key};
      }
    }

    my $nnew = scalar(keys %$weatherdata);
    print STDERR "Purged data older than ".$purge." minutes, ".$nold." -> ".$nnew." = ".($nold - $nnew)." removed.\n" if (opt_get_int("debug") > 0);
  }
}


###
### Output
###
if (opt_chk_valid("outfile", 1))
{
  my $filename = opt_get("outfile");
  print STDERR "Dumping data to output file '".$filename."'\n" if (opt_get_int("debug") > 0);
  close(STDOUT);
  open(STDOUT, '>', $filename) or die("Could not open output file '".$filename."': $!\n");
}

binmode STDOUT, ':encoding(utf-8)';

foreach my $key (sort { $a cmp $b } keys %$weatherdata)
{
  print STDOUT $key."|".join("|", @{$weatherdata->{$key}})."\n";
}

close(STDOUT);