changeset 158:db2b1f74d994

Initial import of weather data fetching backend, written in Perl.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 02 Jun 2014 12:29:11 +0300
parents c8fa73ba67da
children bbc7860c22a6
files fetch_weather.pl
diffstat 1 files changed, 269 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fetch_weather.pl	Mon Jun 02 12:29:11 2014 +0300
@@ -0,0 +1,269 @@
+#!/usr/bin/perl -w
+use strict;
+use LWP::UserAgent;
+use HTML::Entities;
+use XML::Simple;
+use Text::Iconv;
+use Date::Parse;
+use Data::Dumper;
+
+sub str_trim($)
+{
+  my $str = $_[0];
+  if (defined($str)) {
+    $str =~ s/^\s*//;
+    $str =~ s/\s*$//;
+  }
+  return $str;
+}
+
+
+sub pop_token_a($)
+{
+  my $tokens = $_[0];
+  return shift(@$tokens);
+}
+
+
+sub pop_token($)
+{
+  return str_trim(pop_token_a($_[0]));
+}
+
+
+sub parse_html_str($)
+{
+  my $tokens = $_[0];
+  my $token = pop_token($tokens);
+  my $str = "";
+  $token =~ s/^\s*//;
+
+  return undef unless (substr($token, 0, 1) eq '"');
+  $token = substr($token, 1);
+  
+  while (defined($token)) {
+    my $tmp = $token;
+    $tmp =~ s/\s*$//;
+    if (substr($tmp, -1) eq '"') {
+      $str .= substr($tmp, 0, -1);
+      return $str;
+    } else {
+      $str .= $token;
+    }
+    $token = shift(@$tokens);
+  }
+  return undef;
+}
+
+
+sub parse_html_tree($$);
+
+sub parse_html_tree($$)
+{
+  my ($tokens, $tree) = @_;
+
+  while (my $token = pop_token($tokens)) {
+    if ($token =~ /^<[!\/]?[a-zA-Z]+/) {
+      $token = lc($token);
+      if ($token =~ /^<\!.*>$/) {
+        # Ignore comments etc.
+      } elsif ($token =~ /^<([a-z]+[1-9]*)(.*)>$/) {
+        my ($name, $args) = ($1, $2);
+        if ($name eq "meta" || $name eq "img") {
+          my $tmp = {};
+          parse_html_tree($tokens, $tree);
+          return $tree;
+        } else {
+          my $tmp = { "name" => $name, "args" => str_trim($args) };
+          parse_html_tree($tokens, $tmp);
+          push(@{$$tree{"nodes"}}, $tmp);
+        }
+      } elsif ($token =~ /^<\/([a-z]+[1-9]*)>$/) {
+        return $tree;
+      } else {
+        die("HORROR TERROR ELITE: '$token'\n");
+      }
+    } else {
+      $token = str_trim(decode_entities($token));
+      push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0;
+    }
+  }
+  
+  return $tree;
+}
+
+
+sub parse_html($)
+{
+  return undef unless defined($_[0]);
+  my $str = $_[0];
+  my $res = { "name" => "", "args" => "" };
+  $str =~ tr/\r/ /;
+  $str =~ tr/\n/ /;
+  my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str);
+  parse_html_tree(\@tokens, $res);
+  return $res;
+}
+
+sub html_find_node($$$);
+
+sub html_find_node($$$)
+{
+  my ($node, $name, $args) = @_;
+  
+  if (defined($node)) {
+    if (ref($node) eq "ARRAY") {
+      foreach my $n (@$node) {
+        my $tmp = html_find_node($n, $name, $args);
+        # Must do it like this, in order not to break the loop
+        return $tmp if defined($tmp);
+      }
+    } elsif (ref($node) eq "HASH") {
+      if (defined($$node{"name"})) {
+        if ($$node{"name"} eq $name) {
+          if ($args ne "") {
+            if (defined($$node{"args"}) && $$node{"args"} =~ /$args/) {
+            } else {
+              return html_find_node($$node{"nodes"}, $name, $args);
+            }
+          }
+          return $node;
+        } else {
+          return html_find_node($$node{"nodes"}, $name, $args);
+        }
+      }
+    }
+  }
+  return undef;
+}
+
+
+sub fnode($$)
+{
+  return html_find_node($_[0], $_[1], "");
+}
+
+
+sub fnodea($$$)
+{
+  return html_find_node($_[0], $_[1], $_[2]);
+}
+
+
+sub fetch_http($)
+{
+  my $tmpAgent = LWP::UserAgent->new;
+  $tmpAgent->agent("Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 6.0) Opera 10.63  [en]");
+  $tmpAgent->timeout(10);
+
+  my $tmpRequest = HTTP::Request->new(GET => $_[0]);
+  
+  return $tmpAgent->request($tmpRequest);
+}
+
+
+sub get_node($$$)
+{
+  return defined($_[0]->[$_[2]]{"nodes"}[0]{$_[1]}) ? $_[0]->[$_[2]]{"nodes"}[0]{$_[1]} : "";
+}
+
+
+sub parse_timestamp($$)
+{
+  return $_[1] + str2time($_[0]) - time();
+}
+
+
+###
+### Main program begins
+###
+die(
+"Weather Fetch v0.1 by ccr/TNSP <ccr\@tnsp.org>\n".
+"Usage: $0 <output_data_file> <fmi_api_key>\n"
+) unless scalar(@ARGV) >= 2;
+
+my $opt_outfile = shift;
+my $opt_api_key = shift;
+my $weatherdata = {};
+
+### Fetch tiehallinto data
+for (my $i = 1; $i <= 22; $i++)
+{
+  my $res = fetch_http("http://alk.tiehallinto.fi/alk/tiesaa/tiesaa_maak_".$i.".html");
+  if ($res->code >= 200 && $res->code <= 201)
+  {
+    my $data = $res->decoded_content;
+    
+    # Filter out crap tags we don't want or need
+    $data =~ s/\n/§/g;
+    $data =~ s/<!--.*?-->//ig;
+    $data =~ s/<map[^>]*>.*?<\/map>//ig;
+    $data =~ s/<form[^>]*>.*?<\/form>//ig;
+    $data =~ s/<script[^>]*>.*?<\/script>//ig;
+    $data =~ s/<meta[^>]*>//ig;
+    $data =~ s/<font[^>]*>//ig;
+    $data =~ s/<\/font>//ig;
+    $data =~ s/<span[^>]*>//ig;
+    $data =~ s/<\/span>//ig;
+    $data =~ s/<\/?b>//ig;
+
+    $data =~ s/<br>//ig;
+    $data =~ s/&nbsp;/ /ig;
+    $data =~ s/§/\n/g;
+
+    # Parse the HTML mess
+    my $otree = parse_html($data);
+
+    # Find our desired element nodes
+    my $odata = fnodea(fnode($otree, "body"), "div", "class=elementc");
+    my $oupdate = fnode($odata, "p");
+    my $time_base = time();
+    if ($oupdate) {
+      my $tmp = $oupdate->{"nodes"}[0]{"text"};
+      if ($tmp =~ /:\s+(\d\d\.\d\d\.\d\d\d\d)\s+(\d\d:\d\d)/) {
+        $time_base = str2time($1);
+      }
+    }
+    
+    my $oelems = fnode($odata, "table")->{"nodes"};
+    for (my $n = 1; $n < scalar(@$oelems); $n++)
+    {
+      my $fdata = @$oelems[$n]->{"nodes"};
+      $weatherdata->{$fdata->[0]{"nodes"}[0]{"text"}} =
+      [
+        parse_timestamp(get_node($fdata, "text", 1), $time_base),
+        get_node($fdata, "text", 2),
+        get_node($fdata, "text", 3),
+        get_node($fdata, "text", 4),
+        get_node($fdata, "text", 5),
+      ];
+    }
+    print STDERR ".";
+  }
+}
+
+### Fetch FMI data
+my $res = fetch_http("http://data.fmi.fi/fmi-apikey/".$opt_api_key.
+  "/wfs?request=getFeature&storedquery_id=fmi::observations::weather::cities::multipointcoverage".
+  "&parameters=temperature,humidity");
+
+if ($res->code >= 200 && $res->code <= 201)
+{
+  my $xml = XMLin($res->decoded_content,
+    KeyAttr => { server => 'name' },
+    ForceArray => [ 'server', 'address' ]);
+}
+
+### Output
+if (defined($opt_outfile)) {
+  open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile'.\n");
+}
+
+binmode STDOUT, ':encoding(utf-8)';
+
+foreach my $key (sort { $a cmp $b } keys %$weatherdata)
+{
+  print STDOUT $key."|".join("|", @{$weatherdata->{$key}})."\n";
+}
+
+close(STDOUT);