Mercurial > hg > egg-tcls
comparison fetch_weather.pl @ 158:db2b1f74d994
Initial import of weather data fetching backend, written in Perl.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Mon, 02 Jun 2014 12:29:11 +0300 |
parents | |
children | 4f8a163b2bc1 |
comparison
equal
deleted
inserted
replaced
157:c8fa73ba67da | 158:db2b1f74d994 |
---|---|
1 #!/usr/bin/perl -w | |
2 use strict; | |
3 use LWP::UserAgent; | |
4 use HTML::Entities; | |
5 use XML::Simple; | |
6 use Text::Iconv; | |
7 use Date::Parse; | |
8 use Data::Dumper; | |
9 | |
10 sub str_trim($) | |
11 { | |
12 my $str = $_[0]; | |
13 if (defined($str)) { | |
14 $str =~ s/^\s*//; | |
15 $str =~ s/\s*$//; | |
16 } | |
17 return $str; | |
18 } | |
19 | |
20 | |
21 sub pop_token_a($) | |
22 { | |
23 my $tokens = $_[0]; | |
24 return shift(@$tokens); | |
25 } | |
26 | |
27 | |
28 sub pop_token($) | |
29 { | |
30 return str_trim(pop_token_a($_[0])); | |
31 } | |
32 | |
33 | |
34 sub parse_html_str($) | |
35 { | |
36 my $tokens = $_[0]; | |
37 my $token = pop_token($tokens); | |
38 my $str = ""; | |
39 $token =~ s/^\s*//; | |
40 | |
41 return undef unless (substr($token, 0, 1) eq '"'); | |
42 $token = substr($token, 1); | |
43 | |
44 while (defined($token)) { | |
45 my $tmp = $token; | |
46 $tmp =~ s/\s*$//; | |
47 if (substr($tmp, -1) eq '"') { | |
48 $str .= substr($tmp, 0, -1); | |
49 return $str; | |
50 } else { | |
51 $str .= $token; | |
52 } | |
53 $token = shift(@$tokens); | |
54 } | |
55 return undef; | |
56 } | |
57 | |
58 | |
59 sub parse_html_tree($$); | |
60 | |
61 sub parse_html_tree($$) | |
62 { | |
63 my ($tokens, $tree) = @_; | |
64 | |
65 while (my $token = pop_token($tokens)) { | |
66 if ($token =~ /^<[!\/]?[a-zA-Z]+/) { | |
67 $token = lc($token); | |
68 if ($token =~ /^<\!.*>$/) { | |
69 # Ignore comments etc. | |
70 } elsif ($token =~ /^<([a-z]+[1-9]*)(.*)>$/) { | |
71 my ($name, $args) = ($1, $2); | |
72 if ($name eq "meta" || $name eq "img") { | |
73 my $tmp = {}; | |
74 parse_html_tree($tokens, $tree); | |
75 return $tree; | |
76 } else { | |
77 my $tmp = { "name" => $name, "args" => str_trim($args) }; | |
78 parse_html_tree($tokens, $tmp); | |
79 push(@{$$tree{"nodes"}}, $tmp); | |
80 } | |
81 } elsif ($token =~ /^<\/([a-z]+[1-9]*)>$/) { | |
82 return $tree; | |
83 } else { | |
84 die("HORROR TERROR ELITE: '$token'\n"); | |
85 } | |
86 } else { | |
87 $token = str_trim(decode_entities($token)); | |
88 push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0; | |
89 } | |
90 } | |
91 | |
92 return $tree; | |
93 } | |
94 | |
95 | |
96 sub parse_html($) | |
97 { | |
98 return undef unless defined($_[0]); | |
99 my $str = $_[0]; | |
100 my $res = { "name" => "", "args" => "" }; | |
101 $str =~ tr/\r/ /; | |
102 $str =~ tr/\n/ /; | |
103 my @tokens = grep { !($_ =~ /^\s*$/) } split(/(<\/?[a-zA-Z]+.*?>)/, $str); | |
104 parse_html_tree(\@tokens, $res); | |
105 return $res; | |
106 } | |
107 | |
108 sub html_find_node($$$); | |
109 | |
110 sub html_find_node($$$) | |
111 { | |
112 my ($node, $name, $args) = @_; | |
113 | |
114 if (defined($node)) { | |
115 if (ref($node) eq "ARRAY") { | |
116 foreach my $n (@$node) { | |
117 my $tmp = html_find_node($n, $name, $args); | |
118 # Must do it like this, in order not to break the loop | |
119 return $tmp if defined($tmp); | |
120 } | |
121 } elsif (ref($node) eq "HASH") { | |
122 if (defined($$node{"name"})) { | |
123 if ($$node{"name"} eq $name) { | |
124 if ($args ne "") { | |
125 if (defined($$node{"args"}) && $$node{"args"} =~ /$args/) { | |
126 } else { | |
127 return html_find_node($$node{"nodes"}, $name, $args); | |
128 } | |
129 } | |
130 return $node; | |
131 } else { | |
132 return html_find_node($$node{"nodes"}, $name, $args); | |
133 } | |
134 } | |
135 } | |
136 } | |
137 return undef; | |
138 } | |
139 | |
140 | |
141 sub fnode($$) | |
142 { | |
143 return html_find_node($_[0], $_[1], ""); | |
144 } | |
145 | |
146 | |
147 sub fnodea($$$) | |
148 { | |
149 return html_find_node($_[0], $_[1], $_[2]); | |
150 } | |
151 | |
152 | |
153 sub fetch_http($) | |
154 { | |
155 my $tmpAgent = LWP::UserAgent->new; | |
156 $tmpAgent->agent("Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 6.0) Opera 10.63 [en]"); | |
157 $tmpAgent->timeout(10); | |
158 | |
159 my $tmpRequest = HTTP::Request->new(GET => $_[0]); | |
160 | |
161 return $tmpAgent->request($tmpRequest); | |
162 } | |
163 | |
164 | |
165 sub get_node($$$) | |
166 { | |
167 return defined($_[0]->[$_[2]]{"nodes"}[0]{$_[1]}) ? $_[0]->[$_[2]]{"nodes"}[0]{$_[1]} : ""; | |
168 } | |
169 | |
170 | |
171 sub parse_timestamp($$) | |
172 { | |
173 return $_[1] + str2time($_[0]) - time(); | |
174 } | |
175 | |
176 | |
177 ### | |
178 ### Main program begins | |
179 ### | |
180 die( | |
181 "Weather Fetch v0.1 by ccr/TNSP <ccr\@tnsp.org>\n". | |
182 "Usage: $0 <output_data_file> <fmi_api_key>\n" | |
183 ) unless scalar(@ARGV) >= 2; | |
184 | |
185 my $opt_outfile = shift; | |
186 my $opt_api_key = shift; | |
187 my $weatherdata = {}; | |
188 | |
189 ### Fetch tiehallinto data | |
190 for (my $i = 1; $i <= 22; $i++) | |
191 { | |
192 my $res = fetch_http("http://alk.tiehallinto.fi/alk/tiesaa/tiesaa_maak_".$i.".html"); | |
193 if ($res->code >= 200 && $res->code <= 201) | |
194 { | |
195 my $data = $res->decoded_content; | |
196 | |
197 # Filter out crap tags we don't want or need | |
198 $data =~ s/\n/§/g; | |
199 $data =~ s/<!--.*?-->//ig; | |
200 $data =~ s/<map[^>]*>.*?<\/map>//ig; | |
201 $data =~ s/<form[^>]*>.*?<\/form>//ig; | |
202 $data =~ s/<script[^>]*>.*?<\/script>//ig; | |
203 $data =~ s/<meta[^>]*>//ig; | |
204 $data =~ s/<font[^>]*>//ig; | |
205 $data =~ s/<\/font>//ig; | |
206 $data =~ s/<span[^>]*>//ig; | |
207 $data =~ s/<\/span>//ig; | |
208 $data =~ s/<\/?b>//ig; | |
209 | |
210 $data =~ s/<br>//ig; | |
211 $data =~ s/ / /ig; | |
212 $data =~ s/§/\n/g; | |
213 | |
214 # Parse the HTML mess | |
215 my $otree = parse_html($data); | |
216 | |
217 # Find our desired element nodes | |
218 my $odata = fnodea(fnode($otree, "body"), "div", "class=elementc"); | |
219 my $oupdate = fnode($odata, "p"); | |
220 my $time_base = time(); | |
221 if ($oupdate) { | |
222 my $tmp = $oupdate->{"nodes"}[0]{"text"}; | |
223 if ($tmp =~ /:\s+(\d\d\.\d\d\.\d\d\d\d)\s+(\d\d:\d\d)/) { | |
224 $time_base = str2time($1); | |
225 } | |
226 } | |
227 | |
228 my $oelems = fnode($odata, "table")->{"nodes"}; | |
229 for (my $n = 1; $n < scalar(@$oelems); $n++) | |
230 { | |
231 my $fdata = @$oelems[$n]->{"nodes"}; | |
232 $weatherdata->{$fdata->[0]{"nodes"}[0]{"text"}} = | |
233 [ | |
234 parse_timestamp(get_node($fdata, "text", 1), $time_base), | |
235 get_node($fdata, "text", 2), | |
236 get_node($fdata, "text", 3), | |
237 get_node($fdata, "text", 4), | |
238 get_node($fdata, "text", 5), | |
239 ]; | |
240 } | |
241 print STDERR "."; | |
242 } | |
243 } | |
244 | |
245 ### Fetch FMI data | |
246 my $res = fetch_http("http://data.fmi.fi/fmi-apikey/".$opt_api_key. | |
247 "/wfs?request=getFeature&storedquery_id=fmi::observations::weather::cities::multipointcoverage". | |
248 "¶meters=temperature,humidity"); | |
249 | |
250 if ($res->code >= 200 && $res->code <= 201) | |
251 { | |
252 my $xml = XMLin($res->decoded_content, | |
253 KeyAttr => { server => 'name' }, | |
254 ForceArray => [ 'server', 'address' ]); | |
255 } | |
256 | |
257 ### Output | |
258 if (defined($opt_outfile)) { | |
259 open(STDOUT, '>', $opt_outfile) or die("Could not open output file '$opt_outfile'.\n"); | |
260 } | |
261 | |
262 binmode STDOUT, ':encoding(utf-8)'; | |
263 | |
264 foreach my $key (sort { $a cmp $b } keys %$weatherdata) | |
265 { | |
266 print STDOUT $key."|".join("|", @{$weatherdata->{$key}})."\n"; | |
267 } | |
268 | |
269 close(STDOUT); |