Mercurial > hg > egg-tcls
annotate fetch_feeds.tcl @ 656:7192d94f8c28
fetch_feeds: Copy improved HTTP request code from urllog script.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Fri, 19 Feb 2021 19:30:30 +0200 |
parents | 4b985abf5aba |
children | f46c152183a2 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/tclsh |
1 | 2 # |
3 # NOTICE! Change above path to correct tclsh binary path! | |
4 # | |
268
96310b1c88fa
feeds: Improve config resiliency.
Matti Hamalainen <ccr@tnsp.org>
parents:
265
diff
changeset
|
5 ############################################################################## |
0 | 6 # |
425
e5810c52d376
Bump some copyright years and versions.
Matti Hamalainen <ccr@tnsp.org>
parents:
424
diff
changeset
|
7 # FeedCheck fetcher v1.1 by Matti 'ccr' Hamalainen <ccr@tnsp.org> |
578 | 8 # (C) Copyright 2008-2021 Tecnic Software productions (TNSP) |
0 | 9 # |
10 # This script is freely distributable under GNU GPL (version 2) license. | |
11 # | |
460
dbe249968591
fetch_feeds: Add support for SNI in TLS and make note about requiring tcl-tls 1.7.13+
Matti Hamalainen <ccr@tnsp.org>
parents:
425
diff
changeset
|
12 # NOTICE! NOTICE! This script REQUIRES tcl-tls 1.7.13+ if you wish to |
dbe249968591
fetch_feeds: Add support for SNI in TLS and make note about requiring tcl-tls 1.7.13+
Matti Hamalainen <ccr@tnsp.org>
parents:
425
diff
changeset
|
13 # support SSL/TLS https for URL checking. And you probably do. |
dbe249968591
fetch_feeds: Add support for SNI in TLS and make note about requiring tcl-tls 1.7.13+
Matti Hamalainen <ccr@tnsp.org>
parents:
425
diff
changeset
|
14 # |
268
96310b1c88fa
feeds: Improve config resiliency.
Matti Hamalainen <ccr@tnsp.org>
parents:
265
diff
changeset
|
15 ############################################################################## |
0 | 16 |
265
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
17 ### The configuration should be in config.feeds in same directory |
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
18 ### as this script. Or change the line below to point wherever |
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
19 ### you wish. See "config.feeds.example" for an example config file. |
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
20 source [file dirname [info script]]/config.feeds |
0 | 21 |
422
880a07485275
Add utl_ctime() to utillib and use it elsewhere.
Matti Hamalainen <ccr@tnsp.org>
parents:
350
diff
changeset
|
22 ### Required utillib.tcl |
880a07485275
Add utl_ctime() to utillib and use it elsewhere.
Matti Hamalainen <ccr@tnsp.org>
parents:
350
diff
changeset
|
23 source [file dirname [info script]]/utillib.tcl |
880a07485275
Add utl_ctime() to utillib and use it elsewhere.
Matti Hamalainen <ccr@tnsp.org>
parents:
350
diff
changeset
|
24 |
0 | 25 |
268
96310b1c88fa
feeds: Improve config resiliency.
Matti Hamalainen <ccr@tnsp.org>
parents:
265
diff
changeset
|
26 ############################################################################## |
423
44c9128097cd
feeds: Remember to require sqlite3 package.
Matti Hamalainen <ccr@tnsp.org>
parents:
422
diff
changeset
|
27 package require sqlite3 |
0 | 28 package require http |
271 | 29 |
655
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
30 |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
31 ############################################################################## |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
32 ### Utility functions |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
33 |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
# Normalize a character set name before it is handed to [encoding convertfrom].
#
# Arguments:
#   uencoding - charset name as found in an HTTP header or HTML meta tag
# Returns:
#   The name with a leading locale prefix of the form "xx_YY." removed,
#   converted to lower case, and with a leading "iso-" rewritten to "iso".
proc fetch_sanitize_encoding {uencoding} {
  # The pattern is brace-quoted so that "\." reaches the regexp engine as a
  # literal dot; inside double quotes Tcl would reduce it to a bare "."
  # which matches any character.
  regsub -- {^[a-z][a-z]_[A-Z][A-Z]\.} $uencoding "" uencoding
  set uencoding [string tolower $uencoding]
  regsub -- {^iso-} $uencoding "iso" uencoding
  return $uencoding
}
271 | 40 |
268
96310b1c88fa
feeds: Improve config resiliency.
Matti Hamalainen <ccr@tnsp.org>
parents:
265
diff
changeset
|
41 |
321
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
# Fetch a URL with the http package and pass the results back through
# variables in the caller's scope.
#
# Arguments:
#   urlStr    - URL to request
#   urlStatus - name of caller variable that receives [::http::status]
#   urlSCode  - name of caller variable that receives [::http::code]
#   urlCode   - name of caller variable that receives [::http::ncode]
#   urlData   - name of caller variable that receives the response body
#   urlMeta   - name of caller array that receives the response headers;
#               each header is also stored under its lower-cased name
# Returns:
#   1 if the request completed with status "ok", the numeric code is in
#   the range 200..205 and the body was converted from the detected
#   character set; 0 in every other case (a message is printed for
#   request, status and conversion failures).
proc fetch_dorequest { urlStr urlStatus urlSCode urlCode urlData urlMeta } {
  upvar $urlStatus ustatus
  upvar $urlSCode uscode
  upvar $urlCode ucode
  upvar $urlData udata
  upvar $urlMeta umeta

  set urlHeaders {}
  lappend urlHeaders "Accept-Encoding" "identity"
  #lappend urlHeaders "Connection" "keep-alive"

  ### Perform request
  if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -binary 1 -headers $urlHeaders]} uerrmsg]} {
    puts "HTTP request failed: $uerrmsg"
    return 0
  }

  ### Check status
  set ustatus [::http::status $utoken]
  set uscode [::http::code $utoken]
  set ucode [::http::ncode $utoken]

  if {$ustatus != "ok"} {
    puts "Error in HTTP request: $ustatus / $uscode ($urlStr)"
    # Release the token on this path too, otherwise its state array leaks.
    ::http::cleanup $utoken
    return 0
  }

  ### Get data
  set udata [::http::data $utoken]
  array set umeta [::http::meta $utoken]
  ::http::cleanup $utoken

  ### Sanitize the metadata KEYS
  # Lower-cased copies are added next to the original keys so that
  # lookups such as umeta(content-type) work regardless of server casing.
  foreach {ukey uvalue} [array get umeta] {
    set ukey [string tolower $ukey]
    set umeta($ukey) $uvalue
  }

  ### Only 200..205 responses are treated as usable content
  if {$ucode < 200 || $ucode > 205} {
    return 0
  }

  ### Perform encoding conversion
  set uenc_doc ""
  set uenc_http ""

  if {[info exists umeta(content-type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(content-type) -> uenc_http]} {
    # Found character set encoding information in HTTP headers
  }

  if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} {
    # Found old style HTML meta tag with character set information
  } elseif {[regexp -nocase -- "<meta.\*\?charset=\"(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} {
    # Found HTML5 style meta tag with character set information
  }

  # Make sanitized versions of the encoding strings
  set uenc_http2 [fetch_sanitize_encoding $uenc_http]
  set uenc_doc2 [fetch_sanitize_encoding $uenc_doc]

  # The HTTP header wins, then the document's own declaration
  set uencoding $uenc_http2
  if {$uencoding == "" && $uenc_doc2 != ""} {
    set uencoding $uenc_doc2
  } elseif {$uencoding == ""} {
    # If _NO_ known encoding of any kind, assume the default of iso8859-1
    set uencoding "iso8859-1"
  }

  #puts "Charsets: http='$uenc_http', doc='$uenc_doc' / sanitized http='$uenc_http2', doc='$uenc_doc2' -> '$uencoding'"

  # Convert the raw body from the selected encoding; uencoding is never
  # empty here because of the iso8859-1 fallback above.
  if {[catch {set udata [encoding convertfrom $uencoding $udata]} cerrmsg]} {
    puts "Error in charset conversion: $urlStr: $cerrmsg"
    return 0
  }
  return 1
}
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
123 |
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
124 |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
# Store one feed item in the "feeds" table unless it is already present.
#
# Arguments:
#   uname   - feed name, stored in the "feed" column
#   uprefix - URL prefix prepended to links that are not absolute
#   uurl    - item link as scraped (HTML entities are converted first)
#   utitle  - item title as scraped (HTML entities are converted on insert)
# Globals:
#   currclock - value stored in the "utime" column
#   nitems    - incremented for every item that is new
#   feeds_db  - sqlite3 database command
# Side effects:
#   Prints the error and terminates the script with exit code 15 if the
#   INSERT fails.
proc add_entry {uname uprefix uurl utitle} {
  global currclock feeds_db nitems
  set utmp [utl_convert_html_ent $uurl]
  if {[string match "http://*" $utmp] || [string match "https://*" $utmp]} {
    set utest "$utmp"
  } else {
    # Insert a "/" between prefix and link only when neither side has one
    if {[string range $uprefix end end] != "/" && [string range $utmp 0 0] != "/"} {
      set utest "$uprefix/$utmp"
    } else {
      set utest "$uprefix$utmp"
    }
  }

  # The SQL is brace-quoted on purpose: the sqlite3 Tcl interface binds
  # $name parameters to the Tcl variables of this scope itself, so scraped
  # text never becomes part of the statement.
  # NOTE(review): assumes the previous utl_escape helper only doubled single
  # quotes, so that values bound here compare equal to already stored rows.
  set usql {SELECT title FROM feeds WHERE url=$utest AND feed=$uname}
  if {![feeds_db exists $usql]} {
#    puts "NEW: $utest : $utitle"
    set uheading [utl_convert_html_ent $utitle]
    set usql {INSERT INTO feeds (feed,utime,url,title) VALUES ($uname, $currclock, $utest, $uheading)}
    incr nitems
    if {[catch {feeds_db eval $usql} uerrmsg]} {
      puts "\nError: $uerrmsg on:\n$usql"
      exit 15
    }
  }
}
149 | |
150 | |
# Download an RSS document and register every item found in it.
#
# Arguments:
#   datauri    - URL of the RSS document
#   dataname   - feed name passed on to add_entry
#   dataprefix - URL prefix passed on to add_entry
# Returns:
#   1 if the HTTP request raised an error, 0 otherwise.
proc add_rss_feed {datauri dataname dataprefix} {
  if {[catch {set utoken [::http::geturl $datauri -binary 1 -timeout 6000 -headers {Accept-Encoding identity}]} uerrmsg]} {
    puts "Error getting $datauri: $uerrmsg"
    return 1
  }
  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # Item layouts are tried in this order; the first one that produces any
  # match is used and the remaining ones are skipped.
  set upatterns [list \
    "<item>.\*\?<title><..CDATA.(.\*\?)\\\]\\\]></title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" \
    "<item>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" \
    "<item \[^>\]*>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>"]

  foreach upattern $upatterns {
    set ufound [regexp -all -nocase -inline -- $upattern $upage]
    # Each match contributes three elements: whole match, title, link
    foreach {uwhole uitemtitle uitemlink} $ufound {
      add_entry $dataname $dataprefix $uitemlink $uitemtitle
    }
    if {[llength $ufound] > 0} {
      break
    }
  }

  return 0
}
183 | |
184 | |
185 ############################################################################## | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
186 ### Fetch and parse Halla-aho's blog page data |
321
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
# Scrape the article index at halla-aho.com/scripta/ and register every
# linked .html page under the feed name "Mestari".
#
# Returns:
#   0 if the page could not be fetched, otherwise an empty string.
proc fetch_halla_aho { } {
  set datauri "http://www.halla-aho.com/scripta/";
  set dataname "Mestari"
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  # Two link layouts are scanned one after the other: bold titles first,
  # then plain anchor text.
  set upatterns [list \
    "<a href=\"(\[^\"\]+\.html)\"><b>(\[^<\]+)</b>" \
    "<a href=\"(\[^\"\]+\.html)\">(\[^<\]\[^b\]\[^<\]+)</a>"]

  foreach upattern $upatterns {
    # Each match contributes three elements: whole match, link, title
    foreach {uwhole ulink ulabel} [regexp -all -nocase -inline -- $upattern $upage] {
      add_entry $dataname $datauri $ulink $ulabel
    }
  }
}
206 | |
207 | |
208 ### The Adventurers | |
321
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
# Scrape the chapter sidebar of "The Adventurers" at peldor.com and
# register every anchor found on the page.
#
# Returns:
#   0 if the page could not be fetched, otherwise an empty string.
proc fetch_adventurers { } {
  set datauri "http://www.peldor.com/chapters/index_sidebar.html";
  set dataname "The Adventurers"
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  # Each match contributes three elements: whole match, link, title
  set ufound [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+)\">(\[^<\]+)</a>" $upage]
  foreach {uwhole ulink ulabel} $ufound {
    add_entry $dataname "http://www.peldor.com/" $ulink $ulabel
  }
}
222 | |
223 | |
224 ### Order of the Stick | |
321
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
# Scrape the Order of the Stick comic index at giantitp.com and register
# every /comics/ootsNNN.html link under the feed name "OOTS".
#
# Returns:
#   0 if the page could not be fetched, otherwise an empty string.
proc fetch_oots { } {
  set datauri "https://www.giantitp.com/comics/oots.html";
  set dataname "OOTS"
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  # Each match contributes three elements: whole match, link, title
  set ufound [regexp -all -nocase -inline -- "<a href=\"(/comics/oots\[0-9\]+\.html)\">(\[^<\]+)</a>" $upage]
  foreach {uwhole ulink ulabel} $ufound {
    add_entry $dataname "https://www.giantitp.com" $ulink $ulabel
  }
}
238 | |
239 | |
350
51c08336d7b1
feeds: Add support for Poliisi.fi information reports.
Matti Hamalainen <ccr@tnsp.org>
parents:
323
diff
changeset
|
240 ### Poliisi tiedotteet |
51c08336d7b1
feeds: Add support for Poliisi.fi information reports.
Matti Hamalainen <ccr@tnsp.org>
parents:
323
diff
changeset
|
# Scrape a page of "channelitem" entries and register each article with a
# title of the form "<date>: <trimmed link text>".
#
# Arguments:
#   datauri    - URL of the page to scrape
#   dataname   - feed name passed on to add_entry
#   dataprefix - URL prefix passed on to add_entry
# Returns:
#   0 if the page could not be fetched, otherwise an empty string.
proc fetch_poliisi { datauri dataname dataprefix } {
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  # Each match contributes four elements: whole match, date, link, title
  set ufound [regexp -all -nocase -inline -- "<div class=\"channelitem\"><div class=\"date\">(.*?)</div><a class=\"article\" href=\"(\[^\"\]+)\">(\[^<\]+)</a>" $upage]
  foreach {uwhole udate ulink ulabel} $ufound {
    set stmp [string trim $ulabel]
    add_entry $dataname $dataprefix $ulink "$udate: $stmp"
  }
}
51c08336d7b1
feeds: Add support for Poliisi.fi information reports.
Matti Hamalainen <ccr@tnsp.org>
parents:
323
diff
changeset
|
253 |
51c08336d7b1
feeds: Add support for Poliisi.fi information reports.
Matti Hamalainen <ccr@tnsp.org>
parents:
323
diff
changeset
|
254 |
51c08336d7b1
feeds: Add support for Poliisi.fi information reports.
Matti Hamalainen <ccr@tnsp.org>
parents:
323
diff
changeset
|
255 |
51c08336d7b1
feeds: Add support for Poliisi.fi information reports.
Matti Hamalainen <ccr@tnsp.org>
parents:
323
diff
changeset
|
256 |
655
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
257 ############################################################################## |
321
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
258 ### Open database, etc |
655
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
259 ############################################################################## |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
# Configure the http package's User-Agent: the built-in browser-like
# string is used unless config supplies a non-empty http_user_agent.
if {![info exists http_user_agent] || $http_user_agent == ""} {
  ::http::config -urlencoding utf8 -useragent "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0"
} else {
  ::http::config -urlencoding utf8 -useragent $http_user_agent
}

# Route requests through a proxy when http_use_proxy is set and non-zero;
# http_proxy_host and http_proxy_port must then be defined as well.
if {[info exists http_use_proxy]} {
  if {$http_use_proxy != 0} {
    ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
  }
}
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
269 |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
270 if {[info exists http_tls_support] && $http_tls_support != 0} { |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
271 package require tls |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
272 ::http::register https 443 [list ::tls::socket -request true -require true -ssl2 false -ssl3 false -tls1 true -tls1.1 true -tls1.2 true -cadir $http_tls_cadir -autoservername true] |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
273 } |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
274 |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
275 |
4b985abf5aba
fetch_feeds: Move HTTP initialization to end of script.
Matti Hamalainen <ccr@tnsp.org>
parents:
607
diff
changeset
|
276 |
321
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
277 set nitems 0 |
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
278 set currclock [clock seconds] |
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
279 global feeds_db |
607
5767ef9476e2
Oops, Rename SQLite3 database filename configuration variable in fetch_feeds also.
Matti Hamalainen <ccr@tnsp.org>
parents:
596
diff
changeset
|
280 if {[catch {sqlite3 feeds_db $feeds_db_file} uerrmsg]} { |
5767ef9476e2
Oops, Rename SQLite3 database filename configuration variable in fetch_feeds also.
Matti Hamalainen <ccr@tnsp.org>
parents:
596
diff
changeset
|
281 puts "Could not open SQLite3 database '${feeds_db_file}': ${uerrmsg}" |
321
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
282 exit 2 |
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
283 } |
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
284 |
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
285 |
d8b957796121
feeds: Refactor the feeds fetching.
Matti Hamalainen <ccr@tnsp.org>
parents:
296
diff
changeset
|
286 ### Fetch the feeds |
484
35d7be5db18b
fetch_feeds: Move the actual calling of fetching functions to the
Matti Hamalainen <ccr@tnsp.org>
parents:
480
diff
changeset
|
287 if {[catch {feeds_fetch} uerrmsg]} { |
35d7be5db18b
fetch_feeds: Move the actual calling of fetching functions to the
Matti Hamalainen <ccr@tnsp.org>
parents:
480
diff
changeset
|
288 puts "Error fetching feeds: $uerrmsg" |
35d7be5db18b
fetch_feeds: Move the actual calling of fetching functions to the
Matti Hamalainen <ccr@tnsp.org>
parents:
480
diff
changeset
|
289 feeds_db close |
35d7be5db18b
fetch_feeds: Move the actual calling of fetching functions to the
Matti Hamalainen <ccr@tnsp.org>
parents:
480
diff
changeset
|
290 exit 3 |
35d7be5db18b
fetch_feeds: Move the actual calling of fetching functions to the
Matti Hamalainen <ccr@tnsp.org>
parents:
480
diff
changeset
|
291 } |
0 | 292 |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
293 |
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
294 ### Close database |
140
b0648e05c855
Change some variable names, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
139
diff
changeset
|
295 feeds_db close |
142 | 296 |
297 puts "$nitems new items." |