Mercurial > hg > egg-tcls
annotate fetch_feeds.tcl @ 271:f47b41d2be64
feeds: Cosmetics.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Fri, 23 Jan 2015 10:57:47 +0200 |
parents | 96310b1c88fa |
children | 9f90d6918626 |
rev | line source |
---|---|
0 | 1 #!/usr/bin/tclsh |
1 | 2 # |
3 # NOTICE! Change above path to correct tclsh binary path! | |
4 # | |
268
96310b1c88fa
feeds: Improve config resiliency.
Matti Hamalainen <ccr@tnsp.org>
parents:
265
diff
changeset
|
5 ############################################################################## |
0 | 6 # |
265
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
7 # FeedCheck fetcher v0.9 by Matti 'ccr' Hamalainen <ccr@tnsp.org> |
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
8 # (C) Copyright 2008-2015 Tecnic Software productions (TNSP) |
0 | 9 # |
10 # This script is freely distributable under GNU GPL (version 2) license. | |
11 # | |
268
96310b1c88fa
feeds: Improve config resiliency.
Matti Hamalainen <ccr@tnsp.org>
parents:
265
diff
changeset
|
12 ############################################################################## |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
13 package require sqlite3 |
265
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
14 source [file dirname [info script]]/utillib.tcl |
0 | 15 |
265
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
16 ### The configuration should be in config.feeds in same directory |
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
17 ### as this script. Or change the line below to point wherever |
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
18 ### you wish. See "config.feeds.example" for an example config file. |
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
19 source [file dirname [info script]]/config.feeds |
0 | 20 |
21 | |
268
96310b1c88fa
feeds: Improve config resiliency.
Matti Hamalainen <ccr@tnsp.org>
parents:
265
diff
changeset
|
22 ############################################################################## |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
23 |
146
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
24 set feeds_ent_str "-|-|'|'|—|-|‏||—|-|–|--|‪||‬|" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
25 append feeds_ent_str "|‎||å|å|Å|Å|é|é|:|:| | " |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
26 append feeds_ent_str "|”|\"|“|\"|«|<<|»|>>|"|\"" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
27 append feeds_ent_str "|ä|ä|ö|ö|Ä|Ä|Ö|Ö|&|&|<|<|>|>" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
28 append feeds_ent_str "|ä|ä|å|ö|—|-|'|'|–|-|"|\"" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
29 append feeds_ent_str "|||-|’|'|ü|ü|Ü|Ü|•|*|€|€" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
30 append feeds_ent_str "|”|\"" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
31 set html_ent [split [encoding convertfrom "utf-8" $feeds_ent_str] "|"] |
0 | 32 |
33 package require http | |
271 | 34 |
265
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
### HTTP client setup. All http_* variables are optional and are expected
### to come from the sourced configuration file.

# User agent: fall back to a browser-like default when none is configured.
if {![info exists http_user_agent] || $http_user_agent == ""} {
    ::http::config -urlencoding iso8859-1 -useragent "Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.0) Opera 9.5"
} else {
    ::http::config -urlencoding iso8859-1 -useragent $http_user_agent
}

# Proxy: only enabled when http_use_proxy is set to a non-zero value.
if {[info exists http_use_proxy] && $http_use_proxy != 0} {
    ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
}

# TLS: load the tls package and register an https:// handler on demand.
if {[info exists http_tls_support] && $http_tls_support != 0} {
    package require tls
    ::http::register https 443 [list ::tls::socket -request 1 -require 1 -tls1 1 -cadir $http_tls_cadir]
}
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
49 |
0 | 50 |
268
96310b1c88fa
feeds: Improve config resiliency.
Matti Hamalainen <ccr@tnsp.org>
parents:
265
diff
changeset
|
51 ############################################################################## |
96310b1c88fa
feeds: Improve config resiliency.
Matti Hamalainen <ccr@tnsp.org>
parents:
265
diff
changeset
|
52 |
# convert_ent --
#   Replace HTML entities in a string using the global html_ent map
#   (a flat "from to from to ..." list as accepted by [string map]).
#
#   The map is applied twice: first with exact case, then case-insensitively
#   on the result of the first pass.
#
# Arguments:
#   udata   Text to convert.
# Returns:
#   The converted text.
proc convert_ent {udata} {
    set exact_pass [string map $::html_ent $udata]
    return [string map -nocase $::html_ent $exact_pass]
}
57 | |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
# add_entry --
#   Store one feed item in the "feeds" table unless an item with the same
#   URL already exists for that feed. Increments the global nitems counter
#   for every new item.
#
#   Values are passed to SQLite as bound host parameters ($name inside the
#   braced SQL text is resolved by the sqlite3 Tcl binding, not by Tcl), so
#   titles and URLs coming from remote pages cannot alter the statement.
#
# Arguments:
#   uname    Feed name stored in the "feed" column.
#   uprefix  Prefix prepended to the URL when it is not absolute.
#   uurl     Item URL as found in the page; HTML entities are decoded.
#   utitle   Item title, stored as given.
# Side effects:
#   Prints an error and exits the process with status 15 if the INSERT fails.
proc add_entry {uname uprefix uurl utitle} {
    global currclock nitems

    # Absolute http(s) URLs are used as-is, anything else gets the prefix.
    set utmp [convert_ent $uurl]
    if {[string match "http://*" $utmp] || [string match "https://*" $utmp]} {
        set utest $utmp
    } else {
        set utest "$uprefix$utmp"
    }

    # Already known for this feed: nothing to do.
    if {[feeds_db exists {SELECT title FROM feeds WHERE url=$utest AND feed=$uname}]} {
        return
    }

    # CAST keeps utime numeric regardless of how the Tcl value is bound.
    set usql {INSERT INTO feeds (feed,utime,url,title) VALUES ($uname, CAST($currclock AS INTEGER), $utest, $utitle)}
    incr nitems
    if {[catch {feeds_db eval $usql} uerrmsg]} {
        puts "\nError: $uerrmsg on:\n$usql"
        exit 15
    }
}
77 | |
78 | |
# add_rss_feed --
#   Download an RSS document and pass every <item> title/link pair found
#   in it to add_entry.
#
#   Three item layouts are tried in order (CDATA-wrapped title, plain
#   title, <item> tag with attributes); the first layout that yields any
#   match is used and the remaining ones are skipped.
#
# Arguments:
#   datauri     URL of the RSS document.
#   dataname    Feed name passed on to add_entry.
#   dataprefix  URL prefix passed on to add_entry.
# Returns:
#   0 when the document was fetched, 1 when the request failed.
proc add_rss_feed {datauri dataname dataprefix} {
    if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
        puts "Error getting $datauri: $uerrmsg"
        return 1
    }
    set ustatus [::http::status $utoken]
    set upage [::http::data $utoken]
    ::http::cleanup $utoken

    # A timeout does not raise an error in geturl; it is only visible in
    # the request status, so check it instead of parsing an empty page.
    if {$ustatus ne "ok"} {
        puts "Error getting $datauri: $ustatus"
        return 1
    }

    # Each pattern captures (1) the title and (2) the link of one item.
    set upatterns [list \
        {<item>.*?<title><..CDATA.(.*?)\]\]></title>.*?<link>(http.*?)</link>.*?</item>} \
        {<item>.*?<title>(.*?)</title>.*?<link>(http.*?)</link>.*?</item>} \
        {<item [^>]*>.*?<title>(.*?)</title>.*?<link>(http.*?)</link>.*?</item>}]

    foreach upattern $upatterns {
        # -inline returns a flat list: whole match, title, link, ...
        set umatches [regexp -all -nocase -inline -- $upattern $upage]
        foreach {uwhole utitle ulink} $umatches {
            add_entry $dataname $dataprefix $ulink $utitle
        }
        if {[llength $umatches] > 0} {
            break
        }
    }

    return 0
}
111 | |
112 | |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
### Initialise the run-wide state and open the feed database.
# currclock: timestamp for this run; nitems: count of newly added items.
set currclock [clock seconds]
set nitems 0

global feeds_db
if {[catch {sqlite3 feeds_db $feeds_dbfile} uerrmsg]} {
    # Without a database nothing can be stored, so give up right away.
    puts "Could not open SQLite3 database '$feeds_dbfile': $uerrmsg."
    exit 2
}
121 | |
122 | |
123 ############################################################################## | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
### Feeds scraped from plain HTML pages.

# add_html_feed --
#   Download an HTML page and pass every link matched by the given
#   patterns to add_entry. All patterns are applied to the page.
#
# Arguments:
#   datauri     URL of the page to fetch.
#   dataname    Feed name passed on to add_entry.
#   dataprefix  URL prefix passed on to add_entry (used for relative links).
#   upatterns   List of regular expressions; each must capture (1) the link
#               target and (2) the link title.
# Returns:
#   0 when the page was fetched, 1 when the request failed.
proc add_html_feed {datauri dataname dataprefix upatterns} {
    if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
        puts "Error getting $datauri: $uerrmsg"
        return 1
    }
    set ustatus [::http::status $utoken]
    set upage [::http::data $utoken]
    ::http::cleanup $utoken

    # A timeout does not raise an error in geturl; it only shows up in
    # the request status.
    if {$ustatus ne "ok"} {
        puts "Error getting $datauri: $ustatus"
        return 1
    }

    foreach upattern $upatterns {
        # -inline returns a flat list: whole match, link, title, ...
        foreach {uwhole ulink utitle} [regexp -all -nocase -inline -- $upattern $upage] {
            add_entry $dataname $dataprefix $ulink $utitle
        }
    }
    return 0
}

# The patterns are brace-quoted so that "\." reaches the regexp engine as a
# literal dot instead of being reduced to "." by Tcl backslash substitution.

### Fetch and parse Halla-aho's blog page data
add_html_feed "http://www.halla-aho.com/scripta/" "Mestari" "http://www.halla-aho.com/scripta/" [list \
    {<a href="([^"]+\.html)"><b>([^<]+)</b>} \
    {<a href="([^"]+\.html)">([^<][^b][^<]+)</a>}]

### The Adventurers
add_html_feed "http://www.peldor.com/chapters/index_sidebar.html" "The Adventurers" "http://www.peldor.com/" [list \
    {<a href="([^"]+)">([^<]+)</a>}]

### Order of the Stick
add_html_feed "http://www.giantitp.com/comics/oots.html" "OOTS" "http://www.giantitp.com" [list \
    {<a href="(/comics/oots[0-9]+\.html)">([^<]+)</a>}]
179 | |
180 | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
### Generic RSS-feed fetching
# Each row is: feed URL, feed name, URL prefix for relative links.
# Currently disabled feeds:
#   "http://www.kaleva.fi/rss/145.xml"   "Kaleva/Tiede"  ""
#   "http://lehti.samizdat.info/feed/"   "Lehti"         ""
foreach {feeduri feedname feedprefix} {
    "http://www.effi.org/xml/uutiset.rss"               "EFFI"        ""
    "http://static.mtv3.fi/rss/uutiset_rikos.rss"       "MTV3/Rikos"  ""
    "http://www.blastwave-comic.com/rss/blastwave.xml"  "Blastwave"   ""
} {
    add_rss_feed $feeduri $feedname $feedprefix
}


### Close database and report how many items were added during this run
feeds_db close

puts "$nitems new items."