Mercurial > hg > egg-tcls
annotate fetch_feeds.tcl @ 265:908edc54005a
feeds: Move configuration to separate file.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Fri, 23 Jan 2015 10:15:38 +0200 |
parents | bbc7860c22a6 |
children | 96310b1c88fa |
rev | line source |
---|---|
0 | 1 #!/usr/bin/tclsh |
1 | 2 # |
3 # NOTICE! Change above path to correct tclsh binary path! | |
4 # | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
5 ########################################################################## |
0 | 6 # |
265
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
7 # FeedCheck fetcher v0.9 by Matti 'ccr' Hamalainen <ccr@tnsp.org> |
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
8 # (C) Copyright 2008-2015 Tecnic Software productions (TNSP) |
0 | 9 # |
10 # This script is freely distributable under GNU GPL (version 2) license. | |
11 # | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
12 ########################################################################## |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
13 package require sqlite3 |
265
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
14 source [file dirname [info script]]/utillib.tcl |
0 | 15 |
265
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
16 ### The configuration should be in config.feeds in same directory |
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
17 ### as this script. Or change the line below to point wherever |
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
18 ### you wish. See "config.feeds.example" for an example config file. |
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
19 source [file dirname [info script]]/config.feeds |
0 | 20 |
21 | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
22 ########################################################################## |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
23 |
146
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
24 set feeds_ent_str "-|-|'|'|—|-|‏||—|-|–|--|‪||‬|" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
25 append feeds_ent_str "|‎||å|å|Å|Å|é|é|:|:| | " |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
26 append feeds_ent_str "|”|\"|“|\"|«|<<|»|>>|"|\"" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
27 append feeds_ent_str "|ä|ä|ö|ö|Ä|Ä|Ö|Ö|&|&|<|<|>|>" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
28 append feeds_ent_str "|ä|ä|å|ö|—|-|'|'|–|-|"|\"" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
29 append feeds_ent_str "|||-|’|'|ü|ü|Ü|Ü|•|*|€|€" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
30 append feeds_ent_str "|”|\"" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
31 set html_ent [split [encoding convertfrom "utf-8" $feeds_ent_str] "|"] |
0 | 32 |
### HTTP client setup: user agent, optional proxy, optional TLS.
### The http_* settings are expected to be defined by the sourced
### configuration file; any that are missing are treated as "disabled"
### instead of aborting the script with an unset-variable error.
package require http

# Use the configured user agent when present and non-empty, otherwise
# fall back to the built-in default string.
if {[info exists http_user_agent] && $http_user_agent != ""} {
  ::http::config -urlencoding iso8859-1 -useragent $http_user_agent
} else {
  ::http::config -urlencoding iso8859-1 -useragent "Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.0) Opera 9.5"
}

# Route requests through a proxy only when explicitly enabled.
if {[info exists http_use_proxy] && $http_use_proxy != 0} {
  ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
}

# Register an https:// handler that opens its sockets via ::tls::socket,
# using $http_tls_cadir as the CA certificate directory.
# NOTE(review): "-tls1 1" is kept as in the original; confirm that the
# servers being fetched still accept the protocol versions this enables.
if {[info exists http_tls_support] && $http_tls_support != 0} {
  package require tls
  ::http::register https 443 [list ::tls::socket -request 1 -require 1 -tls1 1 -cadir $http_tls_cadir]
}
908edc54005a
feeds: Move configuration to separate file.
Matti Hamalainen <ccr@tnsp.org>
parents:
159
diff
changeset
|
47 |
0 | 48 |
# Replace HTML entities in a string using the global html_ent table.
#
# Arguments:
#   udata - text that may contain HTML entities
#
# html_ent is used as a [string map] table (alternating key/value list).
# The text is mapped twice: first case-sensitively, then with -nocase, so
# an exact-case entry is applied before the case-insensitive pass handles
# whatever is left.
#
# Returns the converted string.
proc convert_ent {udata} {
  global html_ent
  return [string map -nocase $html_ent [string map $html_ent $udata]]
}
53 | |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
# Store one feed item in the "feeds" table unless it is already present.
#
# Arguments:
#   uname   - feed name (stored in column "feed")
#   uprefix - prefix prepended to the URL when it is not absolute
#   uurl    - item URL as scraped; HTML entities are converted first
#   utitle  - item title
#
# Reads the global currclock (stored as utime) and increments the global
# nitems for every row actually inserted. Uses the feeds_db database
# command. On an SQL error the message is printed and the script exits
# with status 15.
#
# Values are passed to SQLite through its Tcl variable binding ($name
# inside a braced SQL string) rather than being spliced into the SQL text,
# since titles and URLs come from remote pages.
# NOTE(review): the original wrapped each value in the external [escape]
# helper; this assumes that helper only did SQL quote escaping - confirm
# against utillib.tcl.
proc add_entry {uname uprefix uurl utitle} {
  global currclock nitems

  # Make the URL absolute if the page gave a relative one.
  set ulink [convert_ent $uurl]
  if {[string match "http://*" $ulink] || [string match "https://*" $ulink]} {
    set utest $ulink
  } else {
    set utest "$uprefix$ulink"
  }

  # Nothing to do when this URL is already recorded for this feed.
  if {[feeds_db exists {SELECT title FROM feeds WHERE url=$utest AND feed=$uname}]} {
    return
  }

  set usql {INSERT INTO feeds (feed,utime,url,title) VALUES ($uname, CAST($currclock AS INTEGER), $utest, $utitle)}
  if {[catch {feeds_db eval $usql} uerrmsg]} {
    puts "\nError: $uerrmsg on:\n$usql"
    exit 15
  }

  # Count the item only once it has really been stored.
  incr nitems
  return
}
73 | |
74 | |
# Fetch an RSS feed and store its items through add_entry.
#
# Arguments:
#   datauri    - URL of the RSS document
#   dataname   - feed name passed on to add_entry
#   dataprefix - URL prefix passed on to add_entry
#
# The document is scanned with a list of regular expressions, tried in
# order; the first one that yields any match is used and the rest are
# skipped:
#   1. <item> with a CDATA-wrapped <title>
#   2. <item> with a plain <title>
#   3. <item ...> carrying attributes, with a plain <title>
#
# Returns 0 on success (including "no items found"), 1 when the fetch
# fails or does not finish with status "ok" (e.g. a timeout).
proc add_rss_feed {datauri dataname dataprefix} {
  if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
    puts "Error getting $datauri: $uerrmsg"
    return 1
  }
  set ustatus [::http::status $utoken]
  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # A timed-out or otherwise incomplete transfer does not raise an error
  # from geturl; report it instead of silently parsing partial data.
  if {$ustatus ne "ok"} {
    puts "Error getting $datauri: $ustatus"
    return 1
  }

  set upatterns [list \
    {<item>.*?<title><..CDATA.(.*?)\]\]></title>.*?<link>(http.*?)</link>.*?</item>} \
    {<item>.*?<title>(.*?)</title>.*?<link>(http.*?)</link>.*?</item>} \
    {<item [^>]*>.*?<title>(.*?)</title>.*?<link>(http.*?)</link>.*?</item>}]

  foreach upattern $upatterns {
    # -inline returns a flat list: whole match, title, link, repeated.
    set umatches [regexp -all -nocase -inline -- $upattern $upage]
    if {[llength $umatches] == 0} {
      continue
    }
    foreach {uwhole utitle ulink} $umatches {
      add_entry $dataname $dataprefix $ulink $utitle
    }
    break
  }

  return 0
}
107 | |
108 | |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
### Open database, etc
# nitems counts rows inserted by add_entry; currclock is the timestamp
# stored with each new row. Both are read as globals by add_entry.
set nitems 0
set currclock [clock seconds]

# Create the feeds_db database command. The original "global feeds_db"
# here was dropped: "global" has no effect outside a proc body.
# feeds_dbfile is expected to come from the sourced configuration file.
if {[catch {sqlite3 feeds_db $feeds_dbfile} uerrmsg]} {
  puts "Could not open SQLite3 database '$feeds_dbfile': $uerrmsg."
  exit 2
}
117 | |
118 | |
119 ############################################################################## | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
### Fetch and parse Halla-aho's blog page data
set datauri "http://www.halla-aho.com/scripta/"
set dataname "Mestari"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # The patterns are brace-quoted so that "\." reaches the regexp engine
  # as a literal dot; inside double quotes Tcl strips the backslash and
  # the dot would match any character.

  # Links to .html pages whose link text is wrapped in <b>...</b>.
  set umatches [regexp -all -nocase -inline -- {<a href="([^"]+\.html)"><b>([^<]+)</b>} $upage]
  foreach {uwhole ulink utitle} $umatches {
    add_entry $dataname $datauri $ulink $utitle
  }

  # Links to .html pages with plain link text.
  set umatches [regexp -all -nocase -inline -- {<a href="([^"]+\.html)">([^<][^b][^<]+)</a>} $upage]
  foreach {uwhole ulink utitle} $umatches {
    add_entry $dataname $datauri $ulink $utitle
  }
}
141 | |
142 | |
### The Adventurers
set datauri "http://www.peldor.com/chapters/index_sidebar.html"
set dataname "The Adventurers"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # Every <a href="...">text</a> on the sidebar page is treated as an item.
  # -inline returns a flat list: whole match, href, link text, repeated.
  set umatches [regexp -all -nocase -inline -- {<a href="([^"]+)">([^<]+)</a>} $upage]
  foreach {uwhole ulink utitle} $umatches {
    add_entry $dataname "http://www.peldor.com/" $ulink $utitle
  }
}
158 | |
159 | |
### Order of the Stick
set datauri "http://www.giantitp.com/comics/oots.html"
set dataname "OOTS"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # Brace-quoted so that "\." stays a literal dot for the regexp engine;
  # inside double quotes Tcl strips the backslash and the dot would match
  # any character.
  set umatches [regexp -all -nocase -inline -- {<a href="(/comics/oots[0-9]+\.html)">([^<]+)</a>} $upage]
  foreach {uwhole ulink utitle} $umatches {
    add_entry $dataname "http://www.giantitp.com" $ulink $utitle
  }
}
175 | |
176 | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
### Generic RSS-feed fetching
### Each call is: add_rss_feed <feed URL> <feed name> <URL prefix>
#add_rss_feed "http://www.kaleva.fi/rss/145.xml" "Kaleva/Tiede" ""

add_rss_feed "http://www.effi.org/xml/uutiset.rss" "EFFI" ""

add_rss_feed "http://static.mtv3.fi/rss/uutiset_rikos.rss" "MTV3/Rikos" ""

add_rss_feed "http://www.blastwave-comic.com/rss/blastwave.xml" "Blastwave" ""

#add_rss_feed "http://lehti.samizdat.info/feed/" "Lehti" ""


### Close database
feeds_db close

# Report how many rows were inserted during this run.
puts "$nitems new items."