Mercurial > hg > egg-tcls
annotate fetch_feeds.tcl @ 159:bbc7860c22a6
Renamed.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Mon, 02 Jun 2014 12:29:47 +0300 |
parents | get_feeds.tcl@48460e925a8c |
children | 908edc54005a |
rev | line source |
---|---|
0 | 1 #!/usr/bin/tclsh |
1 | 2 # |
3 # NOTICE! Change above path to correct tclsh binary path! | |
4 # | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
5 ########################################################################## |
0 | 6 # |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
7 # FeedCheck fetcher v0.8 by Matti 'ccr' Hamalainen <ccr@tnsp.org> |
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
8 # (C) Copyright 2008-2013 Tecnic Software productions (TNSP) |
0 | 9 # |
10 # This script is freely distributable under GNU GPL (version 2) license. | |
11 # | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
12 ########################################################################## |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
13 package require sqlite3 |
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
14 source [file dirname [info script]]/util_convert.tcl |
0 | 15 |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
16 # SQLite3 database, MUST be set to same as in feeds.tcl |
140
b0648e05c855
Change some variable names, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
139
diff
changeset
|
17 set feeds_dbfile "/home/niinuska/bot/feeds.sqlite" |
0 | 18 |
19 # Use a HTTP proxy? 1 = yes, 0 = no | |
20 set http_proxy 0 | |
21 | |
22 # HTTP proxy address and port | |
23 set http_proxy_host "cache.inet.fi" | |
24 set http_proxy_port 800 | |
25 | |
26 | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
27 ########################################################################## |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
28 |
146
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
29 set feeds_ent_str "-|-|'|'|—|-|‏||—|-|–|--|‪||‬|" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
30 append feeds_ent_str "|‎||å|å|Å|Å|é|é|:|:| | " |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
31 append feeds_ent_str "|”|\"|“|\"|«|<<|»|>>|"|\"" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
32 append feeds_ent_str "|ä|ä|ö|ö|Ä|Ä|Ö|Ö|&|&|<|<|>|>" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
33 append feeds_ent_str "|ä|ä|å|ö|—|-|'|'|–|-|"|\"" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
34 append feeds_ent_str "|||-|’|'|ü|ü|Ü|Ü|•|*|€|€" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
35 append feeds_ent_str "|”|\"" |
7106dd8db4de
Improve entity parsing, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
145
diff
changeset
|
36 set html_ent [split [encoding convertfrom "utf-8" $feeds_ent_str] "|"] |
0 | 37 |
38 package require http | |
39 ::http::config -urlencoding iso8859-1 -useragent "Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.0) Opera 9.5" | |
40 if {$http_proxy != 0} { | |
63 | 41 ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port |
0 | 42 } |
43 | |
44 | |
# Replace HTML entities in a string using the global html_ent map.
#
# Arguments:
#   udata - text that may contain HTML entities
#
# Returns the text after two passes of "string map" over html_ent:
# a case-sensitive pass first, then a case-insensitive pass over the
# result of the first one.
proc convert_ent {udata} {
  # Pass 1: exact-case replacement.
  set uexact [string map $::html_ent $udata]

  # Pass 2: whatever is still left is matched ignoring case.
  set uresult [string map -nocase $::html_ent $uexact]
  return $uresult
}
49 | |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
# Store one feed entry in the "feeds" table unless it is already there.
#
# Arguments:
#   uname   - feed name (stored in column "feed")
#   uprefix - prefix prepended to the link when the link does not start
#             with http:// or https://
#   uurl    - entry link as found in the page; run through convert_ent
#   utitle  - entry title, stored as given
#
# Globals:
#   currclock - timestamp stored in column "utime"
#   nitems    - counter, incremented for every row actually inserted
#
# Uses the database command "feeds_db". Terminates the script with exit
# code 15 if the INSERT fails.
proc add_entry {uname uprefix uurl utitle} {
  global currclock nitems

  set utmp [convert_ent $uurl]
  if {[string match "http://*" $utmp] || [string match "https://*" $utmp]} {
    set utest $utmp
  } else {
    set utest "$uprefix$utmp"
  }

  # The SQL is brace-quoted on purpose: Tcl performs no substitution, and
  # the SQLite Tcl binding resolves $name parameters itself by binding the
  # values of the Tcl variables visible in this proc. No manual quoting or
  # escaping is needed, and scraped text cannot alter the statement.
  if {[feeds_db exists {SELECT title FROM feeds WHERE url=$utest AND feed=$uname}]} {
    return
  }

  set usql {INSERT INTO feeds (feed,utime,url,title) VALUES ($uname, $currclock, $utest, $utitle)}
  if {[catch {feeds_db eval $usql} uerrmsg]} {
    puts "\nError: $uerrmsg on:\n$usql"
    exit 15
  }

  # Count the entry only once it has really been stored.
  incr nitems
}
69 | |
70 | |
# Download an RSS document and hand every <item> found in it to add_entry.
#
# Arguments:
#   datauri    - URL of the RSS document
#   dataname   - feed name passed to add_entry
#   dataprefix - link prefix passed to add_entry
#
# Returns 0 when the document was retrieved (whether or not any item
# matched), 1 when the request failed or did not complete.
proc add_rss_feed {datauri dataname dataprefix} {
  if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
    puts "Error getting $datauri: $uerrmsg"
    return 1
  }

  # A timeout or aborted transfer does not raise an error in geturl; it is
  # only visible in the transaction status, so check it explicitly instead
  # of treating the (empty) body as a feed without items.
  set ustatus [::http::status $utoken]
  if {$ustatus ne "ok"} {
    puts "Error getting $datauri: $ustatus"
    ::http::cleanup $utoken
    return 1
  }

  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # Item layouts, tried in this order; the first one that yields matches
  # wins:
  #   1. <item> with the title wrapped in a CDATA section
  #   2. <item> with a plain title
  #   3. <item ...attributes...> with a plain title
  # Each pattern captures the title first and the link second.
  set upatterns [list \
    {<item>.*?<title><..CDATA.(.*?)\]\]></title>.*?<link>(http.*?)</link>.*?</item>} \
    {<item>.*?<title>(.*?)</title>.*?<link>(http.*?)</link>.*?</item>} \
    {<item [^>]*>.*?<title>(.*?)</title>.*?<link>(http.*?)</link>.*?</item>}]

  foreach upattern $upatterns {
    # -inline returns a flat list: whole match, title, link, whole match, ...
    set umatches [regexp -all -nocase -inline -- $upattern $upage]
    foreach {uwhole utitle ulink} $umatches {
      add_entry $dataname $dataprefix $ulink $utitle
    }
    if {[llength $umatches] > 0} {
      break
    }
  }

  return 0
}
103 | |
104 | |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
105 ### Open database, etc |
# Number of rows inserted during this run; incremented by add_entry.
set nitems 0

# Timestamp stored with every entry inserted during this run.
set currclock [clock seconds]

# Create the database command "feeds_db". No "global" declaration is
# needed here: this is already global scope, and feeds_db is a command,
# not a variable.
if {[catch {sqlite3 feeds_db $feeds_dbfile} uerrmsg]} {
  puts "Could not open SQLite3 database '$feeds_dbfile': $uerrmsg."
  exit 2
}
113 | |
114 | |
115 ############################################################################## | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
116 ### Fetch and parse Halla-aho's blog page data |
set datauri "http://www.halla-aho.com/scripta/"
set dataname "Mestari"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # The patterns are brace-quoted so that "\." reaches the regexp engine
  # as an escaped, literal dot. Inside a double-quoted Tcl string the
  # backslash is consumed by Tcl and the dot would match any character.
  #   1. links whose text is wrapped in <b>...</b>
  #   2. links with plain text
  # Each pattern captures the link first and the title second.
  set upatterns [list \
    {<a href="([^"]+\.html)"><b>([^<]+)</b>} \
    {<a href="([^"]+\.html)">([^<][^b][^<]+)</a>}]

  foreach upattern $upatterns {
    # -inline returns a flat list: whole match, link, title, whole match, ...
    foreach {uwhole ulink utitle} [regexp -all -nocase -inline -- $upattern $upage] {
      add_entry $dataname $datauri $ulink $utitle
    }
  }
}
137 | |
138 | |
### The Adventurers
# Scrape the chapter sidebar: every <a href="...">text</a> becomes an
# entry, with the link resolved against http://www.peldor.com/.
set datauri "http://www.peldor.com/chapters/index_sidebar.html"
set dataname "The Adventurers"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # -inline returns a flat list: whole match, link, title, whole match, ...
  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+)\">(\[^<\]+)</a>" $upage]
  foreach {uwhole ulink utitle} $umatches {
    add_entry $dataname "http://www.peldor.com/" $ulink $utitle
  }
}
154 | |
155 | |
### Order of the Stick
# Scrape the comic index: every link to /comics/oots<number>.html becomes
# an entry, with the link resolved against http://www.giantitp.com.
set datauri "http://www.giantitp.com/comics/oots.html"
set dataname "OOTS"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # The pattern is brace-quoted so that "\." reaches the regexp engine as
  # an escaped, literal dot. Inside a double-quoted Tcl string the
  # backslash is consumed by Tcl and the dot would match any character.
  # -inline returns a flat list: whole match, link, title, whole match, ...
  set umatches [regexp -all -nocase -inline -- {<a href="(/comics/oots[0-9]+\.html)">([^<]+)</a>} $upage]
  foreach {uwhole ulink utitle} $umatches {
    add_entry $dataname "http://www.giantitp.com" $ulink $utitle
  }
}
171 | |
172 | |
69
df3230f8aa46
Translate some comments to english and cosmetic fixes.
Matti Hamalainen <ccr@tnsp.org>
parents:
63
diff
changeset
|
173 ### Generic RSS-feed fetching |
143 | 174 #add_rss_feed "http://www.kaleva.fi/rss/145.xml" "Kaleva/Tiede" "" |
0 | 175 |
176 add_rss_feed "http://www.effi.org/xml/uutiset.rss" "EFFI" "" | |
177 | |
143 | 178 add_rss_feed "http://static.mtv3.fi/rss/uutiset_rikos.rss" "MTV3/Rikos" "" |
0 | 179 |
180 add_rss_feed "http://www.blastwave-comic.com/rss/blastwave.xml" "Blastwave" "" | |
181 | |
182 #add_rss_feed "http://lehti.samizdat.info/feed/" "Lehti" "" | |
183 | |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
184 |
0 | 185 |
139
3305e142eecc
Change feed fetcher to use SQLite3 backend.
Matti Hamalainen <ccr@tnsp.org>
parents:
114
diff
changeset
|
186 ### Close database |
140
b0648e05c855
Change some variable names, etc.
Matti Hamalainen <ccr@tnsp.org>
parents:
139
diff
changeset
|
187 feeds_db close |
142 | 188 |
189 puts "$nitems new items." |