Mercurial > hg > egg-tcls

diff fetch_feeds.tcl @ 159:bbc7860c22a6
Renamed.
author: Matti Hamalainen <ccr@tnsp.org>
date: Mon, 02 Jun 2014 12:29:47 +0300
parents: get_feeds.tcl@48460e925a8c
children: 908edc54005a
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fetch_feeds.tcl	Mon Jun 02 12:29:47 2014 +0300
@@ -0,0 +1,189 @@
+#!/usr/bin/tclsh
+#
+# NOTICE! Change above path to correct tclsh binary path!
+#
+##########################################################################
+#
+# FeedCheck fetcher v0.8 by Matti 'ccr' Hamalainen <ccr@tnsp.org>
+# (C) Copyright 2008-2013 Tecnic Software productions (TNSP) 
+#
+# This script is freely distributable under GNU GPL (version 2) license.
+#
+##########################################################################
+package require sqlite3
+source [file dirname [info script]]/util_convert.tcl
+
+# SQLite3 database, MUST be set to same as in feeds.tcl
+set feeds_dbfile "/home/niinuska/bot/feeds.sqlite"
+
+# Use a HTTP proxy? 1 = yes, 0 = no
+set http_proxy 0
+
+# HTTP proxy address and port
+set http_proxy_host "cache.inet.fi"
+set http_proxy_port 800
+
+
+##########################################################################
+
+set    feeds_ent_str "&#45;|-|&#39;|'|—|-|&rlm;||&#8212;|-|&#8211;|--|&#x202a;||&#x202c;|"
+append feeds_ent_str "|&lrm;||&aring;|å|&Aring;|Å|&eacute;|é|&#58;|:|&nbsp;| "
+append feeds_ent_str "|&#8221;|\"|&#8220;|\"|&laquo;|<<|&raquo;|>>|&quot;|\""
+append feeds_ent_str "|&auml;|ä|&ouml;|ö|&Auml;|Ä|&Ouml;|Ö|&amp;|&|&lt;|<|&gt;|>"
+append feeds_ent_str "|&#228;|ä|&#229;|ö|&mdash;|-|&#039;|'|&ndash;|-|&#034;|\""
+append feeds_ent_str "|&#124;|-|&#8217;|'|&uuml;|ü|&Uuml;|Ü|&bull;|*|&euro;|€"
+append feeds_ent_str "|&rdquo;|\""
+set html_ent [split [encoding convertfrom "utf-8" $feeds_ent_str] "|"]
+
+package require http
+::http::config -urlencoding iso8859-1 -useragent "Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.0) Opera 9.5"
+if {$http_proxy != 0} {
+  ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
+}
+
+
+proc convert_ent {udata} {
+  global html_ent
+  return [string map -nocase $html_ent [string map $html_ent $udata]]
+}
+
+proc add_entry {uname uprefix uurl utitle} {
+  global currclock feeds_db nitems
+  set utmp [convert_ent $uurl]
+  if {[string match "http://*" $utmp] || [string match "https://*" $utmp]} {
+    set utest "$utmp"
+  } else {
+    set utest "$uprefix$utmp"
+  }
+
+  set usql "SELECT title FROM feeds WHERE url='[escape $utest]' AND feed='[escape $uname]'"
+  if {![feeds_db exists $usql]} {
+    set usql "INSERT INTO feeds (feed,utime,url,title) VALUES ('[escape $uname]', $currclock, '[escape $utest]', '[escape $utitle]')"
+    incr nitems
+    if {[catch {feeds_db eval $usql} uerrmsg]} {
+      puts "\nError: $uerrmsg on:\n$usql"
+      exit 15
+    }
+  }
+}
+
+
+proc add_rss_feed {datauri dataname dataprefix} {
+  if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
+    puts "Error getting $datauri: $uerrmsg"
+    return 1
+  }
+  set upage [::http::data $utoken]
+  ::http::cleanup $utoken
+  
+  set umatches [regexp -all -nocase -inline -- "<item>.\*\?<title><..CDATA.(.\*\?)\\\]\\\]></title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" $upage]
+  set nmatches [llength $umatches]
+  for {set n 0} {$n < $nmatches} {incr n 3} {
+    add_entry $dataname $dataprefix [lindex $umatches [expr $n+2]] [lindex $umatches [expr $n+1]]
+  }
+  
+  if {$nmatches == 0} {
+  set umatches [regexp -all -nocase -inline -- "<item>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" $upage]
+  set nmatches [llength $umatches]
+  for {set n 0} {$n < $nmatches} {incr n 3} {
+    add_entry $dataname $dataprefix [lindex $umatches [expr $n+2]] [lindex $umatches [expr $n+1]]
+  }
+  }
+
+  if {$nmatches == 0} {
+  set umatches [regexp -all -nocase -inline -- "<item \[^>\]*>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" $upage]
+  set nmatches [llength $umatches]
+  for {set n 0} {$n < $nmatches} {incr n 3} {
+    add_entry $dataname $dataprefix [lindex $umatches [expr $n+2]] [lindex $umatches [expr $n+1]]
+  }
+  }
+
+  return 0
+}
+
+
+### Open database, etc
+set nitems 0
+set currclock [clock seconds]
+global feeds_db
+if {[catch {sqlite3 feeds_db $feeds_dbfile} uerrmsg]} {
+  puts "Could not open SQLite3 database '$feeds_dbfile': $uerrmsg."
+  exit 2
+}
+
+
+##############################################################################
+### Fetch and parse Halla-aho's blog page data
+set datauri "http://www.halla-aho.com/scripta/";
+set dataname "Mestari"
+if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
+  puts "Error getting $datauri: $uerrmsg"
+} else {
+  set upage [::http::data $utoken]
+  ::http::cleanup $utoken
+  
+  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+\.html)\"><b>(\[^<\]+)</b>" $upage]
+  set nmatches [llength $umatches]
+  for {set n 0} {$n < $nmatches} {incr n 3} {
+    add_entry $dataname $datauri [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
+  }
+
+  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+\.html)\">(\[^<\]\[^b\]\[^<\]+)</a>" $upage]
+  set nmatches [llength $umatches]
+  for {set n 0} {$n < $nmatches} {incr n 3} {
+    add_entry $dataname $datauri [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
+  }
+}
+
+
+### The Adventurers
+set datauri "http://www.peldor.com/chapters/index_sidebar.html";
+set dataname "The Adventurers"
+if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
+  puts "Error getting $datauri: $uerrmsg"
+} else {
+  set upage [::http::data $utoken]
+  ::http::cleanup $utoken
+  
+  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+)\">(\[^<\]+)</a>" $upage]
+  set nmatches [llength $umatches]
+  for {set n 0} {$n < $nmatches} {incr n 3} {
+    add_entry $dataname "http://www.peldor.com/" [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
+  }
+}
+
+
+### Order of the Stick
+set datauri "http://www.giantitp.com/comics/oots.html";
+set dataname "OOTS"
+if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
+  puts "Error getting $datauri: $uerrmsg"
+} else {
+  set upage [::http::data $utoken]
+  ::http::cleanup $utoken
+  
+  set umatches [regexp -all -nocase -inline -- "<a href=\"(/comics/oots\[0-9\]+\.html)\">(\[^<\]+)</a>" $upage]
+  set nmatches [llength $umatches]
+  for {set n 0} {$n < $nmatches} {incr n 3} {
+    add_entry $dataname "http://www.giantitp.com" [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
+  }
+}
+
+
+### Generic RSS-feed fetching
+#add_rss_feed "http://www.kaleva.fi/rss/145.xml" "Kaleva/Tiede" ""
+
+add_rss_feed "http://www.effi.org/xml/uutiset.rss" "EFFI" ""
+
+add_rss_feed "http://static.mtv3.fi/rss/uutiset_rikos.rss" "MTV3/Rikos" ""
+
+add_rss_feed "http://www.blastwave-comic.com/rss/blastwave.xml" "Blastwave" ""
+
+#add_rss_feed "http://lehti.samizdat.info/feed/" "Lehti" ""
+
+
+
+### Close database
+feeds_db close
+
+puts "$nitems new items."
author	Matti Hamalainen <ccr@tnsp.org>
date	Mon, 02 Jun 2014 12:29:47 +0300
parents	get_feeds.tcl@48460e925a8c
children	908edc54005a