view fetch_feeds.tcl @ 300:2a9ee3f68225

urllog: Make TLD check configurable.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 27 Jan 2015 08:23:22 +0200
parents a7455b0dc144
children d8b957796121
line wrap: on
line source

#!/usr/bin/tclsh
#
# NOTICE! Change above path to correct tclsh binary path!
#
##############################################################################
#
# FeedCheck fetcher v0.9 by Matti 'ccr' Hamalainen <ccr@tnsp.org>
# (C) Copyright 2008-2015 Tecnic Software productions (TNSP) 
#
# This script is freely distributable under GNU GPL (version 2) license.
#
##############################################################################
package require sqlite3
source [file dirname [info script]]/utillib.tcl

### The configuration should be in "config.feeds" in the same directory
### as this script. Alternatively, change the line below to point wherever
### you wish. See "config.feeds.example" for an example config file.
source [file dirname [info script]]/config.feeds


##############################################################################

package require http

### Configure the HTTP client: URL encoding, user agent, proxy and TLS.

# Use the built-in user agent string when http_user_agent is unset or
# empty; otherwise use the configured value.
if {![info exists http_user_agent] || $http_user_agent == ""} {
  ::http::config -urlencoding utf8 -useragent "Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.0) Opera 9.5"
} else {
  ::http::config -urlencoding utf8 -useragent $http_user_agent
}

# Route requests through a proxy only when http_use_proxy exists and is
# non-zero. http_proxy_host and http_proxy_port must then be set as well.
if {[info exists http_use_proxy]} {
  if {$http_use_proxy != 0} {
    ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
  }
}

# Register an https:// handler only when http_tls_support exists and is
# non-zero. Certificates are requested and required, using the CA
# directory given in http_tls_cadir.
if {[info exists http_tls_support]} {
  if {$http_tls_support != 0} {
    package require tls
    ::http::register https 443 [list ::tls::socket -request 1 -require 1 -tls1 1 -cadir $http_tls_cadir]
  }
}


##############################################################################

# add_entry -- store one feed item in the database unless it already exists.
#
#   uname   - feed name, stored in the "feed" column
#   uprefix - prefix prepended to the URL when it does not already start
#             with "http://" or "https://"
#   uurl    - item URL as scraped; passed through utl_convert_html_ent first
#             (presumably decodes HTML entities -- helper lives in utillib.tcl)
#   utitle  - item title, stored in the "title" column
#
# Side effects: inserts a row into table "feeds" stamped with the global
# currclock, and increments the global nitems for every row inserted.
# On an SQL error the statement and values are printed and the script
# exits with status 15.
proc add_entry {uname uprefix uurl utitle} {
  global currclock nitems

  set utmp [utl_convert_html_ent $uurl]
  if {[string match "http://*" $utmp] || [string match "https://*" $utmp]} {
    set utest $utmp
  } else {
    set utest "$uprefix$utmp"
  }

  # The SQL is brace-quoted on purpose: Tcl performs no substitution, and
  # the sqlite3 binding binds $uname, $utest etc. as statement parameters.
  # Scraped text therefore never becomes part of the SQL itself.
  if {[feeds_db exists {SELECT title FROM feeds WHERE url=$utest AND feed=$uname}]} {
    return
  }

  set usql {INSERT INTO feeds (feed,utime,url,title) VALUES ($uname, $currclock, $utest, $utitle)}
  if {[catch {feeds_db eval $usql} uerrmsg]} {
    puts "\nError: $uerrmsg on:\n$usql\n(feed='$uname', utime=$currclock, url='$utest', title='$utitle')"
    exit 15
  }

  # Count the item only once it has actually been stored.
  incr nitems
}


# add_rss_feed -- fetch an RSS document and record its items via add_entry.
#
#   datauri    - URL of the RSS feed
#   dataname   - feed name passed on to add_entry
#   dataprefix - URL prefix passed on to add_entry
#
# Returns 0 when the document was fetched (even if no item matched) and
# 1 when the transfer failed. Failures are reported on stdout.
#
# Items are extracted with regular expressions rather than an XML parser.
# Three item layouts are tried in order and the first one that yields any
# match is used:
#   1. <item> with a CDATA-wrapped <title>
#   2. <item> with a plain <title>
#   3. <item ...> carrying attributes, with a plain <title>
proc add_rss_feed {datauri dataname dataprefix} {
  if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
    puts "Error getting $datauri: $uerrmsg"
    return 1
  }

  # With -timeout, geturl does not raise on a timeout; it returns a token
  # whose status is not "ok". Treat every such outcome as a failed fetch.
  set ustatus [::http::status $utoken]
  if {$ustatus ne "ok"} {
    if {$ustatus eq "error"} {
      set ustatus [::http::error $utoken]
    }
    puts "Error getting $datauri: $ustatus"
    ::http::cleanup $utoken
    return 1
  }

  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  set upatterns [list \
    "<item>.\*\?<title><..CDATA.(.\*\?)\\\]\\\]></title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" \
    "<item>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" \
    "<item \[^>\]*>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>"]

  foreach upattern $upatterns {
    # -inline -all returns a flat list: full match, title, link, ...
    set umatches [regexp -all -nocase -inline -- $upattern $upage]
    if {[llength $umatches] > 0} {
      foreach {umatch utitle uurl} $umatches {
        add_entry $dataname $dataprefix $uurl $utitle
      }
      break
    }
  }

  return 0
}


### Initialise run-wide state and open the SQLite database.
# currclock is the timestamp written with every row inserted in this run;
# nitems counts the rows inserted and is reported at the end of the script.
set currclock [clock seconds]
set nitems 0

# "global" has no effect outside a proc body; kept as in the original.
global feeds_db

# Create the database command "feeds_db"; give up with exit status 2 when
# the database file cannot be opened.
if {[catch {sqlite3 feeds_db $feeds_dbfile} uerrmsg] != 0} {
  puts "Could not open SQLite3 database '$feeds_dbfile': $uerrmsg."
  exit 2
}


##############################################################################
### Fetch and parse Halla-aho's blog page data
# Links are scraped from the index page with two patterns, applied one
# after the other to the same page:
#   1. links whose text is wrapped in <b>...</b>
#   2. links with plain text (first character is not "<", second is not "b")
# Relative links are resolved against $datauri by add_entry.
set datauri "http://www.halla-aho.com/scripta/"
set dataname "Mestari"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} elseif {[::http::status $utoken] ne "ok"} {
  # With -timeout, geturl does not raise on a timeout; the token status
  # has to be checked explicitly.
  puts "Error getting $datauri: [::http::status $utoken]"
  ::http::cleanup $utoken
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # Patterns are brace-quoted so that "\." reaches the regexp engine as a
  # literal dot instead of being collapsed to "." by Tcl's own backslash
  # substitution.
  foreach upattern {
    {<a href="([^"]+\.html)"><b>([^<]+)</b>}
    {<a href="([^"]+\.html)">([^<][^b][^<]+)</a>}
  } {
    # -inline -all returns a flat list: full match, href, link text, ...
    foreach {umatch uurl utitle} [regexp -all -nocase -inline -- $upattern $upage] {
      add_entry $dataname $datauri $uurl $utitle
    }
  }
}


### The Adventurers
# Every <a href="...">text</a> link in the chapter sidebar becomes an entry.
# Relative links are resolved against http://www.peldor.com/ by add_entry.
set datauri "http://www.peldor.com/chapters/index_sidebar.html"
set dataname "The Adventurers"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} elseif {[::http::status $utoken] ne "ok"} {
  # With -timeout, geturl does not raise on a timeout; the token status
  # has to be checked explicitly.
  puts "Error getting $datauri: [::http::status $utoken]"
  ::http::cleanup $utoken
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # -inline -all returns a flat list: full match, href, link text, ...
  foreach {umatch uurl utitle} [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+)\">(\[^<\]+)</a>" $upage] {
    add_entry $dataname "http://www.peldor.com/" $uurl $utitle
  }
}


### Order of the Stick
# Links of the form /comics/oots<number>.html on the archive page become
# entries. They are site-relative, so add_entry prefixes them with
# http://www.giantitp.com.
set datauri "http://www.giantitp.com/comics/oots.html"
set dataname "OOTS"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} elseif {[::http::status $utoken] ne "ok"} {
  # With -timeout, geturl does not raise on a timeout; the token status
  # has to be checked explicitly.
  puts "Error getting $datauri: [::http::status $utoken]"
  ::http::cleanup $utoken
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken

  # The pattern is brace-quoted so that "\." reaches the regexp engine as a
  # literal dot instead of being collapsed to "." by Tcl's own backslash
  # substitution.
  # -inline -all returns a flat list: full match, href, link text, ...
  foreach {umatch uurl utitle} [regexp -all -nocase -inline -- {<a href="(/comics/oots[0-9]+\.html)">([^<]+)</a>} $upage] {
    add_entry $dataname "http://www.giantitp.com" $uurl $utitle
  }
}


### Generic RSS-feed fetching
# Disabled feeds, kept for reference:
#add_rss_feed "http://www.kaleva.fi/rss/145.xml" "Kaleva/Tiede" ""
#add_rss_feed "http://lehti.samizdat.info/feed/" "Lehti" ""

# Active feeds as (URL, feed name) pairs, fetched in the order listed.
# Every feed is fetched with an empty URL prefix.
foreach {ufeeduri ufeedname} {
  "http://www.effi.org/xml/uutiset.rss"              "EFFI"
  "http://static.mtv3.fi/rss/uutiset_rikos.rss"      "MTV3/Rikos"
  "http://www.blastwave-comic.com/rss/blastwave.xml" "Blastwave"
} {
  add_rss_feed $ufeeduri $ufeedname ""
}


### Close the database and report how many rows this run inserted
feeds_db close

puts "$nitems new items."