view hae_feedit.tcl @ 138:743c1bea2498

Add utility functions for feeds database creation.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 04 Jun 2013 13:24:30 +0300
parents 593874678e45
children 3305e142eecc
line wrap: on
line source

#!/usr/bin/tclsh
#
# NOTICE! Change above path to correct tclsh binary path!
#
##########################################################################
#
# FeedCheck fetcher v0.7 by Matti 'ccr' Hamalainen <ccr@tnsp.org>
# (C) Copyright 2008-2011 Tecnic Software productions (TNSP) 
#
# This script is freely distributable under GNU GPL (version 2) license.
#
##########################################################################

# Datafile, MUST be set to same as in feeds.tcl
set datafile "/home/niinuska/bot/data.feeds"

# Use a HTTP proxy? 1 = yes, 0 = no
set http_proxy 0

# HTTP proxy address and port
set http_proxy_host "cache.inet.fi"
set http_proxy_port 800


##########################################################################
set html_ent [split "&#160;| |&raquo;|>>|&quot;|\"|&auml;|ä|&ouml;|ö|&Auml;|Ä|&Ouml;|Ö|&amp;|&|&lt;|<|&gt;|>|ä|ä|ö|ö|Ä|Ä" "|"]

package require http
::http::config -urlencoding iso8859-1 -useragent "Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.0) Opera 9.5"
if {$http_proxy != 0} {
  ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
}


proc convert_ent {udata} {
  global html_ent
  return [string map $html_ent $udata]
}


proc add_entry {uname uprefix uurl udesc} {
  global entries isentries newurls currclock
  set utest "$uprefix[convert_ent $uurl]"
  set isentries($utest) 1
  if {[catch {set utmp $entries($utest)}]} {
    set entries($utest) [list $currclock $uname $utest [convert_ent $udesc]]
    incr newurls
  }
}


proc add_rss_feed {datauri dataname dataprefix} {
  if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
    puts "Error getting $datauri: $uerrmsg"
    return 1
  }
  set upage [::http::data $utoken]
  ::http::cleanup $utoken
  
  set umatches [regexp -all -nocase -inline -- "<item>.\*\?<title><..CDATA.(.\*\?)\\\]\\\]></title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname $dataprefix [lindex $umatches [expr $n+2]] [lindex $umatches [expr $n+1]]
  }
  
  if {$nmatches == 0} {
  set umatches [regexp -all -nocase -inline -- "<item>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname $dataprefix [lindex $umatches [expr $n+2]] [lindex $umatches [expr $n+1]]
  }
  }

  if {$nmatches == 0} {
  set umatches [regexp -all -nocase -inline -- "<item \[^>\]*>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname $dataprefix [lindex $umatches [expr $n+2]] [lindex $umatches [expr $n+1]]
  }
  }
  return 0
}


##########################################################################
### Read in old data
set oldurls 0
set newurls 0
if {![catch {set ufile [open $datafile r 0600]} uerrmsg]} {
  while {![eof $ufile]} {
    gets $ufile uline
    set urec [split $uline "½"]
    if {[llength $urec] == 4} {
      set entries([lindex $urec 2]) $urec
      set isentries([lindex $urec 2]) 0
      incr oldurls
    }
  }
  close $ufile
}
set currclock [clock seconds]


##############################################################################
### Fetch and parse Halla-aho's blog page data
set datauri "http://www.halla-aho.com/scripta/";
set dataname "Mestari"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken
  
  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+\.html)\"><b>(\[^<\]+)</b>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname $datauri [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
  }

  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+\.html)\">(\[^<\]\[^b\]\[^<\]+)</a>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname $datauri [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
  }
}


### The Adventurers
set datauri "http://www.peldor.com/chapters/index_sidebar.html";
set dataname "The Adventurers"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken
  
  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+)\">(\[^<\]+)</a>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname "http://www.peldor.com/" [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
  }
}


### Order of the Stick
set datauri "http://www.giantitp.com/comics/oots.html";
set dataname "OOTS"
if {[catch {set utoken [::http::geturl $datauri -binary true -timeout 5000]} uerrmsg]} {
  puts "Error getting $datauri: $uerrmsg"
} else {
  set upage [::http::data $utoken]
  ::http::cleanup $utoken
  
  set umatches [regexp -all -nocase -inline -- "<a href=\"(/comics/oots\[0-9\]+\.html)\">(\[^<\]+)</a>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname "http://www.giantitp.com" [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
  }
}


### Generic RSS-feed fetching
add_rss_feed "http://www.kaleva.fi/rss/145.xml" "Kaleva/Tiede" ""

#add_rss_feed "http://sektori.com/?tpl=rssNewsFeed" "Sektori" ""

add_rss_feed "http://www.effi.org/xml/uutiset.rss" "EFFI" ""

add_rss_feed "http://www.mtv3.fi/rss/uutiset_rikos.rss" "MTV3/Rikos" ""

add_rss_feed "http://www.blastwave-comic.com/rss/blastwave.xml" "Blastwave" ""

#add_rss_feed "http://lehti.samizdat.info/feed/" "Lehti" ""

##############################################################################
### Open data file for writing
set tmpfname "$datafile.tmp"
if {[catch {set outfile [open $tmpfname w 0600]} uerrmsg]} {
  puts "Error opening $tmpfname for writing: $uerrmsg"
  return 1
}

set uexpire [expr [clock seconds] - (7*24*60*60)]
foreach {ukey udata} [array get entries] {
#  if {$isentries($ukey) != 0 || [lindex $udata 0] >= $uexpire} {
    puts $outfile [join $udata "½"]
#  }
}

close $outfile
if {[catch {file rename -force -- $tmpfname $datafile} uerrmsg]} {
  puts "Error renaming $tmpfname to $datafile: $uerrmsg"
}
#puts "$newurls new entries."