#!/usr/bin/tclsh
#
# NOTICE! Change the above path to the correct tclsh binary path!
#
##############################################################################
#
# FeedCheck fetcher v1.1 by Matti 'ccr' Hamalainen <ccr@tnsp.org>
# (C) Copyright 2008-2021 Tecnic Software productions (TNSP)
#
# This script is freely distributable under GNU GPL (version 2) license.
#
# NOTICE! NOTICE! This script REQUIRES tcl-tls 1.7.13+ if you wish to
# support fetching feeds over SSL/TLS https. And you probably do.
#
##############################################################################

### The configuration should be in config.feeds in the same directory
### as this script, or change the line below to point wherever you
### wish. See "config.feeds.example" for an example config file.
source [file dirname [info script]]/config.feeds
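
### For reference, a minimal config.feeds might look roughly like the
### sketch below. This is only an illustrative guess: the variable and
### procedure names are taken from their uses later in this script, the
### values are placeholders, and "config.feeds.example" remains the
### authoritative reference.
#
#   set feeds_db_file "/path/to/feeds.sqlite"
#   set http_tls_support 1
#   set http_tls_cadir "/etc/ssl/certs"
#
#   proc feeds_fetch {} {
#     fetch_oots
#     add_rss_feed "https://example.com/feed.xml" "Example feed" "https://example.com"
#   }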

### utillib.tcl is required; it must reside in the same directory.
source [file dirname [info script]]/utillib.tcl


##############################################################################
package require sqlite3
package require http


##############################################################################
### Utility functions

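# Strip an optional locale prefix (e.g. "en_US.") from a character set
# name, lowercase it, and normalize a leading "iso-" to "iso" so that the
# result matches Tcl encoding names, e.g. "en_US.UTF-8" -> "utf-8" and
# "ISO-8859-1" -> "iso8859-1".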
proc fetch_sanitize_encoding {uencoding} {
  regsub -- "^\[a-z\]\[a-z\]_\[A-Z\]\[A-Z\]\." $uencoding "" uencoding
  set uencoding [string tolower $uencoding]
  regsub -- "^iso-" $uencoding "iso" uencoding
  return $uencoding
}


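# Fetch the given URL via utl_http_do_request (from utillib.tcl), storing
# status, code, data and metadata into the caller's variables. Returns 1
# when the request succeeds with a 2xx code and the body has been
# converted from its declared character encoding, 0 otherwise.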
proc fetch_dorequest { urlStr urlStatus urlSCode urlCode urlData urlMeta } {

  upvar $urlStatus ustatus
  upvar $urlSCode uscode
  upvar $urlCode ucode
  upvar $urlData udata
  upvar $urlMeta umeta

  set urlHeaders {}
  lappend urlHeaders "Accept-Encoding" "identity"
  #lappend urlHeaders "Connection" "keep-alive"

  set uresult [utl_http_do_request $urlHeaders $urlStr ustatus uscode ucode udata umeta]
  if {$uresult == -1} {
    puts "HTTP request failed: $uscode ($urlStr)"
    return 0
  } elseif {$uresult < 0} {
    puts "Error in HTTP request: $ustatus / $uscode ($urlStr)"
    return 0
  }

  ### Perform encoding conversion if necessary
  if {$ucode >= 200 && $ucode <= 205} {
    set uenc_doc ""
    set uenc_http ""
    set uencoding ""

    if {[info exists umeta(content-type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(content-type) -> uenc_http]} {
      # Found character set encoding information in HTTP headers
    }

    if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} {
      # Found old style HTML meta tag with character set information
    } elseif {[regexp -nocase -- "<meta.\*\?charset=\"(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} {
      # Found HTML5 style meta tag with character set information
    }

    # Make sanitized versions of the encoding strings
    set uenc_http2 [fetch_sanitize_encoding $uenc_http]
    set uenc_doc2 [fetch_sanitize_encoding $uenc_doc]

    # Prefer the encoding from the HTTP headers, then the one from the document
    set uencoding $uenc_http2
    if {$uencoding == "" && $uenc_doc2 != ""} {
      set uencoding $uenc_doc2
    } elseif {$uencoding == ""} {
      # If _NO_ known encoding of any kind, assume the default of iso8859-1
      set uencoding "iso8859-1"
    }

    #puts "Charsets: http='$uenc_http', doc='$uenc_doc' / sanitized http='$uenc_http2', doc='$uenc_doc2' -> '$uencoding'"

    # Convert the document data from the detected character encoding
    if {$uencoding != ""} {
      if {[catch {set udata [encoding convertfrom $uencoding $udata]} cerrmsg]} {
        puts "Error in charset conversion: $urlStr: $cerrmsg"
        return 0
      }
    }
    return 1
  } else {
    return 0
  }
}


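# Add a single feed item into the database, unless an entry with the same
# URL and feed name already exists. Relative links are resolved against
# the given URL prefix before storing.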
proc add_entry {uname uprefix uurl utitle} {
  global currclock feeds_db nitems

  set utmp [utl_convert_html_ent $uurl]
  if {[string match "http://*" $utmp] || [string match "https://*" $utmp]} {
    set utest "$utmp"
  } else {
    if {[string range $uprefix end end] != "/" && [string range $utmp 0 0] != "/"} {
      set utest "$uprefix/$utmp"
    } else {
      set utest "$uprefix$utmp"
    }
  }

  set usql "SELECT title FROM feeds WHERE url='[utl_escape $utest]' AND feed='[utl_escape $uname]'"
  if {![feeds_db exists $usql]} {
#    puts "NEW: $utest : $utitle"
    set usql "INSERT INTO feeds (feed,utime,url,title) VALUES ('[utl_escape $uname]', $currclock, '[utl_escape $utest]', '[utl_escape [utl_convert_html_ent $utitle]]')"
    incr nitems
    if {[catch {feeds_db eval $usql} uerrmsg]} {
      puts "\nError: $uerrmsg on:\n$usql"
      exit 15
    }
  }
}


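# Fetch an RSS feed from the given URI and scrape <item> title/link pairs
# with a few increasingly lenient regular expressions, adding each result
# under the given feed name. A typical call (with a hypothetical URL)
# would be:
#
#   add_rss_feed "https://example.com/feed.xml" "Example feed" "https://example.com"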
proc add_rss_feed {datauri dataname dataprefix} {
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  # First, try items whose titles are wrapped in CDATA sections
  set umatches [regexp -all -nocase -inline -- "<item>.\*\?<title><..CDATA.(.\*\?)\\\]\\\]></title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname $dataprefix [lindex $umatches [expr $n+2]] [lindex $umatches [expr $n+1]]
  }

  # Fall back to plain <title> elements
  if {$nmatches == 0} {
    set umatches [regexp -all -nocase -inline -- "<item>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" $upage]
    set nmatches [llength $umatches]
    for {set n 0} {$n < $nmatches} {incr n 3} {
      add_entry $dataname $dataprefix [lindex $umatches [expr $n+2]] [lindex $umatches [expr $n+1]]
    }
  }

  # Finally, try <item> tags that carry attributes
  if {$nmatches == 0} {
    set umatches [regexp -all -nocase -inline -- "<item \[^>\]*>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>" $upage]
    set nmatches [llength $umatches]
    for {set n 0} {$n < $nmatches} {incr n 3} {
      add_entry $dataname $dataprefix [lindex $umatches [expr $n+2]] [lindex $umatches [expr $n+1]]
    }
  }

  return $nmatches
}


##############################################################################
### Fetch and parse Halla-aho's blog page data
proc fetch_halla_aho { } {
  set datauri "http://www.halla-aho.com/scripta/"
  set dataname "Mestari"
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+\.html)\"><b>(\[^<\]+)</b>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname $datauri [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
  }

  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+\.html)\">(\[^<\]\[^b\]\[^<\]+)</a>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname $datauri [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
  }
}


### The Adventurers
proc fetch_adventurers { } {
  set datauri "http://www.peldor.com/chapters/index_sidebar.html"
  set dataname "The Adventurers"
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+)\">(\[^<\]+)</a>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname "http://www.peldor.com/" [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
  }
}


### Order of the Stick
proc fetch_oots { } {
  set datauri "https://www.giantitp.com/comics/oots.html"
  set dataname "OOTS"
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  set umatches [regexp -all -nocase -inline -- "<a href=\"(/comics/oots\[0-9\]+\.html)\">(\[^<\]+)</a>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname "https://www.giantitp.com" [lindex $umatches [expr $n+1]] [lindex $umatches [expr $n+2]]
  }
}


### Finnish police bulletins ("Poliisi tiedotteet")
proc fetch_poliisi { datauri dataname dataprefix } {
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  set umatches [regexp -all -nocase -inline -- "<div class=\"channelitem\"><div class=\"date\">(.*?)</div><a class=\"article\" href=\"(\[^\"\]+)\">(\[^<\]+)</a>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 4} {
    set stmp [string trim [lindex $umatches [expr $n+3]]]
    add_entry $dataname $dataprefix [lindex $umatches [expr $n+2]] "[lindex $umatches [expr $n+1]]: $stmp"
  }
}




##############################################################################
###
### Main code starts
###
##############################################################################
### Set up HTTP stuff
if {[info exists http_user_agent] && $http_user_agent != ""} {
  ::http::config -urlencoding utf8 -useragent $http_user_agent
} else {
  ::http::config -urlencoding utf8 -useragent "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0"
}

if {[info exists http_use_proxy] && $http_use_proxy != 0} {
  ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
}

if {[info exists http_tls_support] && $http_tls_support != 0} {
  package require tls
  ::http::register https 443 [list ::tls::socket -request true -require true -ssl2 false -ssl3 false -tls1 true -tls1.1 true -tls1.2 true -cadir $http_tls_cadir -autoservername true]
}

### Open database
set nitems 0
set currclock [clock seconds]
global feeds_db
if {[catch {sqlite3 feeds_db $feeds_db_file} uerrmsg]} {
  puts "Could not open SQLite3 database '${feeds_db_file}': ${uerrmsg}"
  exit 2
}
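
### The feeds table is assumed to look roughly like the sketch below.
### This is inferred from the queries in add_entry above, not from an
### actual schema definition, which does not appear in this script.
#
#   CREATE TABLE feeds (
#     feed  TEXT,
#     utime INTEGER,
#     url   TEXT,
#     title TEXT
#   );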


### Fetch the feeds
if {[catch {feeds_fetch} uerrmsg]} {
  puts "Error fetching feeds: $uerrmsg"
  feeds_db close
  exit 3
}


### Close database
feeds_db close

puts "$nitems new items."