Mercurial > hg > egg-tcls
view fetch_feeds.tcl @ 681:204699e84dee
utillib: Increase HTTP request timeout to 8 seconds from 6.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Thu, 15 Jul 2021 12:36:50 +0300 |
parents | 721c8cef5039 |
children |
line wrap: on
line source
#!/usr/bin/tclsh
#
# NOTICE! Change above path to correct tclsh binary path!
#
##############################################################################
#
# FeedCheck fetcher v1.1 by Matti 'ccr' Hamalainen <ccr@tnsp.org>
# (C) Copyright 2008-2021 Tecnic Software productions (TNSP)
#
# This script is freely distributable under GNU GPL (version 2) license.
#
# NOTICE! NOTICE! This script REQUIRES tcl-tls 1.7.13+ if you wish to
# support SSL/TLS https for URL checking. And you probably do.
#
##############################################################################

### The configuration should be in config.feeds in same directory
### as this script. Or change the line below to point where ever
### you wish. See "config.feeds.example" for an example config file.
source [file dirname [info script]]/config.feeds

### Required utillib.tcl
source [file dirname [info script]]/utillib.tcl

##############################################################################
package require sqlite3
package require http

##############################################################################
### Utility functions

# Normalize a charset name reported by HTTP headers or HTML meta tags into
# a form that [encoding convertfrom] accepts: strip a leading locale prefix
# such as "fi_FI.", lowercase everything, and fold "iso-..." to "iso...".
proc fetch_sanitize_encoding {uencoding} {
  regsub -- "^\[a-z\]\[a-z\]_\[A-Z\]\[A-Z\]\." $uencoding "" uencoding
  set uencoding [string tolower $uencoding]
  regsub -- "^iso-" $uencoding "iso" uencoding
  return $uencoding
}

# Perform an HTTP(S) request for urlStr via utl_http_do_request (utillib.tcl).
# The remaining arguments name caller variables (upvar) that receive the
# request status, status code string, numeric HTTP code, response body and
# the response meta array. On a 2xx response the body is converted from the
# charset declared in the HTTP headers or in an HTML <meta> tag (falling
# back to iso8859-1) into Tcl's internal encoding.
# Returns 1 on success, 0 on any failure (a message is printed via puts).
proc fetch_dorequest { urlStr urlStatus urlSCode urlCode urlData urlMeta } {
  upvar $urlStatus ustatus
  upvar $urlSCode uscode
  upvar $urlCode ucode
  upvar $urlData udata
  upvar $urlMeta umeta

  set urlHeaders {}
  lappend urlHeaders "Accept-Encoding" "identity"
  #lappend urlHeaders "Connection" "keep-alive"

  set uresult [utl_http_do_request $urlHeaders $urlStr ustatus uscode ucode udata umeta]
  if {$uresult == -1} {
    puts "HTTP request failed: $uscode ($urlStr)"
    return 0
  } elseif {$uresult < 0} {
    # BUGFIX: this branch called urllog_log, which is not defined in this
    # script (it belongs to the urllog bot) -- report via puts like the
    # rest of this file does.
    puts "Error in HTTP request: $ustatus / $uscode ($urlStr)"
    return 0
  }

  ### Perform encoding conversion if necessary
  if {$ucode >= 200 && $ucode <= 205} {
    set uenc_doc ""
    set uenc_http ""
    set uencoding ""
    if {[info exists umeta(content-type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(content-type) -> uenc_http]} {
      # Found character set encoding information in HTTP headers
    }
    if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} {
      # Found old style HTML meta tag with character set information
    } elseif {[regexp -nocase -- "<meta.\*\?charset=\"(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} {
      # Found HTML5 style meta tag with character set information
    }

    # Make sanitized versions of the encoding strings
    set uenc_http2 [fetch_sanitize_encoding $uenc_http]
    set uenc_doc2 [fetch_sanitize_encoding $uenc_doc]

    # HTTP header charset wins; then the document's own declaration;
    # if neither is present, assume the default of iso8859-1.
    set uencoding $uenc_http2
    if {$uencoding == "" && $uenc_doc2 != ""} {
      set uencoding $uenc_doc2
    } elseif {$uencoding == ""} {
      set uencoding "iso8859-1"
    }
    #puts "Charsets: http='$uenc_http', doc='$uenc_doc' / sanitized http='$uenc_http2', doc='$uenc_doc2' -> '$uencoding'"

    # Convert the response body to Tcl's internal encoding
    if {$uencoding != ""} {
      if {[catch {set udata [encoding convertfrom $uencoding $udata]} cerrmsg]} {
        puts "Error in charset conversion: $urlStr: $cerrmsg"
        return 0
      }
    }
    return 1
  } else {
    return 0
  }
}

# Insert one feed item (uname, resolved URL, title) into the "feeds" table,
# unless an identical feed+url row already exists. Relative uurl values are
# joined against uprefix. Increments the global new-item counter nitems on
# insert; exits the whole script with code 15 on an SQL error.
proc add_entry {uname uprefix uurl utitle} {
  global currclock feeds_db nitems

  set utmp [utl_convert_html_ent $uurl]
  if {[string match "http://*" $utmp] || [string match "https://*" $utmp]} {
    set utest "$utmp"
  } else {
    # Relative link: join prefix and path, inserting "/" only when
    # neither side already provides one
    if {[string range $uprefix end end] != "/" && [string range $utmp 0 0] != "/"} {
      set utest "$uprefix/$utmp"
    } else {
      set utest "$uprefix$utmp"
    }
  }

  # Use the sqlite3 Tcl interface's native $var binding instead of
  # string-built SQL: feed names, URLs and titles come from untrusted
  # remote pages. Binding stores the raw value; the previous utl_escape
  # based quoting stored the same value after SQL unquoting.
  set utitle [utl_convert_html_ent $utitle]
  if {![feeds_db exists {SELECT title FROM feeds WHERE url=$utest AND feed=$uname}]} {
    # puts "NEW: $utest : $utitle"
    incr nitems
    if {[catch {feeds_db eval {INSERT INTO feeds (feed,utime,url,title) VALUES ($uname, $currclock, $utest, $utitle)}} uerrmsg]} {
      puts "\nError: $uerrmsg on:\n$uname / $utest"
      exit 15
    }
  }
}

# Fetch an RSS feed from datauri and add its <item> entries via add_entry.
# Three progressively more lenient patterns are tried in order -- CDATA
# wrapped titles, plain titles, then <item> tags carrying attributes --
# and the first pattern that matches anything wins. Always returns 0.
proc add_rss_feed {datauri dataname dataprefix} {
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  # Each pattern captures (title, link) per matched <item>
  set upatterns {}
  lappend upatterns "<item>.\*\?<title><..CDATA.(.\*\?)\\\]\\\]></title>.\*\?<link>(http.\*\?)</link>.\*\?</item>"
  lappend upatterns "<item>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>"
  lappend upatterns "<item \[^>\]*>.\*\?<title>(.\*\?)</title>.\*\?<link>(http.\*\?)</link>.\*\?</item>"

  foreach upattern $upatterns {
    set umatches [regexp -all -nocase -inline -- $upattern $upage]
    set nmatches [llength $umatches]
    # Matches come back flat as {whole title link whole title link ...}
    for {set n 0} {$n < $nmatches} {incr n 3} {
      add_entry $dataname $dataprefix [lindex $umatches [expr {$n + 2}]] [lindex $umatches [expr {$n + 1}]]
    }
    if {$nmatches > 0} {
      break
    }
  }
  return 0
}

##############################################################################
### Fetch and parse Halla-aho's blog page data
proc fetch_halla_aho { } {
  set datauri "http://www.halla-aho.com/scripta/"
  set dataname "Mestari"
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  # Bolded links; captures are (href, title)
  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+\.html)\"><b>(\[^<\]+)</b>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname $datauri [lindex $umatches [expr {$n + 1}]] [lindex $umatches [expr {$n + 2}]]
  }

  # Plain (non-bold) links
  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+\.html)\">(\[^<\]\[^b\]\[^<\]+)</a>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname $datauri [lindex $umatches [expr {$n + 1}]] [lindex $umatches [expr {$n + 2}]]
  }
}

### The Adventurers
proc fetch_adventurers { } {
  set datauri "http://www.peldor.com/chapters/index_sidebar.html"
  set dataname "The Adventurers"
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  set umatches [regexp -all -nocase -inline -- "<a href=\"(\[^\"\]+)\">(\[^<\]+)</a>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname "http://www.peldor.com/" [lindex $umatches [expr {$n + 1}]] [lindex $umatches [expr {$n + 2}]]
  }
}

### Order of the Stick
proc fetch_oots { } {
  set datauri "https://www.giantitp.com/comics/oots.html"
  set dataname "OOTS"
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  set umatches [regexp -all -nocase -inline -- "<a href=\"(/comics/oots\[0-9\]+\.html)\">(\[^<\]+)</a>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 3} {
    add_entry $dataname "https://www.giantitp.com" [lindex $umatches [expr {$n + 1}]] [lindex $umatches [expr {$n + 2}]]
  }
}

### Poliisi tiedotteet
proc fetch_poliisi { datauri dataname dataprefix } {
  if {![fetch_dorequest $datauri ustatus uscode ucode upage umeta]} {
    return 0
  }

  # Captures are (date, href, title); the title is prefixed with the date
  set umatches [regexp -all -nocase -inline -- "<div class=\"channelitem\"><div class=\"date\">(.*?)</div><a class=\"article\" href=\"(\[^\"\]+)\">(\[^<\]+)</a>" $upage]
  set nmatches [llength $umatches]
  for {set n 0} {$n < $nmatches} {incr n 4} {
    set stmp [string trim [lindex $umatches [expr {$n + 3}]]]
    add_entry $dataname $dataprefix [lindex $umatches [expr {$n + 2}]] "[lindex $umatches [expr {$n + 1}]]: $stmp"
  }
}

##############################################################################
###
### Main code starts
###
##############################################################################

### Set up HTTP stuff
if {[info exists http_user_agent] && $http_user_agent != ""} {
  ::http::config -urlencoding utf8 -useragent $http_user_agent
} else {
  ::http::config -urlencoding utf8 -useragent "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0"
}

if {[info exists http_use_proxy] && $http_use_proxy != 0} {
  ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
}

if {[info exists http_tls_support] && $http_tls_support != 0} {
  package require tls
  ::http::register https 443 [list ::tls::socket -request true -require true -ssl2 false -ssl3 false -tls1 true -tls1.1 true -tls1.2 true -cadir $http_tls_cadir -autoservername true]
}

### Open database
set nitems 0
set currclock [clock seconds]

global feeds_db
if {[catch {sqlite3 feeds_db $feeds_db_file} uerrmsg]} {
  puts "Could not open SQLite3 database '${feeds_db_file}': ${uerrmsg}"
  exit 2
}

### Fetch the feeds (feeds_fetch is expected to come from config.feeds)
if {[catch {feeds_fetch} uerrmsg]} {
  puts "Error fetching feeds: $uerrmsg"
  feeds_db close
  exit 3
}

### Close database
feeds_db close
puts "$nitems new items."