# HG changeset patch # User Matti Hamalainen # Date 1613755830 -7200 # Node ID 7192d94f8c281d8e85e88343f4671d0e6289831b # Parent 4b985abf5ababd5a65cca437954fb93188fa048a fetch_feeds: Copy improved HTTP request code from urllog script. diff -r 4b985abf5aba -r 7192d94f8c28 fetch_feeds.tcl --- a/fetch_feeds.tcl Fri Feb 19 19:29:51 2021 +0200 +++ b/fetch_feeds.tcl Fri Feb 19 19:30:30 2021 +0200 @@ -46,30 +46,79 @@ upvar $urlData udata upvar $urlMeta umeta - if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -binary 1 -headers {Accept-Encoding identity}]} uerrmsg]} { + set urlHeaders {} + lappend urlHeaders "Accept-Encoding" "identity" + #lappend urlHeaders "Connection" "keep-alive" + + ### Perform request + if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -binary 1 -headers $urlHeaders]} uerrmsg]} { puts "HTTP request failed: $uerrmsg" return 0 } + ### Check status set ustatus [::http::status $utoken] - if {$ustatus == "timeout"} { - puts "HTTP request timed out ($urlStr)" + set uscode [::http::code $utoken] + set ucode [::http::ncode $utoken] + + if {$ustatus != "ok"} { + puts "Error in HTTP request: $ustatus / $uscode ($urlStr)" return 0 } - if {$ustatus != "ok"} { - puts "Error in HTTP transaction: [::http::error $utoken] ($urlStr)" - return 0 - } - - set ustatus [::http::status $utoken] - set uscode [::http::code $utoken] - set ucode [::http::ncode $utoken] + ### Get data set udata [::http::data $utoken] array set umeta [::http::meta $utoken] ::http::cleanup $utoken - return 1 + ### Sanitize the metadata KEYS + foreach {ukey uvalue} [array get umeta] { + set ukey [string tolower $ukey] + set umeta($ukey) $uvalue + } + + ### Perform encoding conversion if necessary + if {$ucode >= 200 && $ucode <= 205} { + set uenc_doc "" + set uenc_http "" + set uencoding "" + + if {[info exists umeta(content-type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(content-type) -> uenc_http]} { + # Found character set encoding information in HTTP headers + } + + if {[regexp -nocase -- "" $udata -> uenc_doc]} { + # Found old style HTML meta tag with character set information + } elseif {[regexp -nocase -- "" $udata -> uenc_doc]} { + # Found HTML5 style meta tag with character set information + } + + # Make sanitized versions of the encoding strings + set uenc_http2 [fetch_sanitize_encoding $uenc_http] + set uenc_doc2 [fetch_sanitize_encoding $uenc_doc] + + # Check if the document has specified encoding + set uencoding $uenc_http2 + if {$uencoding == "" && $uenc_doc2 != ""} { + set uencoding $uenc_doc2 + } elseif {$uencoding == ""} { + # If _NO_ known encoding of any kind, assume the default of iso8859-1 + set uencoding "iso8859-1" + } + + #puts "Charsets: http='$uenc_http', doc='$uenc_doc' / sanitized http='$uenc_http2', doc='$uenc_doc2' -> '$uencoding'" + + # Get the document title, if any + if {$uencoding != ""} { + if {[catch {set udata [encoding convertfrom $uencoding $udata]} cerrmsg]} { + puts "Error in charset conversion: $urlStr: $cerrmsg" + return 0 + } + } + return 1 + } else { + return 0 + } }