changeset 656:7192d94f8c28

fetch_feeds: Copy improved HTTP request code from urllog script.
author Matti Hamalainen <ccr@tnsp.org>
date Fri, 19 Feb 2021 19:30:30 +0200
parents 4b985abf5aba
children f46c152183a2
files fetch_feeds.tcl
diffstat 1 files changed, 61 insertions(+), 12 deletions(-) [+]
line wrap: on
line diff
--- a/fetch_feeds.tcl	Fri Feb 19 19:29:51 2021 +0200
+++ b/fetch_feeds.tcl	Fri Feb 19 19:30:30 2021 +0200
@@ -46,30 +46,79 @@
   upvar $urlData udata
   upvar $urlMeta umeta
 
-  if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -binary 1 -headers {Accept-Encoding identity}]} uerrmsg]} {
+  set urlHeaders {}
+  lappend urlHeaders "Accept-Encoding" "identity"
+  #lappend urlHeaders "Connection" "keep-alive"
+
+  ### Perform request
+  if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -binary 1 -headers $urlHeaders]} uerrmsg]} {
     puts "HTTP request failed: $uerrmsg"
     return 0
   }
 
+  ### Check status
   set ustatus [::http::status $utoken]
-  if {$ustatus == "timeout"} {
-    puts "HTTP request timed out ($urlStr)"
+  set uscode [::http::code $utoken]
+  set ucode [::http::ncode $utoken]
+
+  if {$ustatus != "ok"} {
+    puts "Error in HTTP request: $ustatus / $uscode ($urlStr)"
     return 0
   }
 
-  if {$ustatus != "ok"} {
-    puts "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
-    return 0
-  }
-
-  set ustatus [::http::status $utoken]
-  set uscode [::http::code $utoken]
-  set ucode [::http::ncode $utoken]
+  ### Get data
   set udata [::http::data $utoken]
   array set umeta [::http::meta $utoken]
   ::http::cleanup $utoken
 
-  return 1
+  ### Sanitize the metadata KEYS
+  foreach {ukey uvalue} [array get umeta] {
+    set ukey [string tolower $ukey]
+    set umeta($ukey) $uvalue
+  }
+
+  ### Perform encoding conversion if necessary
+  if {$ucode >= 200 && $ucode <= 205} {
+    set uenc_doc ""
+    set uenc_http ""
+    set uencoding ""
+
+    if {[info exists umeta(content-type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(content-type) -> uenc_http]} {
+      # Found character set encoding information in HTTP headers
+    }
+
+    if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} {
+      # Found old style HTML meta tag with character set information
+    } elseif {[regexp -nocase -- "<meta.\*\?charset=\"(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} {
+      # Found HTML5 style meta tag with character set information
+    }
+
+    # Make sanitized versions of the encoding strings
+    set uenc_http2 [fetch_sanitize_encoding $uenc_http]
+    set uenc_doc2 [fetch_sanitize_encoding $uenc_doc]
+
+    # Check if the document has specified encoding
+    set uencoding $uenc_http2
+    if {$uencoding == "" && $uenc_doc2 != ""} {
+      set uencoding $uenc_doc2
+    } elseif {$uencoding == ""} {
+      # If _NO_ known encoding of any kind, assume the default of iso8859-1
+      set uencoding "iso8859-1"
+    }
+
+    #puts "Charsets: http='$uenc_http', doc='$uenc_doc' / sanitized http='$uenc_http2', doc='$uenc_doc2' -> '$uencoding'"
+
+    # Get the document title, if any
+    if {$uencoding != ""} {
+      if {[catch {set udata [encoding convertfrom $uencoding $udata]} cerrmsg]} {
+        puts "Error in charset conversion: $urlStr: $cerrmsg"
+        return 0
+      }
+    }
+    return 1
+  } else {
+    return 0
+  }
 }