changeset 116:4f3edcf72987

urllog: Improvements in document / HTTP encoding handling and conversion.
author Matti Hamalainen <ccr@tnsp.org>
date Thu, 13 Oct 2011 20:13:09 +0300
parents 5db02af76016
children d40a0f3af7ab
files urllog.tcl
diffstat 1 files changed, 60 insertions(+), 20 deletions(-) [+]
line wrap: on
line diff
--- a/urllog.tcl	Thu Oct 13 20:11:02 2011 +0300
+++ b/urllog.tcl	Thu Oct 13 20:13:09 2011 +0300
@@ -262,6 +262,15 @@
   return [string map {' ''} $str]
 }
 
+
+proc urllog_sanitize_encoding {uencoding} {
+  regsub -- "^\[a-z\]\[a-z\]_\[A-Z\]\[A-Z\]\." $uencoding "" uencoding
+  set uencoding [string tolower $uencoding]
+  regsub -- "^iso-" $uencoding "iso" uencoding
+  return $uencoding
+}
+
+
 #-------------------------------------------------------------------------
 proc urllog_get_short {utime} {
   global urllog_shorturl_prefix
@@ -477,30 +486,61 @@
 
   # Fixme! Handle redirects!
   set ucode [::http::ncode $utoken]
+  set udata [::http::data $utoken]
+  array set umeta [::http::meta $utoken]
+  ::http::cleanup $utoken
+
   if {$ucode >= 200 && $ucode <= 309} {
-    set udata [::http::data $utoken]
-    set uconvert 0
-    if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/>" $udata umatches uencoding]} {
-      if {[string length $uencoding] > 3} {
-        set uencoding [string tolower $uencoding]
-        regsub -- "iso-" $uencoding "iso" uencoding
-        set uconvert 1
+    set uenc_doc ""
+    set uenc_http ""
+    set uencoding ""
+
+    # Get information about specified character encodings
+    if {[info exists umeta(Content-Type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(Content-Type) umatches uenc_http]} {
+      # Found character set encoding information in HTTP headers
+    }
+
+    if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/>" $udata umatches uenc_doc]} {
+      # Found old style HTML meta tag with character set information
+    } elseif {[regexp -nocase -- "<meta.\*\?charset=\"(\[^\"\]*)\".\*\?/>" $udata umatches uenc_doc]} {
+      # Found HTML5 style meta tag with character set information
+    }
+
+    # Make sanitized versions of the encoding strings
+    set uenc_http2 [urllog_sanitize_encoding $uenc_http]
+    set uenc_doc2 [urllog_sanitize_encoding $uenc_doc]
+
+    # KLUDGE!
+    set uencoding $uenc_http2
+
+    # Check if the document has specified encoding
+    if {$uenc_doc != ""} {
+      # Does it differ from what HTTP says?
+      if {$uenc_http != "" && $uenc_doc != $uenc_http && $uenc_doc2 != $uenc_http2} {
+        # Yes, we will try reconverting
+        set uencoding $uenc_doc2
       }
-    }
-    if {$uconvert == 0} {
+    } elseif {$uenc_http == ""} {
+      # If _NO_ known encoding of any kind, assume the default of iso8859-1    
       set uencoding "iso8859-1"
     }
 
-    set umatches [regexp -nocase -inline -- "<title>(.\*\?)</title>" $udata]
-    if {[llength $umatches] > 0} {
-      set urlTitle [lindex $umatches 1]
-      if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} {
-        urllog_log "Error in charset conversion: $cerrmsg"
+    # Get the document title, if any
+    set urlTitle ""
+    if {[regexp -nocase -- "<title>(.\*\?)</title>" $udata umatches urlTitle]} {
+      # If character set conversion is required, do it now
+      if {$uencoding != ""} {
+      	putlog "conversion requested from $uencoding"
+      	if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} {
+          urllog_log "Error in charset conversion: $cerrmsg"
+        }
       }
-      set urlTitle [urllog_convert_ent $urlTitle]
-      regsub -all "(^ *| *$)" $urlTitle "" urlTitle
-    } else {
-      set urlTitle ""
+      
+      # Convert some HTML entities to plaintext and do some cleanup
+      set utmp [urllog_convert_ent $urlTitle]
+      regsub -all "\r|\n|\t" $utmp " " utmp
+      regsub -all "  *" $utmp " " utmp
+      set urlTitle [string trim $utmp]
     }
 
     # Rasiatube hack
@@ -532,8 +572,8 @@
     }
     return 1
   } else {
-    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::code $utoken])"
-    urllog_log "[::http::code $utoken] - $urlStr"
+    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ($ucode)"
+    urllog_log "$ucode - $urlStr"
   }
 
   ::http::cleanup $utoken