changeset 101:372b63af72b5

urllog: Improve page character set encoding detection/guessing.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 13 Sep 2011 15:49:43 +0300
parents 8139169293f9
children 5425dc418505
files urllog.tcl
diffstat 1 files changed, 3 insertions(+), 4 deletions(-) [+]
line wrap: on
line diff
--- a/urllog.tcl	Mon Sep 12 19:14:14 2011 +0300
+++ b/urllog.tcl	Tue Sep 13 15:49:43 2011 +0300
@@ -477,12 +477,11 @@
   set ucode [::http::ncode $utoken]
   if {$ucode >= 200 && $ucode <= 309} {
     set udata [::http::data $utoken]
-    set umatches [regexp -nocase -inline -- "<meta.\*\?content=\".\*\?charset=(\[^\"\]*)\"/>" $udata]
     set uconvert 0
-    if {[llength $umatches] > 0} {
-      set uencoding [lindex $umatches 1]
+    if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/>" $udata umatches uencoding]} {
       if {[string length $uencoding] > 3} {
-        regsub -nocase "-" $uencoding "" uencoding
+        set uencoding [string tolower $uencoding]
+        regsub -- "iso-" $uencoding "iso" uencoding
         set uconvert 1
       }
     }