changeset 86:4c2b6482c08c

urllog: Different strategy for charset encoding conversion.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 12 Sep 2011 01:50:53 +0300
parents a8278d55c6db
children 97c56d1e9ce2
files urllog.tcl
diffstat 1 files changed, 7 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/urllog.tcl	Mon Sep 12 01:41:00 2011 +0300
+++ b/urllog.tcl	Mon Sep 12 01:50:53 2011 +0300
@@ -138,7 +138,7 @@
 set urllog_tlds [split $urllog_tlds ","]
 set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"] 
 
-set urllog_html_ent [split "&rlm;||&#8212;|-|&#x202a;||&#x202c;||&lrm;||&aring;|å|&Aring;|Å|&eacute;|é|&#58;|:|&#xe4;|ä|&#xf6;|ö|&#228;|ä|&#246;|ö|&nbsp;| |&#45;|-|&#8221;|\"|&#8220;|\"|&raquo;|>>|&quot;|\"|&auml;|ä|&ouml;|ö|&Auml;|Ä|&Ouml;|Ö|&amp;|&|&lt;|<|&gt;|>|ä|ä|ö|ö|Ä|Ä" "|"]
+set urllog_html_ent [split "—|-|&rlm;||&#8212;|-|&#x202a;||&#x202c;||&lrm;||&aring;|å|&Aring;|Å|&eacute;|é|&#58;|:|&nbsp;| |&#8221;|\"|&#8220;|\"|&raquo;|>>|&quot;|\"|&auml;|ä|&ouml;|ö|&Auml;|Ä|&Ouml;|Ö|&amp;|&|&lt;|<|&gt;|>" "|"]
 
 ### Require packages
 package require sqlite3
@@ -474,17 +474,19 @@
     if {[llength $umatches] > 0} {
       set uencoding [lindex $umatches 1]
       if {[string length $uencoding] > 3} {
+        regsub -- {^iso-} [string tolower $uencoding] "iso" uencoding ;# "--" ends switch parsing; keeps "utf-8", maps "ISO-8859-1" to "iso8859-1"
         set uconvert 1
       }
     }
+    if {$uconvert == 0} {
+      set uencoding "iso8859-1"
+    }
 
     set umatches [regexp -nocase -inline -- "<title>(.\*\?)</title>" $udata]
     if {[llength $umatches] > 0} {
       set urlTitle [lindex $umatches 1]
-      if {$uconvert != 0} {
-        if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} {
-          urllog_log "Error in charset conversion: $cerrmsg"
-        }
+      if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} {
+        urllog_log "Error in charset conversion: $cerrmsg"
       }
       set urlTitle [urllog_convert_ent $urlTitle]
       regsub -all "(^ *| *$)" $urlTitle "" urlTitle