Mercurial > hg > egg-tcls
changeset 86:4c2b6482c08c
urllog: Different strategy for charset encoding conversion.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Mon, 12 Sep 2011 01:50:53 +0300 |
parents | a8278d55c6db |
children | 97c56d1e9ce2 |
files | urllog.tcl |
diffstat | 1 files changed, 7 insertions(+), 5 deletions(-) [+] |
line wrap: on
line diff
--- a/urllog.tcl Mon Sep 12 01:41:00 2011 +0300
+++ b/urllog.tcl Mon Sep 12 01:50:53 2011 +0300
@@ -138,7 +138,7 @@
 set urllog_tlds [split $urllog_tlds ","]
 set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"]
-set urllog_html_ent [split "‏||—|-|‪||‬||‎||å|å|Å|Å|é|é|:|:|ä|ä|ö|ö|ä|ä|ö|ö| | |-|-|”|\"|“|\"|»|>>|"|\"|ä|ä|ö|ö|Ä|Ä|Ö|Ö|&|&|<|<|>|>|ä|ä|ö|ö|Ä|Ä" "|"]
+set urllog_html_ent [split "—|-|‏||—|-|‪||‬||‎||å|Ã¥|Å|Ã…|é|é|:|:| | |”|\"|“|\"|»|>>|"|\"|ä|ä|ö|ö|Ä|Ä|Ö|Ö|&|&|<|<|>|>" "|"]
 ### Require packages
 package require sqlite3
@@ -474,17 +474,19 @@
 if {[llength $umatches] > 0} {
 set uencoding [lindex $umatches 1]
 if {[string length $uencoding] > 3} {
+ regsub -nocase "-" $uencoding "" uencoding
 set uconvert 1
 }
 }
+ if {$uconvert == 0} {
+ set uencoding "iso8859-1"
+ }
 set umatches [regexp -nocase -inline -- "<title>(.\*\?)</title>" $udata]
 if {[llength $umatches] > 0} {
 set urlTitle [lindex $umatches 1]
- if {$uconvert != 0} {
- if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} {
- urllog_log "Error in charset conversion: $cerrmsg"
- }
+ if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} {
+ urllog_log "Error in charset conversion: $cerrmsg"
 }
 set urlTitle [urllog_convert_ent $urlTitle]
 regsub -all "(^ *| *$)" $urlTitle "" urlTitle