Mercurial > hg > egg-tcls
comparison urllog.tcl @ 116:4f3edcf72987
urllog: Improvements in document / HTTP encoding handling and conversion.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Thu, 13 Oct 2011 20:13:09 +0300 |
parents | 5db02af76016 |
children | d40a0f3af7ab |
comparison
equal
deleted
inserted
replaced
115:5db02af76016 | 116:4f3edcf72987 |
---|---|
259 | 259 |
260 | 260 |
261 proc urllog_escape { str } { | 261 proc urllog_escape { str } { |
262 return [string map {' ''} $str] | 262 return [string map {' ''} $str] |
263 } | 263 } |
264 | |
265 | |
266 proc urllog_sanitize_encoding {uencoding} { | |
267 regsub -- "^\[a-z\]\[a-z\]_\[A-Z\]\[A-Z\]\." $uencoding "" uencoding | |
268 set uencoding [string tolower $uencoding] | |
269 regsub -- "^iso-" $uencoding "iso" uencoding | |
270 return $uencoding | |
271 } | |
272 | |
264 | 273 |
265 #------------------------------------------------------------------------- | 274 #------------------------------------------------------------------------- |
266 proc urllog_get_short {utime} { | 275 proc urllog_get_short {utime} { |
267 global urllog_shorturl_prefix | 276 global urllog_shorturl_prefix |
268 | 277 |
475 return 0 | 484 return 0 |
476 } | 485 } |
477 | 486 |
478 # Fixme! Handle redirects! | 487 # Fixme! Handle redirects! |
479 set ucode [::http::ncode $utoken] | 488 set ucode [::http::ncode $utoken] |
489 set udata [::http::data $utoken] | |
490 array set umeta [::http::meta $utoken] | |
491 ::http::cleanup $utoken | |
492 | |
480 if {$ucode >= 200 && $ucode <= 309} { | 493 if {$ucode >= 200 && $ucode <= 309} { |
481 set udata [::http::data $utoken] | 494 set uenc_doc "" |
482 set uconvert 0 | 495 set uenc_http "" |
483 if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/>" $udata umatches uencoding]} { | 496 set uencoding "" |
484 if {[string length $uencoding] > 3} { | 497 |
485 set uencoding [string tolower $uencoding] | 498 # Get information about specified character encodings |
486 regsub -- "iso-" $uencoding "iso" uencoding | 499 if {[info exists umeta(Content-Type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(Content-Type) umatches uenc_http]} { |
487 set uconvert 1 | 500 # Found character set encoding information in HTTP headers |
501 } | |
502 | |
503 if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/>" $udata umatches uenc_doc]} { | |
504 # Found old style HTML meta tag with character set information | |
505 } elseif {[regexp -nocase -- "<meta.\*\?charset=\"(\[^\"\]*)\".\*\?/>" $udata umatches uenc_doc]} { | |
506 # Found HTML5 style meta tag with character set information | |
507 } | |
508 | |
509 # Make sanitized versions of the encoding strings | |
510 set uenc_http2 [urllog_sanitize_encoding $uenc_http] | |
511 set uenc_doc2 [urllog_sanitize_encoding $uenc_doc] | |
512 | |
513 # KLUDGE! | |
514 set uencoding $uenc_http2 | |
515 | |
516 # Check if the document has specified encoding | |
517 if {$uenc_doc != ""} { | |
518 # Does it differ from what HTTP says? | |
519 if {$uenc_http != "" && $uenc_doc != $uenc_http && $uenc_doc2 != $uenc_http2} { | |
520 # Yes, we will try reconverting | |
521 set uencoding $uenc_doc2 | |
488 } | 522 } |
489 } | 523 } elseif {$uenc_http == ""} { |
490 if {$uconvert == 0} { | 524 # If _NO_ known encoding of any kind, assume the default of iso8859-1 |
491 set uencoding "iso8859-1" | 525 set uencoding "iso8859-1" |
492 } | 526 } |
493 | 527 |
494 set umatches [regexp -nocase -inline -- "<title>(.\*\?)</title>" $udata] | 528 # Get the document title, if any |
495 if {[llength $umatches] > 0} { | 529 set urlTitle "" |
496 set urlTitle [lindex $umatches 1] | 530 if {[regexp -nocase -- "<title>(.\*\?)</title>" $udata umatches urlTitle]} { |
497 if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} { | 531 # If character set conversion is required, do it now |
498 urllog_log "Error in charset conversion: $cerrmsg" | 532 if {$uencoding != ""} { |
533 putlog "conversion requested from $uencoding" | |
534 if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} { | |
535 urllog_log "Error in charset conversion: $cerrmsg" | |
536 } | |
499 } | 537 } |
500 set urlTitle [urllog_convert_ent $urlTitle] | 538 |
501 regsub -all "(^ *| *$)" $urlTitle "" urlTitle | 539 # Convert some HTML entities to plaintext and do some cleanup |
502 } else { | 540 set utmp [urllog_convert_ent $urlTitle] |
503 set urlTitle "" | 541 regsub -all "\r|\n|\t" $utmp " " utmp |
542 regsub -all " *" $utmp " " utmp | |
543 set urlTitle [string trim $utmp] | |
504 } | 544 } |
505 | 545 |
506 # Rasiatube hack | 546 # Rasiatube hack |
507 if {[string match "*/rasiatube/view*" $urlStr]} { | 547 if {[string match "*/rasiatube/view*" $urlStr]} { |
508 set rasia 0 | 548 set rasia 0 |
530 if {[urllog_exists $urlStr $urlNick $urlHost $urlChan]} { | 570 if {[urllog_exists $urlStr $urlNick $urlHost $urlChan]} { |
531 urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle | 571 urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle |
532 } | 572 } |
533 return 1 | 573 return 1 |
534 } else { | 574 } else { |
535 urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::code $utoken])" | 575 urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ($ucode)" |
536 urllog_log "[::http::code $utoken] - $urlStr" | 576 urllog_log "$ucode - $urlStr" |
537 } | 577 } |
538 | 578 |
539 ::http::cleanup $utoken | 579 ::http::cleanup $utoken |
540 } | 580 } |
541 | 581 |