comparison urllog.tcl @ 116:4f3edcf72987

urllog: Improvements in document / HTTP encoding handling and conversion.
author Matti Hamalainen <ccr@tnsp.org>
date Thu, 13 Oct 2011 20:13:09 +0300
parents 5db02af76016
children d40a0f3af7ab
comparison
equal deleted inserted replaced
115:5db02af76016 116:4f3edcf72987
259 259
260 260
261 proc urllog_escape { str } { 261 proc urllog_escape { str } {
262 return [string map {' ''} $str] 262 return [string map {' ''} $str]
263 } 263 }
264
265
266 proc urllog_sanitize_encoding {uencoding} {
267 regsub -- "^\[a-z\]\[a-z\]_\[A-Z\]\[A-Z\]\." $uencoding "" uencoding
268 set uencoding [string tolower $uencoding]
269 regsub -- "^iso-" $uencoding "iso" uencoding
270 return $uencoding
271 }
272
264 273
265 #------------------------------------------------------------------------- 274 #-------------------------------------------------------------------------
266 proc urllog_get_short {utime} { 275 proc urllog_get_short {utime} {
267 global urllog_shorturl_prefix 276 global urllog_shorturl_prefix
268 277
475 return 0 484 return 0
476 } 485 }
477 486
478 # Fixme! Handle redirects! 487 # Fixme! Handle redirects!
479 set ucode [::http::ncode $utoken] 488 set ucode [::http::ncode $utoken]
489 set udata [::http::data $utoken]
490 array set umeta [::http::meta $utoken]
491 ::http::cleanup $utoken
492
480 if {$ucode >= 200 && $ucode <= 309} { 493 if {$ucode >= 200 && $ucode <= 309} {
481 set udata [::http::data $utoken] 494 set uenc_doc ""
482 set uconvert 0 495 set uenc_http ""
483 if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/>" $udata umatches uencoding]} { 496 set uencoding ""
484 if {[string length $uencoding] > 3} { 497
485 set uencoding [string tolower $uencoding] 498 # Get information about specified character encodings
486 regsub -- "iso-" $uencoding "iso" uencoding 499 if {[info exists umeta(Content-Type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(Content-Type) umatches uenc_http]} {
487 set uconvert 1 500 # Found character set encoding information in HTTP headers
501 }
502
503 if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/>" $udata umatches uenc_doc]} {
504 # Found old style HTML meta tag with character set information
505 } elseif {[regexp -nocase -- "<meta.\*\?charset=\"(\[^\"\]*)\".\*\?/>" $udata umatches uenc_doc]} {
506 # Found HTML5 style meta tag with character set information
507 }
508
509 # Make sanitized versions of the encoding strings
510 set uenc_http2 [urllog_sanitize_encoding $uenc_http]
511 set uenc_doc2 [urllog_sanitize_encoding $uenc_doc]
512
513 # KLUDGE!
514 set uencoding $uenc_http2
515
516 # Check if the document has specified encoding
517 if {$uenc_doc != ""} {
518 # Does it differ from what HTTP says?
519 if {$uenc_http != "" && $uenc_doc != $uenc_http && $uenc_doc2 != $uenc_http2} {
520 # Yes, we will try reconverting
521 set uencoding $uenc_doc2
488 } 522 }
489 } 523 } elseif {$uenc_http == ""} {
490 if {$uconvert == 0} { 524 # If _NO_ known encoding of any kind, assume the default of iso8859-1
491 set uencoding "iso8859-1" 525 set uencoding "iso8859-1"
492 } 526 }
493 527
494 set umatches [regexp -nocase -inline -- "<title>(.\*\?)</title>" $udata] 528 # Get the document title, if any
495 if {[llength $umatches] > 0} { 529 set urlTitle ""
496 set urlTitle [lindex $umatches 1] 530 if {[regexp -nocase -- "<title>(.\*\?)</title>" $udata umatches urlTitle]} {
497 if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} { 531 # If character set conversion is required, do it now
498 urllog_log "Error in charset conversion: $cerrmsg" 532 if {$uencoding != ""} {
533 putlog "conversion requested from $uencoding"
534 if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} {
535 urllog_log "Error in charset conversion: $cerrmsg"
536 }
499 } 537 }
500 set urlTitle [urllog_convert_ent $urlTitle] 538
501 regsub -all "(^ *| *$)" $urlTitle "" urlTitle 539 # Convert some HTML entities to plaintext and do some cleanup
502 } else { 540 set utmp [urllog_convert_ent $urlTitle]
503 set urlTitle "" 541 regsub -all "\r|\n|\t" $utmp " " utmp
542 regsub -all " *" $utmp " " utmp
543 set urlTitle [string trim $utmp]
504 } 544 }
505 545
506 # Rasiatube hack 546 # Rasiatube hack
507 if {[string match "*/rasiatube/view*" $urlStr]} { 547 if {[string match "*/rasiatube/view*" $urlStr]} {
508 set rasia 0 548 set rasia 0
530 if {[urllog_exists $urlStr $urlNick $urlHost $urlChan]} { 570 if {[urllog_exists $urlStr $urlNick $urlHost $urlChan]} {
531 urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle 571 urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle
532 } 572 }
533 return 1 573 return 1
534 } else { 574 } else {
535 urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::code $utoken])" 575 urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ($ucode)"
536 urllog_log "[::http::code $utoken] - $urlStr" 576 urllog_log "$ucode - $urlStr"
537 } 577 }
538 578
539 ::http::cleanup $utoken 579 ::http::cleanup $utoken
540 } 580 }
541 581