# HG changeset patch # User Matti Hamalainen # Date 1421704955 -7200 # Node ID 4d4f3defe3cae789b3ce2b6c0c71c09868975701 # Parent 8d14e5d3eab07de95acb086e64a4b6c1ef2fb9e3# Parent dd30f2eaabd3d70a00aff6e7c66e39169d1cf544 Merged. diff -r 8d14e5d3eab0 -r 4d4f3defe3ca urllog.tcl --- a/urllog.tcl Mon Jan 19 23:10:07 2015 +0200 +++ b/urllog.tcl Tue Jan 20 00:02:35 2015 +0200 @@ -1,7 +1,7 @@ ########################################################################## # -# URLLog v2.3.0 by Matti 'ccr' Hamalainen -# (C) Copyright 2000-2014 Tecnic Software productions (TNSP) +# URLLog v2.4.0 by Matti 'ccr' Hamalainen +# (C) Copyright 2000-2015 Tecnic Software productions (TNSP) # # This script is freely distributable under GNU GPL (version 2) license. # @@ -152,7 +152,7 @@ # No need to look below this line ########################################################################## set urllog_name "URLLog" -set urllog_version "2.3.0" +set urllog_version "2.4.0" set urllog_tlds [split $urllog_tlds ","] set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"] @@ -174,12 +174,12 @@ ### Binding initializations bind pub - !urlfind urllog_pub_urlfind bind msg - !urlfind urllog_msg_urlfind -bind pubm - *.* urllog_checkmsg -bind topc - *.* urllog_checkmsg +bind pubm - *.* urllog_check_line +bind topc - *.* urllog_check_line ### Initialization messages -set urllog_message "$urllog_name v$urllog_version (C) 2000-2014 ccr/TNSP" +set urllog_message "$urllog_name v$urllog_version (C) 2000-2015 ccr/TNSP" putlog "$urllog_message" ### HTTP module initialization @@ -373,11 +373,50 @@ #------------------------------------------------------------------------- -proc urllog_checkurl {urlStr urlNick urlHost urlChan} { - global urllog_tlds urllog_check urlmsg_nosuchhost urlmsg_ioerror - global urlmsg_timeout urlmsg_errorgettingdoc urllog_httprep - global urllog_shorturl_prefix urllog_shorturl urllog_encoding - global http_tls_support +proc urllog_dorequest { urlNick urlChan urlStr urlStatus urlSCode urlCode urlData urlMeta } { + global urlmsg_ioerror urlmsg_timeout urlmsg_errorgettingdoc + + upvar 1 $urlStatus ustatus + upvar 1 $urlSCode uscode + upvar 1 $urlCode ucode + upvar 1 $urlData udata + upvar 1 $urlMeta umeta + + if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -headers {Accept-Encoding identity}]} uerrmsg]} { + urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)" + urllog_log "HTTP request failed: $uerrmsg" + return 0 + } + + set ustatus [::http::status $utoken] + if {$ustatus == "timeout"} { + urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout" + urllog_log "HTTP request timed out ($urlStr)" + return 0 + } + + if {$ustatus != "ok"} { + urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])" + urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)" + return 0 + } + + set ustatus [::http::status $utoken] + set uscode [::http::code $utoken] + set ucode [::http::ncode $utoken] + set udata [::http::data $utoken] + array set umeta [::http::meta $utoken] + ::http::cleanup $utoken + + return 1 +} + +#------------------------------------------------------------------------- +proc urllog_validate_url { urlNick urlChan urlMStr urlMProto } { + global urllog_tlds urllog_check urlmsg_nosuchhost urllog_httprep + global urllog_shorturl_prefix urllog_shorturl + upvar 1 $urlMStr urlStr + upvar 1 $urlMProto urlProto ### Try to guess the URL protocol component (if it is missing) set u_checktld 1 @@ -388,7 +427,7 @@ } ### Handle URLs that have an IPv4-address - if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match u_proto ni1 ni2 ni3 ni4]} { + if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match urlProto ni1 ni2 ni3 ni4]} { # Check if the IP is on local network if {$ni1 == 127 || $ni1 == 10 || ($ni1 == 192 && $ni2 == 168) || $ni1 == 0} { urllog_log "URL pointing to local or invalid network, ignored ($urlStr)." @@ -401,17 +440,18 @@ ### Check now if we have an ShortURL here ... if {[string match "$urllog_shorturl_prefix*" $urlStr]} { urllog_log "Ignoring ShortURL from $urlNick: $urlStr" - set uud "" - set usql "SELECT id AS uid, url AS uurl, user AS uuser, host AS uhost, chan AS uchan, title AS utitle FROM urls WHERE utime=$uud" - urldb eval $usql { - - } +# set uud "" +# set usql "SELECT id AS uid, url AS uurl, user AS uuser, host AS uhost, chan AS uchan, title AS utitle FROM urls WHERE utime=$uud" +# urldb eval $usql { +# urllog_verb_msg $urlNick $urlChan "'$utitle' - $uurl" +# return 1 +# } return 0 } ### Get URL protocol component - set u_proto "" - regexp "(\[a-z\]+)://" $urlStr u_match u_proto + set urlProto "" + regexp "(\[a-z\]+)://" $urlStr u_match urlProto ### Check the PORT (if the ":" is there) set u_record [split $urlStr "/"] @@ -424,8 +464,8 @@ } ### Is it a http or ftp url? (FIX ME!) - if {$u_proto != "http" && $u_proto != "https" && $u_proto != "ftp"} { - urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED protocol class ($u_proto)." + if {$urlProto != "http" && $urlProto != "https" && $urlProto != "ftp"} { + urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED protocol class ($urlProto)." return 0 } @@ -454,11 +494,24 @@ } set urlStr [string map $urllog_httprep $urlStr] + return 1 +} + + +#------------------------------------------------------------------------- +proc urllog_check_url {urlStr urlNick urlHost urlChan} { + global urllog_encoding http_tls_support urlmsg_errorgettingdoc urllog_check ### Does the URL already exist? if {![urllog_exists $urlStr $urlNick $urlHost $urlChan]} { return 1 } + + ### Validate URL compoments, etc. + set u_proto "" + if {![urllog_validate_url $urlNick $urlChan urlStr u_proto]} { + return 1 + } ### Do we perform additional optional checks? if {$urllog_check == 0 || !(($http_tls_support != 0 && $u_proto == "https") || $u_proto == "http")} { @@ -469,34 +522,42 @@ } ### Does the document pointed by the URL exist? - if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -headers {Accept-Encoding identity}]} uerrmsg]} { - urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)" - urllog_log "HTTP request failed: $uerrmsg" - return 0 - } - - set ustatus [::http::status $utoken] - if {$ustatus == "timeout"} { - urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout" - urllog_log "HTTP request timed out ($urlStr)" - return 0 + if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} { + return 1 } - if {$ustatus != "ok"} { - urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])" - urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)" - return 0 + ### Handle redirects + if {$ucode >= 301 && $ucode <= 302} { + set nurlStr $umeta(Location) + urllog_log "Redirection: $urlStr -> $nurlStr" + set urlStr $nurlStr + + if {![urllog_validate_url $urlNick $urlChan urlStr urlProto]} { + return 1 + } + + if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} { + return 1 + } } - # Fixme! Handle redirects! - set ustatus [::http::status $utoken] - set uscode [::http::code $utoken] - set ucode [::http::ncode $utoken] - set udata [::http::data $utoken] - array set umeta [::http::meta $utoken] - ::http::cleanup $utoken + ### Handle 2nd level redirects + if {$ucode >= 301 && $ucode <= 302} { + set nurlStr $umeta(Location) + urllog_log "Redirection #2: $urlStr -> $nurlStr" + set urlStr $nurlStr - if {$ucode >= 200 && $ucode <= 309} { + if {![urllog_validate_url $urlNick $urlChan urlStr urlProto]} { + return 1 + } + + if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} { + return 1 + } + } + + # Final document + if {$ucode >= 200 && $ucode <= 205} { set uenc_doc "" set uenc_http "" set uencoding "" @@ -588,12 +649,12 @@ #------------------------------------------------------------------------- -proc urllog_checkmsg {unick uhost uhand uchan utext} { +proc urllog_check_line {unick uhost uhand uchan utext} { global urllog_log_channels ### Check the nick if {$unick == "*"} { - urllog_log "urllog_checkmsg: nick was wc, this should not happen." + urllog_log "urllog_check_line: Nick was wc, this should not happen." return 0 } @@ -603,7 +664,7 @@ ### Do the URL checking foreach str [split $utext " "] { if {[regexp "((ftp|http|https)://\[^\[:space:\]\]+|^(www|ftp)\.\[^\[:space:\]\]+)" $str ulink]} { - urllog_checkurl $str $unick $uhost $uchan + urllog_check_url $str $unick $uhost $uchan } } return 0