# HG changeset patch # User Matti Hamalainen # Date 1421704862 -7200 # Node ID e59f0c3ea0f4b0816d7b18291ee5414076a21e58 # Parent e706f1cdebb4da533c0cdfbcf5413268a7c06032 urllog: Handle first and second level redirects. diff -r e706f1cdebb4 -r e59f0c3ea0f4 urllog.tcl --- a/urllog.tcl Tue Jan 20 00:00:01 2015 +0200 +++ b/urllog.tcl Tue Jan 20 00:01:02 2015 +0200 @@ -373,11 +373,50 @@ #------------------------------------------------------------------------- -proc urllog_checkurl {urlStr urlNick urlHost urlChan} { - global urllog_tlds urllog_check urlmsg_nosuchhost urlmsg_ioerror - global urlmsg_timeout urlmsg_errorgettingdoc urllog_httprep - global urllog_shorturl_prefix urllog_shorturl urllog_encoding - global http_tls_support +proc urllog_dorequest { urlNick urlChan urlStr urlStatus urlSCode urlCode urlData urlMeta } { + global urlmsg_ioerror urlmsg_timeout urlmsg_errorgettingdoc + + upvar 1 $urlStatus ustatus + upvar 1 $urlSCode uscode + upvar 1 $urlCode ucode + upvar 1 $urlData udata + upvar 1 $urlMeta umeta + + if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -headers {Accept-Encoding identity}]} uerrmsg]} { + urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)" + urllog_log "HTTP request failed: $uerrmsg" + return 0 + } + + set ustatus [::http::status $utoken] + if {$ustatus == "timeout"} { + urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout" + urllog_log "HTTP request timed out ($urlStr)" + return 0 + } + + if {$ustatus != "ok"} { + urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])" + urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)" + return 0 + } + + set ustatus [::http::status $utoken] + set uscode [::http::code $utoken] + set ucode [::http::ncode $utoken] + set udata [::http::data $utoken] + array set umeta [::http::meta $utoken] + ::http::cleanup $utoken + + return 1 +} + +#------------------------------------------------------------------------- +proc urllog_validate_url { urlNick urlChan urlMStr urlMProto } { + global urllog_tlds urllog_check urlmsg_nosuchhost urllog_httprep + global urllog_shorturl_prefix urllog_shorturl + upvar 1 $urlMStr urlStr + upvar 1 $urlMProto urlProto ### Try to guess the URL protocol component (if it is missing) set u_checktld 1 @@ -388,7 +427,7 @@ } ### Handle URLs that have an IPv4-address - if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match u_proto ni1 ni2 ni3 ni4]} { + if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match urlProto ni1 ni2 ni3 ni4]} { # Check if the IP is on local network if {$ni1 == 127 || $ni1 == 10 || ($ni1 == 192 && $ni2 == 168) || $ni1 == 0} { urllog_log "URL pointing to local or invalid network, ignored ($urlStr)." @@ -410,8 +449,8 @@ } ### Get URL protocol component - set u_proto "" - regexp "(\[a-z\]+)://" $urlStr u_match u_proto + set urlProto "" + regexp "(\[a-z\]+)://" $urlStr u_match urlProto ### Check the PORT (if the ":" is there) set u_record [split $urlStr "/"] @@ -423,14 +462,9 @@ return 0 } - # Default to port 80 (HTTP) - if {![urllog_isnumber $u_port]} { - set u_port 80 - } - ### Is it a http or ftp url? (FIX ME!) - if {$u_proto != "http" && $u_proto != "https" && $u_proto != "ftp"} { - urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED protocol class ($u_proto)." + if {$urlProto != "http" && $urlProto != "https" && $urlProto != "ftp"} { + urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED protocol class ($urlProto)." return 0 } @@ -459,11 +493,24 @@ } set urlStr [string map $urllog_httprep $urlStr] + return 1 +} + + +#------------------------------------------------------------------------- +proc urllog_check_url {urlStr urlNick urlHost urlChan} { + global urllog_encoding http_tls_support urlmsg_errorgettingdoc urllog_check ### Does the URL already exist? if {![urllog_exists $urlStr $urlNick $urlHost $urlChan]} { return 1 } + + ### Validate URL compoments, etc. + set u_proto "" + if {![urllog_validate_url $urlNick $urlChan urlStr u_proto]} { + return 1 + } ### Do we perform additional optional checks? if {$urllog_check == 0 || !(($http_tls_support != 0 && $u_proto == "https") || $u_proto == "http")} { @@ -474,34 +521,42 @@ } ### Does the document pointed by the URL exist? - if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -headers {Accept-Encoding identity}]} uerrmsg]} { - urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)" - urllog_log "HTTP request failed: $uerrmsg" - return 0 - } - - set ustatus [::http::status $utoken] - if {$ustatus == "timeout"} { - urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout" - urllog_log "HTTP request timed out ($urlStr)" - return 0 + if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} { + return 1 } - if {$ustatus != "ok"} { - urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])" - urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)" - return 0 + ### Handle redirects + if {$ucode >= 301 && $ucode <= 302} { + set nurlStr $umeta(Location) + urllog_log "Redirection: $urlStr -> $nurlStr" + set urlStr $nurlStr + + if {![urllog_validate_url $urlNick $urlChan urlStr urlProto]} { + return 1 + } + + if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} { + return 1 + } } - # Fixme! Handle redirects! - set ustatus [::http::status $utoken] - set uscode [::http::code $utoken] - set ucode [::http::ncode $utoken] - set udata [::http::data $utoken] - array set umeta [::http::meta $utoken] - ::http::cleanup $utoken + ### Handle 2nd level redirects + if {$ucode >= 301 && $ucode <= 302} { + set nurlStr $umeta(Location) + urllog_log "Redirection #2: $urlStr -> $nurlStr" + set urlStr $nurlStr - if {$ucode >= 200 && $ucode <= 309} { + if {![urllog_validate_url $urlNick $urlChan urlStr urlProto]} { + return 1 + } + + if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} { + return 1 + } + } + + # Final document + if {$ucode >= 200 && $ucode <= 205} { set uenc_doc "" set uenc_http "" set uencoding "" @@ -608,7 +663,7 @@ ### Do the URL checking foreach str [split $utext " "] { if {[regexp "((ftp|http|https)://\[^\[:space:\]\]+|^(www|ftp)\.\[^\[:space:\]\]+)" $str ulink]} { - urllog_checkurl $str $unick $uhost $uchan + urllog_check_url $str $unick $uhost $uchan } } return 0