changeset 251:e59f0c3ea0f4

urllog: Handle first and second level redirects.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 20 Jan 2015 00:01:02 +0200
parents e706f1cdebb4
children eb2fce89b8ab
files urllog.tcl
diffstat 1 files changed, 94 insertions(+), 39 deletions(-) [+]
line wrap: on
line diff
--- a/urllog.tcl	Tue Jan 20 00:00:01 2015 +0200
+++ b/urllog.tcl	Tue Jan 20 00:01:02 2015 +0200
@@ -373,11 +373,50 @@
 
 
 #-------------------------------------------------------------------------
-proc urllog_checkurl {urlStr urlNick urlHost urlChan} {
-  global urllog_tlds urllog_check urlmsg_nosuchhost urlmsg_ioerror
-  global urlmsg_timeout urlmsg_errorgettingdoc urllog_httprep
-  global urllog_shorturl_prefix urllog_shorturl urllog_encoding
-  global http_tls_support
+proc urllog_dorequest { urlNick urlChan urlStr urlStatus urlSCode urlCode urlData urlMeta } {
+  global urlmsg_ioerror urlmsg_timeout urlmsg_errorgettingdoc
+
+  upvar 1 $urlStatus ustatus
+  upvar 1 $urlSCode uscode
+  upvar 1 $urlCode ucode
+  upvar 1 $urlData udata
+  upvar 1 $urlMeta umeta
+
+  if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -headers {Accept-Encoding identity}]} uerrmsg]} {
+    urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
+    urllog_log "HTTP request failed: $uerrmsg"
+    return 0
+  }
+
+  set ustatus [::http::status $utoken]
+  if {$ustatus == "timeout"} {
+    urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
+    urllog_log "HTTP request timed out ($urlStr)"
+    return 0
+  }
+
+  if {$ustatus != "ok"} {
+    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])"
+    urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
+    return 0
+  }
+
+  set ustatus [::http::status $utoken]
+  set uscode [::http::code $utoken]
+  set ucode [::http::ncode $utoken]
+  set udata [::http::data $utoken]
+  array set umeta [::http::meta $utoken]
+  ::http::cleanup $utoken
+
+  return 1
+}
+
+#-------------------------------------------------------------------------
+proc urllog_validate_url { urlNick urlChan urlMStr urlMProto } {
+  global urllog_tlds urllog_check urlmsg_nosuchhost urllog_httprep
+  global urllog_shorturl_prefix urllog_shorturl
+  upvar 1 $urlMStr urlStr
+  upvar 1 $urlMProto urlProto
 
   ### Try to guess the URL protocol component (if it is missing)
   set u_checktld 1
@@ -388,7 +427,7 @@
   }
 
   ### Handle URLs that have an IPv4-address
-  if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match u_proto ni1 ni2 ni3 ni4]} {
+  if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match urlProto ni1 ni2 ni3 ni4]} {
     # Check if the IP is on local network
     if {$ni1 == 127 || $ni1 == 10 || ($ni1 == 192 && $ni2 == 168) || $ni1 == 0} {
       urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
@@ -410,8 +449,8 @@
   }
 
   ### Get URL protocol component
-  set u_proto ""
-  regexp "(\[a-z\]+)://" $urlStr u_match u_proto
+  set urlProto ""
+  regexp "(\[a-z\]+)://" $urlStr u_match urlProto
 
   ### Check the PORT (if the ":" is there)
   set u_record [split $urlStr "/"]
@@ -423,14 +462,9 @@
     return 0
   }
 
-  # Default to port 80 (HTTP)
-  if {![urllog_isnumber $u_port]} {
-    set u_port 80
-  }
-
   ### Is it a http or ftp url? (FIX ME!)
-  if {$u_proto != "http" && $u_proto != "https" && $u_proto != "ftp"} {
-    urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED protocol class ($u_proto)."
+  if {$urlProto != "http" && $urlProto != "https" && $urlProto != "ftp"} {
+    urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED protocol class ($urlProto)."
     return 0
   }
 
@@ -459,11 +493,24 @@
   }
 
   set urlStr [string map $urllog_httprep $urlStr]
+  return 1
+}
+
+
+#-------------------------------------------------------------------------
+proc urllog_check_url {urlStr urlNick urlHost urlChan} {
+  global urllog_encoding http_tls_support urlmsg_errorgettingdoc urllog_check
 
   ### Does the URL already exist?
   if {![urllog_exists $urlStr $urlNick $urlHost $urlChan]} {
     return 1
   }
+  
+  ### Validate URL compoments, etc.
+  set u_proto ""
+  if {![urllog_validate_url $urlNick $urlChan urlStr u_proto]} {
+    return 1
+  }
 
   ### Do we perform additional optional checks?
   if {$urllog_check == 0 || !(($http_tls_support != 0 && $u_proto == "https") || $u_proto == "http")} {
@@ -474,34 +521,42 @@
   }
 
   ### Does the document pointed by the URL exist?
-  if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -headers {Accept-Encoding identity}]} uerrmsg]} {
-    urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
-    urllog_log "HTTP request failed: $uerrmsg"
-    return 0
-  }
-
-  set ustatus [::http::status $utoken]
-  if {$ustatus == "timeout"} {
-    urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
-    urllog_log "HTTP request timed out ($urlStr)"
-    return 0
+  if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} {
+    return 1
   }
 
-  if {$ustatus != "ok"} {
-    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])"
-    urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
-    return 0
+  ### Handle redirects
+  if {$ucode >= 301 && $ucode <= 302} {
+    set nurlStr $umeta(Location)
+    urllog_log "Redirection: $urlStr -> $nurlStr"
+    set urlStr $nurlStr
+
+    if {![urllog_validate_url $urlNick $urlChan urlStr urlProto]} {
+      return 1
+    }
+
+    if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} {
+      return 1
+    }
   }
 
-  # Fixme! Handle redirects!
-  set ustatus [::http::status $utoken]
-  set uscode [::http::code $utoken]
-  set ucode [::http::ncode $utoken]
-  set udata [::http::data $utoken]
-  array set umeta [::http::meta $utoken]
-  ::http::cleanup $utoken
+  ### Handle 2nd level redirects
+  if {$ucode >= 301 && $ucode <= 302} {
+    set nurlStr $umeta(Location)
+    urllog_log "Redirection #2: $urlStr -> $nurlStr"
+    set urlStr $nurlStr
 
-  if {$ucode >= 200 && $ucode <= 309} {
+    if {![urllog_validate_url $urlNick $urlChan urlStr urlProto]} {
+      return 1
+    }
+
+    if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} {
+      return 1
+    }
+  }
+
+  # Final document
+  if {$ucode >= 200 && $ucode <= 205} {
     set uenc_doc ""
     set uenc_http ""
     set uencoding ""
@@ -608,7 +663,7 @@
       ### Do the URL checking
       foreach str [split $utext " "] {
         if {[regexp "((ftp|http|https)://\[^\[:space:\]\]+|^(www|ftp)\.\[^\[:space:\]\]+)" $str ulink]} {
-          urllog_checkurl $str $unick $uhost $uchan
+          urllog_check_url $str $unick $uhost $uchan
         }
       }
       return 0