changeset 255:4d4f3defe3ca

Merged.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 20 Jan 2015 00:02:35 +0200
parents 8d14e5d3eab0 (current diff) dd30f2eaabd3 (diff)
children f3906e583798
files urllog.tcl
diffstat 1 files changed, 108 insertions(+), 47 deletions(-) [+]
line wrap: on
line diff
--- a/urllog.tcl	Mon Jan 19 23:10:07 2015 +0200
+++ b/urllog.tcl	Tue Jan 20 00:02:35 2015 +0200
@@ -1,7 +1,7 @@
 ##########################################################################
 #
-# URLLog v2.3.0 by Matti 'ccr' Hamalainen <ccr@tnsp.org>
-# (C) Copyright 2000-2014 Tecnic Software productions (TNSP)
+# URLLog v2.4.0 by Matti 'ccr' Hamalainen <ccr@tnsp.org>
+# (C) Copyright 2000-2015 Tecnic Software productions (TNSP)
 #
 # This script is freely distributable under GNU GPL (version 2) license.
 #
@@ -152,7 +152,7 @@
 # No need to look below this line
 ##########################################################################
 set urllog_name "URLLog"
-set urllog_version "2.3.0"
+set urllog_version "2.4.0"
 
 set urllog_tlds [split $urllog_tlds ","]
 set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"] 
@@ -174,12 +174,12 @@
 ### Binding initializations
 bind pub - !urlfind urllog_pub_urlfind
 bind msg - !urlfind urllog_msg_urlfind
-bind pubm - *.* urllog_checkmsg
-bind topc - *.* urllog_checkmsg
+bind pubm - *.* urllog_check_line
+bind topc - *.* urllog_check_line
 
 
 ### Initialization messages
-set urllog_message "$urllog_name v$urllog_version (C) 2000-2014 ccr/TNSP"
+set urllog_message "$urllog_name v$urllog_version (C) 2000-2015 ccr/TNSP"
 putlog "$urllog_message"
 
 ### HTTP module initialization
@@ -373,11 +373,50 @@
 
 
 #-------------------------------------------------------------------------
-proc urllog_checkurl {urlStr urlNick urlHost urlChan} {
-  global urllog_tlds urllog_check urlmsg_nosuchhost urlmsg_ioerror
-  global urlmsg_timeout urlmsg_errorgettingdoc urllog_httprep
-  global urllog_shorturl_prefix urllog_shorturl urllog_encoding
-  global http_tls_support
+proc urllog_dorequest { urlNick urlChan urlStr urlStatus urlSCode urlCode urlData urlMeta } {
+  global urlmsg_ioerror urlmsg_timeout urlmsg_errorgettingdoc
+
+  upvar 1 $urlStatus ustatus
+  upvar 1 $urlSCode uscode
+  upvar 1 $urlCode ucode
+  upvar 1 $urlData udata
+  upvar 1 $urlMeta umeta
+
+  if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -headers {Accept-Encoding identity}]} uerrmsg]} {
+    urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
+    urllog_log "HTTP request failed: $uerrmsg"
+    return 0
+  }
+
+  set ustatus [::http::status $utoken]
+  if {$ustatus == "timeout"} {
+    urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
+    urllog_log "HTTP request timed out ($urlStr)"
+    return 0
+  }
+
+  if {$ustatus != "ok"} {
+    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])"
+    urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
+    return 0
+  }
+
+  set ustatus [::http::status $utoken]
+  set uscode [::http::code $utoken]
+  set ucode [::http::ncode $utoken]
+  set udata [::http::data $utoken]
+  array set umeta [::http::meta $utoken]
+  ::http::cleanup $utoken
+
+  return 1
+}
+
+#-------------------------------------------------------------------------
+proc urllog_validate_url { urlNick urlChan urlMStr urlMProto } {
+  global urllog_tlds urllog_check urlmsg_nosuchhost urllog_httprep
+  global urllog_shorturl_prefix urllog_shorturl
+  upvar 1 $urlMStr urlStr
+  upvar 1 $urlMProto urlProto
 
   ### Try to guess the URL protocol component (if it is missing)
   set u_checktld 1
@@ -388,7 +427,7 @@
   }
 
   ### Handle URLs that have an IPv4-address
-  if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match u_proto ni1 ni2 ni3 ni4]} {
+  if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match urlProto ni1 ni2 ni3 ni4]} {
     # Check if the IP is on local network
     if {$ni1 == 127 || $ni1 == 10 || ($ni1 == 192 && $ni2 == 168) || $ni1 == 0} {
       urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
@@ -401,17 +440,18 @@
   ### Check now if we have an ShortURL here ...
   if {[string match "$urllog_shorturl_prefix*" $urlStr]} {
     urllog_log "Ignoring ShortURL from $urlNick: $urlStr"
-    set uud ""
-    set usql "SELECT id AS uid, url AS uurl, user AS uuser, host AS uhost, chan AS uchan, title AS utitle FROM urls WHERE utime=$uud"
-    urldb eval $usql {
-      
-    }
+#    set uud ""
+#    set usql "SELECT id AS uid, url AS uurl, user AS uuser, host AS uhost, chan AS uchan, title AS utitle FROM urls WHERE utime=$uud"
+#    urldb eval $usql {
+#      urllog_verb_msg $urlNick $urlChan "'$utitle' - $uurl"
+#      return 1
+#    }
     return 0
   }
 
   ### Get URL protocol component
-  set u_proto ""
-  regexp "(\[a-z\]+)://" $urlStr u_match u_proto
+  set urlProto ""
+  regexp "(\[a-z\]+)://" $urlStr u_match urlProto
 
   ### Check the PORT (if the ":" is there)
   set u_record [split $urlStr "/"]
@@ -424,8 +464,8 @@
   }
 
   ### Is it a http or ftp url? (FIX ME!)
-  if {$u_proto != "http" && $u_proto != "https" && $u_proto != "ftp"} {
-    urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED protocol class ($u_proto)."
+  if {$urlProto != "http" && $urlProto != "https" && $urlProto != "ftp"} {
+    urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED protocol class ($urlProto)."
     return 0
   }
 
@@ -454,11 +494,24 @@
   }
 
   set urlStr [string map $urllog_httprep $urlStr]
+  return 1
+}
+
+
+#-------------------------------------------------------------------------
+proc urllog_check_url {urlStr urlNick urlHost urlChan} {
+  global urllog_encoding http_tls_support urlmsg_errorgettingdoc urllog_check
 
   ### Does the URL already exist?
   if {![urllog_exists $urlStr $urlNick $urlHost $urlChan]} {
     return 1
   }
+  
+  ### Validate URL compoments, etc.
+  set u_proto ""
+  if {![urllog_validate_url $urlNick $urlChan urlStr u_proto]} {
+    return 1
+  }
 
   ### Do we perform additional optional checks?
   if {$urllog_check == 0 || !(($http_tls_support != 0 && $u_proto == "https") || $u_proto == "http")} {
@@ -469,34 +522,42 @@
   }
 
   ### Does the document pointed by the URL exist?
-  if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -headers {Accept-Encoding identity}]} uerrmsg]} {
-    urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
-    urllog_log "HTTP request failed: $uerrmsg"
-    return 0
-  }
-
-  set ustatus [::http::status $utoken]
-  if {$ustatus == "timeout"} {
-    urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
-    urllog_log "HTTP request timed out ($urlStr)"
-    return 0
+  if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} {
+    return 1
   }
 
-  if {$ustatus != "ok"} {
-    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])"
-    urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
-    return 0
+  ### Handle redirects
+  if {$ucode >= 301 && $ucode <= 302} {
+    set nurlStr $umeta(Location)
+    urllog_log "Redirection: $urlStr -> $nurlStr"
+    set urlStr $nurlStr
+
+    if {![urllog_validate_url $urlNick $urlChan urlStr urlProto]} {
+      return 1
+    }
+
+    if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} {
+      return 1
+    }
   }
 
-  # Fixme! Handle redirects!
-  set ustatus [::http::status $utoken]
-  set uscode [::http::code $utoken]
-  set ucode [::http::ncode $utoken]
-  set udata [::http::data $utoken]
-  array set umeta [::http::meta $utoken]
-  ::http::cleanup $utoken
+  ### Handle 2nd level redirects
+  if {$ucode >= 301 && $ucode <= 302} {
+    set nurlStr $umeta(Location)
+    urllog_log "Redirection #2: $urlStr -> $nurlStr"
+    set urlStr $nurlStr
 
-  if {$ucode >= 200 && $ucode <= 309} {
+    if {![urllog_validate_url $urlNick $urlChan urlStr urlProto]} {
+      return 1
+    }
+
+    if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} {
+      return 1
+    }
+  }
+
+  # Final document
+  if {$ucode >= 200 && $ucode <= 205} {
     set uenc_doc ""
     set uenc_http ""
     set uencoding ""
@@ -588,12 +649,12 @@
 #-------------------------------------------------------------------------
 
 
-proc urllog_checkmsg {unick uhost uhand uchan utext} {
+proc urllog_check_line {unick uhost uhand uchan utext} {
   global urllog_log_channels
 
   ### Check the nick
   if {$unick == "*"} {
-    urllog_log "urllog_checkmsg: nick was wc, this should not happen."
+    urllog_log "urllog_check_line: Nick was wc, this should not happen."
     return 0
   }
 
@@ -603,7 +664,7 @@
       ### Do the URL checking
       foreach str [split $utext " "] {
         if {[regexp "((ftp|http|https)://\[^\[:space:\]\]+|^(www|ftp)\.\[^\[:space:\]\]+)" $str ulink]} {
-          urllog_checkurl $str $unick $uhost $uchan
+          urllog_check_url $str $unick $uhost $uchan
         }
       }
       return 0