changeset 571:d4d2fda12308

urllog: Improve URL parsing/validation and protocol guessing.
author Matti Hamalainen <ccr@tnsp.org>
date Sun, 27 Sep 2020 14:51:19 +0300
parents 0e3ee1f51c80
children 295c225e3152
files urllog.tcl
diffstat 1 files changed, 18 insertions(+), 7 deletions(-) [+]
line wrap: on
line diff
--- a/urllog.tcl	Sun Sep 20 18:47:16 2020 +0300
+++ b/urllog.tcl	Sun Sep 27 14:51:19 2020 +0300
@@ -241,14 +241,21 @@
   upvar 1 $urlMProto urlProto
   upvar 1 $urlMHostName urlHostName
 
-  ### Try to guess the URL protocol component (if it is missing)
   set u_checktld 1
-  if {![string match "http://*" $urlStr] && ![string match "https://*" $urlStr] && ![string match "ftp://*" $urlStr] && ![string match "*://*" $urlStr]} {
-    if {[string match "*www.*" $urlStr]} {
-      set urlStr "http://$urlStr"
-    } elseif {[string match "*ftp.*" $urlStr]} {
-      set urlStr "ftp://$urlStr"
-    }
+
+  ### Hack for removing parenthesis around an URL
+  if {[regexp {^\((.+)\)$} $urlStr urlMatch urlClean]} {
+    set urlStr $urlClean
+  }
+
+  ### Clean excess stuff, if any, and attempt to
+  ### guess the URL protocol component if it is missing
+  if {[regexp "(\[a-z\]+)://\[^ \]+" $urlStr urlMatch urlProto]} {
+    set urlStr $urlMatch
+  } elseif {[regexp "www\.\[^ \]+" $urlStr urlMatch]} {
+    set urlStr "http://$urlMatch"
+  } elseif {[regexp "ftp\.\[^ \]+" $urlStr urlMatch]} {
+    set urlStr "ftp://$urlMatch"
   }
 
   ### Handle URLs that have an IPv4-address
@@ -258,6 +265,10 @@
       urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
       return 0
     }
+    if {$ni1 >= 255 || $ni2 >= 255 || $ni3 >= 255 || $ni4 >= 255} {
+      urllog_log "URL pointing to invalid network, ignored ($urlStr)."
+      return 0
+    }
     # Skip TLD check for URLs with IP address
     set u_checktld 0
   }