# HG changeset patch # User Matti Hamalainen # Date 1601207479 -10800 # Node ID d4d2fda123081949615470aed39f9cd9e99db06b # Parent 0e3ee1f51c80402c7fc5efe2d4b4866cc36438f3 urllog: Improve URL parsing/validation and protocol guessing. diff -r 0e3ee1f51c80 -r d4d2fda12308 urllog.tcl --- a/urllog.tcl Sun Sep 20 18:47:16 2020 +0300 +++ b/urllog.tcl Sun Sep 27 14:51:19 2020 +0300 @@ -241,14 +241,21 @@ upvar 1 $urlMProto urlProto upvar 1 $urlMHostName urlHostName - ### Try to guess the URL protocol component (if it is missing) set u_checktld 1 - if {![string match "http://*" $urlStr] && ![string match "https://*" $urlStr] && ![string match "ftp://*" $urlStr] && ![string match "*://*" $urlStr]} { - if {[string match "*www.*" $urlStr]} { - set urlStr "http://$urlStr" - } elseif {[string match "*ftp.*" $urlStr]} { - set urlStr "ftp://$urlStr" - } + + ### Hack for removing parenthesis around an URL + if {[regexp {^\((.+)\)$} $urlStr urlMatch urlClean]} { + set urlStr $urlClean + } + + ### Clean excess stuff, if any, and attempt to + ### guess the URL protocol component if it is missing + if {[regexp "(\[a-z\]+)://\[^ \]+" $urlStr urlMatch urlProto]} { + set urlStr $urlMatch + } elseif {[regexp "www\.\[^ \]+" $urlStr urlMatch]} { + set urlStr "http://$urlMatch" + } elseif {[regexp "ftp\.\[^ \]+" $urlStr urlMatch]} { + set urlStr "ftp://$urlMatch" } ### Handle URLs that have an IPv4-address @@ -258,6 +265,10 @@ urllog_log "URL pointing to local or invalid network, ignored ($urlStr)." return 0 } + if {$ni1 >= 255 || $ni2 >= 255 || $ni3 >= 255 || $ni4 >= 255} { + urllog_log "URL pointing to invalid network, ignored ($urlStr)." + return 0 + } # Skip TLD check for URLs with IP address set u_checktld 0 }