comparison urllog.tcl @ 571:d4d2fda12308

urllog: Improve URL parsing/validation and protocol guessing.
author Matti Hamalainen <ccr@tnsp.org>
date Sun, 27 Sep 2020 14:51:19 +0300
parents 0e3ee1f51c80
children 295c225e3152
comparison
equal deleted inserted replaced
570:0e3ee1f51c80 571:d4d2fda12308
239 global urllog_shorturl_prefix urllog_shorturl urllog_check_tld 239 global urllog_shorturl_prefix urllog_shorturl urllog_check_tld
240 upvar 1 $urlMStr urlStr 240 upvar 1 $urlMStr urlStr
241 upvar 1 $urlMProto urlProto 241 upvar 1 $urlMProto urlProto
242 upvar 1 $urlMHostName urlHostName 242 upvar 1 $urlMHostName urlHostName
243 243
244 ### Try to guess the URL protocol component (if it is missing)
245 set u_checktld 1 244 set u_checktld 1
246 if {![string match "http://*" $urlStr] && ![string match "https://*" $urlStr] && ![string match "ftp://*" $urlStr] && ![string match "*://*" $urlStr]} { 245
247 if {[string match "*www.*" $urlStr]} { 246 ### Hack for removing parenthesis around an URL
248 set urlStr "http://$urlStr" 247 if {[regexp {^\((.+)\)$} $urlStr urlMatch urlClean]} {
249 } elseif {[string match "*ftp.*" $urlStr]} { 248 set urlStr $urlClean
250 set urlStr "ftp://$urlStr" 249 }
251 } 250
251 ### Clean excess stuff, if any, and attempt to
252 ### guess the URL protocol component if it is missing
253 if {[regexp "(\[a-z\]+)://\[^ \]+" $urlStr urlMatch urlProto]} {
254 set urlStr $urlMatch
255 } elseif {[regexp "www\.\[^ \]+" $urlStr urlMatch]} {
256 set urlStr "http://$urlMatch"
257 } elseif {[regexp "ftp\.\[^ \]+" $urlStr urlMatch]} {
258 set urlStr "ftp://$urlMatch"
252 } 259 }
253 260
254 ### Handle URLs that have an IPv4-address 261 ### Handle URLs that have an IPv4-address
255 if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr urlMatch urlProto ni1 ni2 ni3 ni4]} { 262 if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr urlMatch urlProto ni1 ni2 ni3 ni4]} {
256 # Check if the IP is on local network 263 # Check if the IP is on local network
257 if {$ni1 == 127 || $ni1 == 10 || ($ni1 == 192 && $ni2 == 168) || $ni1 == 0} { 264 if {$ni1 == 127 || $ni1 == 10 || ($ni1 == 192 && $ni2 == 168) || $ni1 == 0} {
258 urllog_log "URL pointing to local or invalid network, ignored ($urlStr)." 265 urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
266 return 0
267 }
268 if {$ni1 >= 255 || $ni2 >= 255 || $ni3 >= 255 || $ni4 >= 255} {
269 urllog_log "URL pointing to invalid network, ignored ($urlStr)."
259 return 0 270 return 0
260 } 271 }
261 # Skip TLD check for URLs with IP address 272 # Skip TLD check for URLs with IP address
262 set u_checktld 0 273 set u_checktld 0
263 } 274 }