view urllog.tcl @ 4:8c9049f2b2b0

Improve RasiaTube de-mangler.
author Matti Hamalainen <ccr@tnsp.org>
date Fri, 25 Mar 2011 00:40:41 +0200
parents 8003090caa35
children 50b52294e93e
line wrap: on
line source

##########################################################################
#
# URLLog v1.99.12 by ccr/TNSP <ccr@tnsp.org>
# (C) Copyright 2000-2011 Tecnic Software productions (TNSP)
#
##########################################################################
#
# NOTICE! If you are upgrading to v1.90+ from any older version, you
# might want to run a conversion script against your URL-database file.
#
# It is NOT strictly necessary, but recommended especially if you
# utilize the "shorturl" functionality. The conversion script is
# available at < http://tnsp.org/egg-tcls/ >
#
##########################################################################
###
### HTTP options
###
# Set to 1 if you want to use proxy
set http_proxy 0

# Proxy host and port number (only used if enabled above)
set http_proxy_host ""
set http_proxy_port 8080


###
### General options
###

# Filename where the logged URL data goes
set urllog_file "data.urllog"


# 1 = Verbose: Say messages when URL is OK, bad, etc.
# 0 = Quiet  : Be quiet (only speak if asked with !urlfind, etc)
set urllog_verbose 1


# 1 = Put some info to bot's Logfile during operation
# 0 = Don't.
set urllog_logmsg 1


# 1 = Check URLs for validity and existence before adding.
# 0 = No checks. Add _anything_ that looks like an URL to the database.
set urllog_check 1


###
### Search related settings
###

# 0 = No search-commands available
# 1 = Search enabled
set urllog_search 1


# How many URL's should the !urlfind command show (maximum limit)
set urllog_showmax_pub 3


# For private-search, this is the default limit (user can change it)
set urllog_showmax_priv 6


###
### ShortURL-settings
###

# 1 = Use ShortURLs
# 0 = Don't.
set urllog_shorturl 1

# Max length of original URL to be shown
set urllog_shorturl_orig 30

# Path to PHP/CGI-script that redirects ShortURLs
set urllog_shorturl_prefix "http://tnsp.org/u/"


###
### Message-texts
###

# No such host was found
set urlmsg_nosuchhost "ei tommosta oo!"

# Could not connect host (I/O errors etc)
set urlmsg_ioerror "kraak, virhe yhdynnässä."

# HTTP timeout
set urlmsg_timeout "ei jaksa ootella"

# No such document was found
set urlmsg_errorgettingdoc "siitosvirhe"

# URL was already known (was in database)
set urlmsg_alreadyknown "wanha!"
#set urlmsg_alreadyknown "Empiiristen havaintojen perusteella ja tällä sovellutusalueella esiintyneisiin aikaisempiin kontekstuaalisiin ilmaisuihin viitaten uskallan todeta, että sovellukseen ilmoittamasi tietoverkko-osoite oli kronologisti ajatellen varsin postpresentuaalisesti sopimaton ja ennestään hyvin tunnettu."

# No match was found when searched with !urlfind or other command
set urlmsg_nomatch "Ei osumia."


###
### Things that you usually don't need to touch ...
###

# What IRC "command" should we use to send messages:
# (Valid alternatives are "PRIVMSG" and "NOTICE")
set urllog_preferredmsg "PRIVMSG"

# The valid known Top Level Domains (TLDs), but not the country code TLDs
# (Now includes the new IANA published TLDs)
set urllog_tlds "org,com,net,mil,gov,biz,edu,coop,aero,info,museum,name,pro,int"


##########################################################################
# No need to look below this line
##########################################################################
#-------------------------------------------------------------------------
set urllog_name "URLLog"
set urllog_version "1.99.12"

set urllog_tlds [split $urllog_tlds ","]
set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"] 

set urllog_html_ent [split "&#x202a;||&#x202c;||&lrm;||&aring;|å|&Aring;|Å|&eacute;|é|&#58;|:|&#xe4;|ä|&#xf6;|ö|&#228;|ä|&#246;|ö|&nbsp;| |&#45;|-|&#8221;|\"|&#8220;|\"|&raquo;|>>|&quot;|\"|&auml;|ä|&ouml;|ö|&Auml;|Ä|&Ouml;|Ö|&amp;|&|&lt;|<|&gt;|>|ä|ä|ö|ö|Ä|Ä" "|"]


### HTTP module initialization
package require http
::http::config -useragent "$urllog_name/$urllog_version"
if {$http_proxy != 0} {
	::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
}


### Binding initializations
if {$urllog_search != 0} {
	bind pub - !urlfind urllog_pub_urlfind
	bind msg - urlfind urllog_msg_urlfind
}

bind pubm - *.* urllog_checkmsg
bind topc - *.* urllog_checkmsg
bind msg - paska urllog_checkmsg2


### Initialization messages
set urllog_message "$urllog_name v$urllog_version (C) 2000-2011 ccr/TNSP"
putlog "$urllog_message"

if {$http_proxy != 0} {
	putlog " (Using proxy $http_proxy_host:$http_proxy_port)"
}

if {$urllog_check != 0} {
	putlog " (Additional URL validity checks enabled)"
}

if {$urllog_verbose != 0} {
	putlog " (Verbose mode enabled)"
}

if {$urllog_search != 0} {
	putlog " (Search commands enabled)"
}


#-------------------------------------------------------------------------
### Utility functions
proc urllog_log {arg} {
	global urllog_logmsg urllog_name

	if {$urllog_logmsg != 0} {
		putlog "$urllog_name: $arg"
	}
}


proc urllog_ctime { utime } {

	if {$utime == "" || $utime == "*"} {
		set utime 0
	}

	return [clock format $utime -format "%d.%m.%Y %H:%M"]
}


proc urllog_isnumber {uarg} {
	set ufoo 1

	foreach i [split $uarg {}] {
		if {![string match \[0-9\] $i]} {set ufoo 0}
	}

	return $ufoo
}


proc urllog_msg {apublic anick achan amsg} {
	global urllog_preferredmsg

	if {$apublic == 1} {
		putserv "$urllog_preferredmsg $achan :$amsg"
	} else {
		putserv "$urllog_preferredmsg $anick :$amsg" 
	}
}


proc urllog_verb_msg {anick achan amsg} {
	global urllog_verbose

	if {$urllog_verbose != 0} {
		urllog_msg 1 $anick $achan $amsg
	}
}


proc urllog_convert_ent {udata} {
	global urllog_html_ent
	regsub -all "	" $udata " " utmp
	regsub -all "\r" $udata " " utmp
	regsub -all "\n" $utmp " " utmp
	regsub -all "  *" $utmp " " utmp
	regsub -all "\t" $utmp "" utmp
	return [string map -nocase $urllog_html_ent $utmp]
}


#-------------------------------------------------------------------------
proc urllog_get_short {utime} {
	global urllog_shorturl urllog_shorturl_prefix
	set ustr "ABCDEFGHIJKLNMOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
	set ulen [string length $ustr]

	set u1 [expr $utime / ($ulen * $ulen)]
	set utmp [expr $utime % ($ulen * $ulen)]
	set u2 [expr $utmp / $ulen]
	set u3 [expr $utmp % $ulen]

	return "\[ $urllog_shorturl_prefix[string index $ustr $u1][string index $ustr $u2][string index $ustr $u3] \]"
}


#-------------------------------------------------------------------------
proc urllog_chop_url {url} {
	global urllog_shorturl_orig
	if {[string length $url] > $urllog_shorturl_orig} {
		return "[string range $url 0 $urllog_shorturl_orig]..."
	} else {
		return $url
	}
}

#-------------------------------------------------------------------------
proc urllog_addurl {urlStr urlNick urlHost urlChan urlTitle} {
	global urlmsg_alreadyknown urllog_file urllog_shorturl

 	### Let's check if we already know the URL
	set fd [open $urllog_file a+]
	close $fd

	set fd [open $urllog_file r]
	set urlID -1
	while {![eof $fd]} {
		set qitems [split [gets $fd] " "]
		set qindex [lindex $qitems 4]
		if {$qindex != "" && $qindex > $urlID} {
			set urlID $qindex
		}
		if {[lindex $qitems 0] == $urlStr} {
			urllog_log "URL said by $urlNick ($urlStr) already known"
			if {$urllog_shorturl != 0} {
				set qstr "[urllog_get_short $urlID] "
			} else {
				set qstr ""
			}
			append qstr "([lindex $qitems 2]@[urllog_ctime [lindex $qitems 1]])"
			if {[string length $urlTitle] > 0} {
				set qstr "$urlmsg_alreadyknown - '$urlTitle' $qstr"
			} else {
				set qstr "$urlmsg_alreadyknown $qstr"
			}
			urllog_verb_msg $urlNick $urlChan $qstr
			return 0
		}
	}
	close $fd


	### OK, the URL was not already known - thus we add it
	incr urlID
	set urlTime [unixtime]
	set fd [open $urllog_file a+]
	puts $fd "$urlStr $urlTime $urlNick ($urlHost) $urlID"
	close $fd
	urllog_log "Added URL ($urlNick@$urlChan): $urlStr"


	### Let's say something, to confirm that everything went well.
	if {$urllog_shorturl != 0} {
		set qstr "[urllog_get_short $urlID] "
	} else {
		set qstr ""
	}
	if {[string length $urlTitle] > 0} {
		urllog_verb_msg $urlNick $urlChan "'$urlTitle' ([urllog_chop_url $urlStr]) $qstr"
	} else {
		urllog_verb_msg $urlNick $urlChan "[urllog_chop_url $urlStr] $qstr"
	}

	return 1
}


#-------------------------------------------------------------------------
proc urllog_http_handler {utoken utotal ucurr} {
	upvar #0 $utoken state

	# Stop fetching data after 3000 bytes, this should be enough to
	# contain the head section of a HTML page.
	if {$ucurr > 64000} {
		set state(status) "ok"
	}
}

#-------------------------------------------------------------------------
proc urllog_checkurl {urlStr urlNick urlHost urlChan} {
	global botnick urllog_html urllog_tlds urllog_check urllog_file
	global urlmsg_nosuchhost urlmsg_ioerror urlmsg_timeout urlmsg_errorgettingdoc
	global urllog_httprep urllog_shorturl_prefix urllog_shorturl urllog_encoding

	### Print status to bot's log
	urllog_log "$urlStr ($urlNick@$urlChan)"

	### Try to determine the URL protocol component (if it is missing)
	set u_checktld 1
	if {[string match "*www.*" $urlStr] && ![string match "http://*" $urlStr] && ![string match "https://*" $urlStr]} {
		set urlStr "http://$urlStr"
	} elseif {[string match "*ftp.*" $urlStr] && ![string match "ftp://*" $urlStr]} {
		set urlStr "ftp://$urlStr"
	}

	if {[regexp "(ftp|http|https)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match u_prefix ni1 ni2 ni3 ni4]} {
		# Check if the IP is on local network
		if {($ni1 == 127) || ($ni1 == 10) || ($ni1 == 192 && $ni2 == 168) || ($ni1 == 0)} {
			urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
			return 0
		}

		# Skip TLD check for URLs with IP address
		set u_checktld 0
	}

	if {$urllog_shorturl != 0 && [string match "*$urllog_shorturl_prefix*" $urlStr]} {
		urllog_log "Ignoring ShortURL."
		return 0
	}

	### Check the PORT (if the ":" is there)
	set u_record [split $urlStr "/"]
	set u_hostname [lindex $u_record 2]
	set u_port [lindex [split $u_hostname ":"] end]

	if {![urllog_isnumber $u_port] && $u_port != "" && $u_port != $u_hostname} {
		urllog_log "Broken URL from $urlNick: ($urlStr) illegal port $u_port"
		return 0
	}

	# Default to port 80 (HTTP)
	if {![urllog_isnumber $u_port]} {
		set u_port 80
	}

	### Is it a http or ftp url? (FIX ME!)
	if {[string range $urlStr 0 3] != "http" && [string range $urlStr 0 2] != "ftp"} {
		urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED TYPE (not HTTP or FTP)"
		return 0
	}

	### Check the Top Level Domain (TLD) validity
	if {$u_checktld != 0} {
		set u_sane [lindex [split $u_hostname "."] end]
		set u_tld [lindex [split $u_sane ":"] 0]
		set u_found 0

		if {[string length $u_tld] == 2} {
			# Assume all 2-letter domains to be valid :)
			set u_found 1
		} else {
			# Check our list of known TLDs
			foreach itld $urllog_tlds {
				if {[string match $itld $u_tld]} {
					set u_found 1
				}
			}
		}

		if {$u_found == 0} {
			urllog_log "Broken URL from $urlNick: ($urlStr) illegal TLD: $u_tld."
			return 0
		}
	}

	set urlStr [string map $urllog_httprep $urlStr]


	### Do we perform additional optional checks?
	if {$urllog_check == 0 || [string range $urlStr 0 6] != "http://"} {
		# No optional checks, just add the URL
		urllog_addurl $urlStr $urlNick $urlHost $urlChan ""
		return 1
	}
	
	### Does the document pointed by the URL exist?
	if {[catch {set utoken [::http::geturl $urlStr -progress urllog_http_handler -blocksize 1024 -timeout 3000]} uerrmsg]} {
		urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
		urllog_log "HTTP request failed: $uerrmsg"
		return 0
	}

	if {[::http::status $utoken] == "timeout"} {
		urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
		urllog_log "HTTP request timed out ($urlStr)"
		return 0
	}

	if {[::http::status $utoken] != "ok"} {
		urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])"
		urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
		return 0
	}

	# Fixme! Handle redirects!
	set ucode [::http::ncode $utoken]
	if {$ucode >= 200 && $ucode <= 309} {
		set udata [::http::data $utoken]
		set umatches [regexp -nocase -inline -- "<meta.\*\?content=\".\*\?charset=(\[^\"\]*)\"/>" $udata]
		set uconvert 0
		if {[llength $umatches] > 0} {
			set uencoding [lindex $umatches 1]
			if {[string length $uencoding] > 3} {
				set uconvert 1
			}
		}

		set umatches [regexp -nocase -inline -- "<title>(.\*\?)</title>" $udata]
		if {[llength $umatches] > 0} {
			set urlTitle [lindex $umatches 1]
			if {$uconvert != 0} {
				set urlTitle [encoding convertfrom $uencoding $urlTitle]
			}
			set urlTitle [urllog_convert_ent $urlTitle]
			regsub -all "(^ *| *$)" $urlTitle "" urlTitle
		} else {
			set urlTitle ""
		}

		# Rasiatube hack
		if {[string match "*/rasiatube/view*" $urlStr]} {
			set rasia 0
			set umatches [regexp -nocase -inline -- "<link rel=\"video_src\"\.\*\?file=(http://\[^&\]+)&" $udata]
			if {[llength $umatches] > 0} {
				set urlStr [lindex $umatches 1]
				regsub -all "\/v\/" $urlStr "\/watch\?v=" urlStr
				set rasia 1
			} else {
				set umatches [regexp -nocase -inline -- "SWFObject.\"(\[^\"\]+)\", *\"flashvideo" $udata]
				if {[llength $umatches] > 0} {
					set urlStr [lindex $umatches 1]
					regsub "http:\/\/www.dailymotion.com\/swf\/" $urlStr "http:\/\/www.dailymotion.com\/video\/" urlStr
					set rasia 1
				}
			}
			
			if {$rasia != 0} {
				urllog_log "RasiaTube mangler: $urlStr"
				urllog_verb_msg $urlNick $urlChan "Korjataan haiseva rasiatube-linkki: $urlStr"
			}
		}

		urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle
		return 1
	} else {
		urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::code $utoken])"
		urllog_log "[::http::code $utoken] - $urlStr"
	}

	::http::cleanup $utoken
}


#-------------------------------------------------------------------------
proc urllog_checkmsg {nick uhost hand chan text} {
	### Check the nick
	if {$nick == "*"} {
		urllog_log "urllog_checkmsg: nick was wc, this should not happen."
		return 0
	}

	### Do the URL checking
	foreach istr [split $text " "] {
		if {[string match "*http://*" $istr] || [string match "*ftp://*" $istr] ||
			[string match "*www.*" $istr] || [string match "*ftp.*" $istr] ||
			[regexp "(ftp|http|https)://\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}" $istr imatch]} {
			urllog_checkurl $istr $nick $uhost $chan
		}
	}

	return 0
}


#-------------------------------------------------------------------------
### Find from database according to patterns, up to imax results.
proc urllog_urlfind {ipatterns imax} {
	global urllog_file

	### Search the database for pattern
	### Clear the count, open the URL logfile
	set iresults {}
	set nresults 0
	set fd [open $urllog_file r]

	### Start searching...
	while {![eof $fd]} {

		# Get one URL for inspection
		gets $fd foo
		set irecord [split [string tolower $foo] " "]
		set iurl [lindex $irecord 0]
		set iname [lindex $irecord 2]

		# Match with all given patterns and rules
		set imatched 1
		foreach ipattern $ipatterns {
			set foob [split [string tolower $ipattern] " "]
			set ftoken [lindex $foob 0]
			set fparam [lindex $foob 1]
			set fmatch [string match $fparam $iurl]
			if {$ftoken == "+" && $fmatch == 0} { set imatched 0 }
			if {$ftoken == "-" && $fmatch == 1} { set imatched 0 }
			if {$ftoken == "%" && [string match $fparam $iname] == 0} { set imatched 0 }
		}

		# If the patterns matched, add to the results list
		if {$imatched == 1 && $foo != ""} {
			incr nresults
			lappend iresults $foo
		}
	}

	# Close file
	close $fd

	# Take only last imax results
	return [lrange $iresults [expr $nresults - $imax] $nresults]
}


#-------------------------------------------------------------------------
### Parse arguments, find and show the results
proc urllog_find {unick uhand uchan utext upublic} {
	global botnick urllog_name urllog_version urllog_shorturl
	global urllog_showmax_pub urllog_showmax_priv urlmsg_nomatch

	### Parse the given command
	urllog_log "$unick/$uhand searched URL: $utext"

	set footokens [split $utext " "]
	foreach ftoken $footokens {
		set foomark [string range $ftoken 0 0]
		set foopat  [string range $ftoken 1 end]

		if {$foomark == "-" || $foomark == "+" || $foomark == "%" || $foomark == "@"} {
			lappend ipatlist "$foomark *$foopat*"
		} else {
			lappend ipatlist "+ *$ftoken*"
		}
	}
	
	### Get the matches from database

	if {$upublic == 0} {
		set iresults [urllog_urlfind $ipatlist $urllog_showmax_priv]
	} else {
		set iresults [urllog_urlfind $ipatlist $urllog_showmax_pub]
	}

	### Show the results
	if {$iresults != ""} {
		set j 0
		foreach i $iresults {
			incr j
			set foo [split $i " "]
			set shortURL [lindex $foo 0]
			set shortID [lindex $foo 4]

			if {$urllog_shorturl != 0 && $shortID != ""} {
				set shortURL "$shortURL [urllog_get_short $shortID]"
			}

			urllog_msg $upublic $unick $uchan "#$j: $shortURL ([lindex $foo 2]@[urllog_ctime [lindex $foo 1]])"
		}

	} else {
		# If no URLs were found
		urllog_msg $upublic $unick $uchan $urlmsg_nomatch
	}

	return 0
}


#-------------------------------------------------------------------------
### Finding binded functions
proc urllog_pub_urlfind {unick uhost uhand uchan utext} {
	urllog_find $unick $uhand $uchan $utext 1
	return 0
}


proc urllog_msg_urlfind {unick uhost uhand utext} {
	urllog_find $unick $uhand "" $utext 0
	return 0
}


#-------------------------------------------------------------------------
proc urllog_checkmsg2 {unick uhost uhand utext} {
	urllog_checkurl $utext $unick $uhost "#CHANNEL"
	return 0
}


# end of script