diff urllog.tcl @ 0:1c4e2814cd41

Initial import.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 21 Sep 2010 13:12:49 +0300
parents
children 8003090caa35
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/urllog.tcl	Tue Sep 21 13:12:49 2010 +0300
@@ -0,0 +1,711 @@
+##########################################################################
+#
+# URLLog v1.99.12 by ccr/TNSP <ccr@tnsp.org>
+# (C) Copyright 2000-2010 Tecnic Software productions (TNSP)
+#
+##########################################################################
+#
+# NOTICE! If you are upgrading to v1.90+ from any older version, you
+# might want to run a conversion script against your URL-database file.
+#
+# It is NOT strictly necessary, but recommended especially if you
+# utilize the "shorturl" functionality. The conversion script is
+# available at < http://tnsp.org/egg-tcls/ >
+#
+##########################################################################
+###
+### HTTP options
+###
+# Set to 1 if you want to use a proxy
+set http_proxy 0
+
+# Proxy host and port number (only used if enabled above)
+set http_proxy_host ""
+set http_proxy_port 8080
+
+
+###
+### General options
+###
+
+# Filename where the logged URL data goes
+set urllog_file "data.urllog"
+
+
+# 1 = Verbose: Say messages when URL is OK, bad, etc.
+# 0 = Quiet  : Be quiet (only speak if asked with !urlfind, etc)
+set urllog_verbose 1
+
+
+# 1 = Write some info to the bot's logfile during operation
+# 0 = Don't.
+set urllog_logmsg 1
+
+
+# 1 = Check URLs for validity and existence before adding.
+# 0 = No checks. Add _anything_ that looks like a URL to the database.
+set urllog_check 1
+
+
+###
+### WWW page creation options
+###
+# 1 = Create the www pages, 0 = Don't.
+set urllog_makewww 0
+
+# TIP: If you don't want the bot to create the HTML file itself, you can
+# generate the page with a simple Perl/Ruby/Python/PHP/etc. script instead.
+
+# Filename AND FULL PATH of the html-file where the www-format log goes
+# (Remember to set the permissions right after the file has been created)
+set urllog_webfile "/home/niinuska/public_html/urllog.html"
+
+
+###
+### Search related settings
+###
+
+# 0 = No search-commands available
+# 1 = Search enabled
+set urllog_search 1
+
+
+# Maximum number of URLs the !urlfind command will show
+set urllog_showmax_pub 3
+
+
+# For private-search, this is the default limit (user can change it)
+set urllog_showmax_priv 6
+
+
+###
+### ShortURL-settings
+###
+
+# 1 = Use ShortURLs
+# 0 = Don't.
+set urllog_shorturl 1
+
+# Max length of original URL to be shown
+set urllog_shorturl_orig 30
+
+# URL prefix of the PHP/CGI script that redirects ShortURLs
+set urllog_shorturl_prefix "http://tnsp.org/u/"
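+
+# With the settings above, a newly added URL is announced roughly as
+#   'Page title' (http://www.example.com/some/lo...) [ http://tnsp.org/u/ADO ]
+# where the code after the prefix is generated by urllog_get_short below.
+# (example.com and the code "ADO" are illustrative only.)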
+
+
+###
+### Message-texts
+###
+
+# No such host was found
+set urlmsg_nosuchhost "ei tommosta oo!"
+
+# Could not connect to host (I/O errors etc.)
+set urlmsg_ioerror "kraak, virhe yhdynnässä."
+
+# HTTP timeout
+set urlmsg_timeout "ei jaksa ootella"
+
+# No such document was found
+set urlmsg_errorgettingdoc "siitosvirhe"
+
+# URL was already known (was in database)
+set urlmsg_alreadyknown "wanha!"
+#set urlmsg_alreadyknown "Empiiristen havaintojen perusteella ja tällä sovellutusalueella esiintyneisiin aikaisempiin kontekstuaalisiin ilmaisuihin viitaten uskallan todeta, että sovellukseen ilmoittamasi tietoverkko-osoite oli kronologisti ajatellen varsin postpresentuaalisesti sopimaton ja ennestään hyvin tunnettu."
+
+# No match was found when searched with !urlfind or other command
+set urlmsg_nomatch "Ei osumia."
+
+
+###
+### Things that you usually don't need to touch ...
+###
+
+# What IRC "command" should we use to send messages:
+# (Valid alternatives are "PRIVMSG" and "NOTICE")
+set urllog_preferredmsg "PRIVMSG"
+
+# The valid known Top Level Domains (TLDs), but not the country code TLDs
+# (Now includes the new IANA published TLDs)
+set urllog_tlds "org,com,net,mil,gov,biz,edu,coop,aero,info,museum,name,pro,int"
+
+
+##########################################################################
+# No need to look below this line
+##########################################################################
+#-------------------------------------------------------------------------
+set urllog_name "URLLog"
+set urllog_version "1.99.12"
+
+set urllog_tlds [split $urllog_tlds ","]
+set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"] 
+
+set urllog_html_ent [split "&#x202a;||&#x202c;||&lrm;||&aring;|å|&Aring;|Å|&eacute;|é|&#58;|:|&#xe4;|ä|&#xf6;|ö|&#228;|ä|&#246;|ö|&nbsp;| |&#45;|-|&#8221;|\"|&#8220;|\"|&raquo;|>>|&quot;|\"|&auml;|ä|&ouml;|ö|&Auml;|Ä|&Ouml;|Ö|&amp;|&|&lt;|<|&gt;|>|ä|ä|ö|ö|Ä|Ä" "|"]
+
+
+### HTTP module initialization
+package require http
+::http::config -useragent "$urllog_name/$urllog_version"
+if {$http_proxy != 0} {
+	::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
+}
+
+
+### Binding initializations
+if {$urllog_search != 0} {
+	bind pub - !urlfind urllog_pub_urlfind
+	bind msg - urlfind urllog_msg_urlfind
+}
+
+bind pubm - *.* urllog_checkmsg
+bind topc - *.* urllog_checkmsg
+bind msg - paska urllog_checkmsg2
+
+if {$urllog_makewww != 0} {
+	bind dcc m dowebfile urllog_dowebfile
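+	# NOTE: The time bind mask "*0 % % % %" should match whenever the
+	# minute ends in 0, i.e. the web page is regenerated every ten minutes.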
+	bind time - "*0 % % % %" urllog_timer
+}
+
+
+### Initialization messages
+set urllog_message "$urllog_name v$urllog_version (C) 2000-2010 ccr/TNSP"
+putlog "$urllog_message"
+
+if {$http_proxy != 0} {
+	putlog " (Using proxy $http_proxy_host:$http_proxy_port)"
+}
+
+if {$urllog_check != 0} {
+	putlog " (Additional URL validity checks enabled)"
+}
+
+if {$urllog_verbose != 0} {
+	putlog " (Verbose mode enabled)"
+}
+
+if {$urllog_makewww != 0} {
+	putlog " (Creating WWW page to $urllog_webfile)"
+}
+
+if {$urllog_search != 0} {
+	putlog " (Search commands enabled)"
+}
+
+
+#-------------------------------------------------------------------------
+### Utility functions
+proc urllog_log {arg} {
+	global urllog_logmsg urllog_name
+
+	if {$urllog_logmsg != 0} {
+		putlog "$urllog_name: $arg"
+	}
+}
+
+
+proc urllog_ctime { utime } {
+
+	if {$utime == "" || $utime == "*"} {
+		set utime 0
+	}
+
+	return [clock format $utime -format "%d.%m.%Y %H:%M"]
+}
+
+
+proc urllog_dowebfile {hand idx text} {
+global urllog_name urllog_makewww
+
+if {$urllog_makewww == 1} {
+	urllog_log "Making webfiles..."
+	urllog_make_web_file
+} else {
+	urllog_log "Webfile disabled."
+}
+}
+
+
+proc urllog_isnumber {uarg} {
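+	# Returns 1 if uarg consists only of digits. Note that this also
+	# returns 1 for an empty string; callers check that case separately.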
+	set ufoo 1
+
+	foreach i [split $uarg {}] {
+		if {![string match \[0-9\] $i]} {set ufoo 0}
+	}
+
+	return $ufoo
+}
+
+
+proc urllog_msg {apublic anick achan amsg} {
+	global urllog_preferredmsg
+
+	if {$apublic == 1} {
+		putserv "$urllog_preferredmsg $achan :$amsg"
+	} else {
+		putserv "$urllog_preferredmsg $anick :$amsg" 
+	}
+}
+
+
+proc urllog_verb_msg {anick achan amsg} {
+	global urllog_verbose
+
+	if {$urllog_verbose != 0} {
+		urllog_msg 1 $anick $achan $amsg
+	}
+}
+
+
+proc urllog_convert_ent {udata} {
+	global urllog_html_ent
+	# Collapse all whitespace (tabs, CR, LF, runs of spaces) to single spaces
+	regsub -all "\t" $udata " " utmp
+	regsub -all "\r" $utmp " " utmp
+	regsub -all "\n" $utmp " " utmp
+	regsub -all "  *" $utmp " " utmp
+	return [string map -nocase $urllog_html_ent $utmp]
+}
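+# Example: [urllog_convert_ent "Foo &amp; bar &auml;"] yields "Foo & bar ä".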
+
+
+#-------------------------------------------------------------------------
+proc urllog_make_web_file {} {
+global urllog_file urllog_webfile urllog_message botnick
+
+# Starting message
+urllog_log "Creating HTML-file for WWW..."
+
+# Open files
+set fd [open $urllog_webfile w]
+set fd2 [open $urllog_file r]
+
+# HTML headers
+puts $fd "<html><head><title>Caught URLs</title></head>"
+puts $fd "<body bgcolor=\"#FFFFFF\" text=\"#0020a0\" link=\"#0020a0\" vlink=\"#0020a0\" alink=\"#0020a0\">"
+puts $fd "<font face=\"Helvetica, Arial\">"
+puts $fd "<center><font size=\"6\">URLs caught by $botnick</center><hr>"
+puts $fd "<font size=\"3\">"
+
+# Process database, convert to links & info
+while {![eof $fd2]} {
+	gets $fd2 foo
+	if {$foo != ""} {
+		regsub -all "<|>|\"" $foo "" foo
+		set foo [split $foo " "]
+		puts $fd "<a href=\"[lindex $foo 0]\">[lindex $foo 0]</a><br>Added on <B>[urllog_ctime [lindex $foo 1]]</B> by <B>[lindex $foo 2]</B><br><hr>"
+	}
+}
+
+# HTML footers
+puts $fd "<center>Generated by $urllog_message<BR>"
+puts $fd "(Last updated <B>[urllog_ctime [unixtime]]</B>)</center>"
+puts $fd "</body></html>"
+
+# Close files
+close $fd
+close $fd2
+
+# OK-message
+urllog_log "HTML-file generated OK."
+}
+
+
+#-------------------------------------------------------------------------
+proc urllog_timer {umin uhour uday umonth uyear} {
+	urllog_make_web_file
+}
+
+
+#-------------------------------------------------------------------------
+proc urllog_get_short {utime} {
+	global urllog_shorturl urllog_shorturl_prefix
+	set ustr "ABCDEFGHIJKLNMOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+	set ulen [string length $ustr]
+
+	set u1 [expr {$utime / ($ulen * $ulen)}]
+	set utmp [expr {$utime % ($ulen * $ulen)}]
+	set u2 [expr {$utmp / $ulen}]
+	set u3 [expr {$utmp % $ulen}]
+
+	return "\[ $urllog_shorturl_prefix[string index $ustr $u1][string index $ustr $u2][string index $ustr $u3] \]"
+}
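+# Example: ID 200 encodes as 200 / (62*62) = 0 -> "A", (200 % 3844) / 62
+# = 3 -> "D", 200 % 62 = 14 -> "O", so urllog_get_short 200 yields
+# "[ http://tnsp.org/u/ADO ]". A sketch of the inverse mapping (not used
+# by this script; the actual decoding happens in the external redirect
+# script) could look like this:
+#
+#proc urllog_decode_short {ucode} {
+#	set ustr "ABCDEFGHIJKLNMOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
+#	set uid 0
+#	foreach uchar [split $ucode {}] {
+#		set uid [expr {($uid * 62) + [string first $uchar $ustr]}]
+#	}
+#	return $uid
+#}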
+
+
+#-------------------------------------------------------------------------
+proc urllog_chop_url {url} {
+	global urllog_shorturl_orig
+	if {[string length $url] > $urllog_shorturl_orig} {
+		return "[string range $url 0 [expr {$urllog_shorturl_orig - 1}]]..."
+	} else {
+		return $url
+	}
+}
+
+#-------------------------------------------------------------------------
+proc urllog_addurl {urlStr urlNick urlHost urlChan urlTitle} {
+global urlmsg_alreadyknown urllog_file urllog_shorturl
+
+### Let's check if we already know the URL
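+# (The a+ open/close below merely ensures that the file exists)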
+set fd [open $urllog_file a+]
+close $fd
+
+set fd [open $urllog_file r]
+set urlID -1
+while {![eof $fd]} {
+	set qitems [split [gets $fd] " "]
+	set qindex [lindex $qitems 4]
+	if {$qindex != "" && $qindex > $urlID} {
+		set urlID $qindex
+	}
+	if {[lindex $qitems 0] == $urlStr} {
+		urllog_log "URL said by $urlNick ($urlStr) already known"
+		if {$urllog_shorturl != 0} {
+			set qstr "[urllog_get_short $urlID] "
+		} else {
+			set qstr ""
+		}
+		append qstr "([lindex $qitems 2]@[urllog_ctime [lindex $qitems 1]])"
+		if {[string length $urlTitle] > 0} {
+			set qstr "$urlmsg_alreadyknown - '$urlTitle' $qstr"
+		} else {
+			set qstr "$urlmsg_alreadyknown $qstr"
+		}
+		urllog_verb_msg $urlNick $urlChan $qstr
+		close $fd
+		return 0
+	}
+}
+close $fd
+
+
+### OK, the URL was not already known - thus we add it
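+# Record format, one URL per line: <url> <unixtime> <nick> (<host>) <id>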
+incr urlID
+set urlTime [unixtime]
+set fd [open $urllog_file a+]
+puts $fd "$urlStr $urlTime $urlNick ($urlHost) $urlID"
+close $fd
+urllog_log "Added URL ($urlNick@$urlChan): $urlStr"
+
+
+### Let's say something, to confirm that everything went well.
+if {$urllog_shorturl != 0} {
+	set qstr "[urllog_get_short $urlID] "
+} else {
+	set qstr ""
+}
+if {[string length $urlTitle] > 0} {
+	urllog_verb_msg $urlNick $urlChan "'$urlTitle' ([urllog_chop_url $urlStr]) $qstr"
+} else {
+	urllog_verb_msg $urlNick $urlChan "[urllog_chop_url $urlStr] $qstr"
+}
+
+return 1
+}
+
+
+#-------------------------------------------------------------------------
+proc urllog_http_handler {utoken utotal ucurrent} {
+	upvar #0 $utoken state
+
+	# Stop after roughly 3000 received bytes; we assume that the
+	# <head> section (and thus the <title>) fits in this much data.
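+	# (Setting state(status) causes the blocking ::http::geturl call,
+	# which waits on that very variable, to return early.)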
+	if {$state(currentsize) >= 3000} {
+		set state(status) "ok"
+	}
+}
+
+#-------------------------------------------------------------------------
+proc urllog_checkurl {urlStr urlNick urlHost urlChan} {
+global botnick urllog_html urllog_tlds urllog_check urllog_file
+global urlmsg_nosuchhost urlmsg_ioerror urlmsg_timeout urlmsg_errorgettingdoc
+global urllog_httprep urllog_shorturl_prefix urllog_shorturl urllog_encoding
+
+### Some status
+urllog_log "$urlStr ($urlNick@$urlChan)"
+
+### Try to determine the URL protocol component (if it is missing)
+set u_checktld 1
+
+if {[string match "*www.*" $urlStr] && ![string match "http://*" $urlStr] && ![string match "https://*" $urlStr]} {
+	set urlStr "http://$urlStr"
+	}
+
+if {[string match "*ftp.*" $urlStr] && ![string match "ftp://*" $urlStr]} {
+	set urlStr "ftp://$urlStr"
+	}
+
+if {[regexp "(ftp|http|https)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match u_prefix ni1 ni2 ni3 ni4]} {
+	# Check if the IP is in a local or otherwise invalid network
+	if {($ni1 == 127) || ($ni1 == 10) || ($ni1 == 192 && $ni2 == 168) || ($ni1 == 0)} {
+		urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
+		return 0
+		}
+
+	# Skip TLD check for URLs with IP address
+	set u_checktld 0
+	}
+
+if {$urllog_shorturl != 0 && [string match "*$urllog_shorturl_prefix*" $urlStr]} {
+	urllog_log "Ignoring ShortURL."
+	return 0
+}
+
+### Check the PORT (if the ":" is there)
+set u_record [split $urlStr "/"]
+set u_hostname [lindex $u_record 2]
+set u_port [lindex [split $u_hostname ":"] end]
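+# E.g. "http://host.example:8080/path" gives u_hostname "host.example:8080"
+# and u_port "8080"; if there is no ":", u_port equals u_hostname and the
+# checks below ignore it.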
+
+if {![urllog_isnumber $u_port] && $u_port != "" && $u_port != $u_hostname} {
+	urllog_log "Broken URL from $urlNick: ($urlStr) illegal port $u_port"
+	return 0
+	}
+
+# Default to port 80 (HTTP)
+if {![urllog_isnumber $u_port]} {
+	set u_port 80
+	}
+
+### Is it an HTTP or FTP URL? (FIX ME!)
+if {[string range $urlStr 0 3] != "http" && [string range $urlStr 0 2] != "ftp"} {
+	urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED TYPE (not HTTP or FTP)"
+	return 0
+	}
+
+### Check the Top Level Domain (TLD) validity
+if {$u_checktld != 0} {
+	set u_sane [lindex [split $u_hostname "."] end]
+	set u_tld [lindex [split $u_sane ":"] 0]
+	set u_found 0
+
+	if {[string length $u_tld] == 2} {
+		# Assume all 2-letter domains to be valid :)
+		set u_found 1
+	} else {
+		# Check our list of known TLDs
+		foreach itld $urllog_tlds {
+			if {[string match $itld $u_tld]} {
+				set u_found 1
+			}
+		}
+		}
+
+	if {$u_found == 0} {
+		urllog_log "Broken URL from $urlNick: ($urlStr) illegal TLD: $u_tld."
+		return 0
+		}
+	}
+
+set urlStr [string map $urllog_httprep $urlStr]
+
+### Do we perform additional optional checks?
+if {$urllog_check != 0 && [string range $urlStr 0 6] == "http://"} {
+	# Does the document pointed by the URL exist?
+	if {[catch {set utoken [::http::geturl $urlStr -progress urllog_http_handler -blocksize 1024 -timeout 4000]} uerrmsg]} {
+		urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
+		urllog_log "HTTP request failed: $uerrmsg"
+		return 0
+		}
+
+	upvar #0 $utoken ustate
+	
+	if {$ustate(status) == "timeout"} {
+		urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
+		urllog_log "HTTP request timed out ($urlStr)"
+		}
+
+	if {$ustate(status) == "error"} {
+		urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])"
+		urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
+		}
+
+	# FIXME! Handle redirects!
+	if {$ustate(status) == "ok"} {
+		if {[::http::ncode $utoken] >= 200 && [::http::ncode $utoken] <= 309} {
+			set udata $ustate(body)
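+			# Try to find the document character set; the pattern matches e.g.
+			# <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"/>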
+			set umatches [regexp -nocase -inline -- "<meta.\*\?content=\".\*\?charset=(\[^\"\]*)\"/>" $udata]
+			set uconvert 0
+			if {[llength $umatches] > 0} {
+				set uencoding [lindex $umatches 1]
+				if {[string length $uencoding] > 3} {
+					set uconvert 1
+				}
+			}
+			
+			set umatches [regexp -nocase -inline -- "<title>(.\*\?)</title>" $udata]
+			if {[llength $umatches] > 0} {
+				set urlTitle [lindex $umatches 1]
+				if {$uconvert != 0} {
+					set urlTitle [encoding convertfrom $uencoding $urlTitle]
+				}
+				set urlTitle [urllog_convert_ent $urlTitle]
+				set urlTitle [string trim $urlTitle]
+			} else {
+				set urlTitle ""
+			}
+			urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle
+		} else {
+			urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::code $utoken])"
+			urllog_log "[::http::code $utoken] - $urlStr"
+		}
+	}
+
+	::http::cleanup $utoken
+	} else {
+		# No optional checks, just add the URL
+		urllog_addurl $urlStr $urlNick $urlHost $urlChan ""
+	}
+}
+
+
+#-------------------------------------------------------------------------
+proc urllog_checkmsg {nick uhost hand chan text} {
+### Check the nick
+if {$nick == "*"} {
+	urllog_log "urllog_checkmsg: nick was a wildcard, this should not happen."
+	return 0
+	}
+
+### Do the URL checking
+foreach istr [split $text " "] {
+	if {[string match "*http://*" $istr] || [string match "*ftp://*" $istr] ||
+	[string match "*www.*" $istr] || [string match "*ftp.*" $istr] ||
+	[regexp "(ftp|http|https)://\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}" $istr imatch]} {
+		urllog_checkurl $istr $nick $uhost $chan
+		}
+	}
+
+return 0
+}
+
+
+#-------------------------------------------------------------------------
+### Find from database according to patterns, up to imax results.
+
+proc urllog_urlfind {ipatterns imax} {
+global urllog_file
+
+### Search the database for pattern
+### Clear the count, open the URL logfile
+set iresults {}
+set nresults 0
+set fd [open $urllog_file r]
+
+### Start searching...
+while {![eof $fd]} {
+
+	# Get one URL for inspection
+	gets $fd foo
+	set irecord [split [string tolower $foo] " "]
+	set iurl [lindex $irecord 0]
+	set iname [lindex $irecord 2]
+
+	# Match with all given patterns and rules
+	set imatched 1
+	foreach ipattern $ipatterns {
+		set foob [split [string tolower $ipattern] " "]
+		set ftoken [lindex $foob 0]
+		set fparam [lindex $foob 1]
+		set fmatch [string match $fparam $iurl]
+
+		if {$ftoken == "+" && $fmatch == 0} { set imatched 0 }
+
+		if {$ftoken == "-" && $fmatch == 1} { set imatched 0 }
+
+		if {$ftoken == "%" && [string match $fparam $iname] == 0} { set imatched 0 }
+	}
+
+	# If all patterns matched, add to the list...
+	if {$imatched == 1 && $foo != ""} {
+		incr nresults
+		lappend iresults $foo
+	}
+}
+
+# Close file
+close $fd
+
+# Take only last imax results
+return [lrange $iresults [expr {$nresults - $imax}] $nresults]
+}
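+# Example: [urllog_urlfind {{+ *tcl*} {- *wiki*} {% *ccr*}} 5] returns at
+# most five entries whose URL matches *tcl* but not *wiki*, and which were
+# said by a nick matching *ccr*.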
+
+
+#-------------------------------------------------------------------------
+### Parse arguments, find and show the results
+proc urllog_find {unick uhand uchan utext upublic} {
+global botnick urllog_name urllog_version urllog_shorturl
+global urllog_showmax_pub urllog_showmax_priv urlmsg_nomatch
+
+### Parse the given command
+	urllog_log "$unick/$uhand searched URL: $utext"
+
+	set footokens [split $utext " "]
+	foreach ftoken $footokens {
+		set foomark [string range $ftoken 0 0]
+		set foopat  [string range $ftoken 1 end]
+
+		if {$foomark == "-" || $foomark == "+" || $foomark == "%" || $foomark == "@"} {
+			lappend ipatlist "$foomark *$foopat*"
+		} else {
+			lappend ipatlist "+ *$ftoken*"
+		}
+	}
+	
+### Get the matches from database
+
+	if {$upublic == 0} {
+		set iresults [urllog_urlfind $ipatlist $urllog_showmax_priv]
+	} else {
+		set iresults [urllog_urlfind $ipatlist $urllog_showmax_pub]
+	}
+
+### Show the results
+	if {$iresults != ""} {
+		set j 0
+		foreach i $iresults {
+			incr j
+			set foo [split $i " "]
+			set shortURL [lindex $foo 0]
+			set shortID [lindex $foo 4]
+
+			if {$urllog_shorturl != 0 && $shortID != ""} {
+				set shortURL "$shortURL [urllog_get_short $shortID]"
+			}
+
+			urllog_msg $upublic $unick $uchan "#$j: $shortURL ([lindex $foo 2]@[urllog_ctime [lindex $foo 1]])"
+		}
+
+	} else {
+		# If no URLs were found
+		urllog_msg $upublic $unick $uchan $urlmsg_nomatch
+	}
+
+return 0
+}
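+# Example: "!urlfind tcl -wiki %ccr" finds URLs containing "tcl" but not
+# "wiki", said by a nick containing "ccr". (The "@" prefix is parsed above
+# but currently has no matching rule in urllog_urlfind.)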
+
+
+#-------------------------------------------------------------------------
+### Bindings for the search commands
+proc urllog_pub_urlfind {unick uhost uhand uchan utext} {
+
+urllog_find $unick $uhand $uchan $utext 1
+
+return 0
+}
+
+
+proc urllog_msg_urlfind {unick uhost uhand utext} {
+
+urllog_find $unick $uhand "" $utext 0
+
+return 0
+}
+
+#-------------------------------------------------------------------------
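+# (Debug helper, bound to the private message command "paska" above)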
+proc urllog_checkmsg2 {unick uhost uhand utext} {
+
+urllog_checkurl $utext $unick $uhost "#CHANNEL"
+
+return 0
+}
+
+
+
+# end of script