##########################################################################
#
# URLLog v1.99.12 by ccr/TNSP <ccr@tnsp.org>
# (C) Copyright 2000-2010 Tecnic Software productions (TNSP)
#
##########################################################################
#
# NOTICE! If you are upgrading to v1.90+ from any older version, you
# might want to run a conversion script against your URL-database file.
#
# It is NOT strictly necessary, but recommended especially if you
# utilize the "shorturl" functionality. The conversion script is
# available at < http://tnsp.org/egg-tcls/ >
#
##########################################################################
###
### HTTP options
###
# Set to 1 if you want to use a proxy
set http_proxy 0

# Proxy host and port number (only used if enabled above)
set http_proxy_host ""
set http_proxy_port 8080


###
### General options
###

# Filename where the logged URL data goes
set urllog_file "data.urllog"


# 1 = Verbose: announce when a URL is OK, bad, etc.
# 0 = Quiet  : Be quiet (only speak when asked with !urlfind, etc.)
set urllog_verbose 1


# 1 = Write some info to the bot's logfile during operation
# 0 = Don't.
set urllog_logmsg 1


# 1 = Check URLs for validity and existence before adding.
# 0 = No checks. Add _anything_ that looks like a URL to the database.
set urllog_check 1


###
### WWW page creation options
###
# 1 = Create the www pages, 0 = Don't.
set urllog_makewww 0

# TIP: If you don't want the bot to create the HTML file, you can
# generate the page dynamically with a simple Perl/Ruby/Python/PHP/
# whatnot script instead; see the sketch below.
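#
# A minimal sketch of such a page as a Tcl CGI script (hypothetical
# paths; assumes your web server is set up to execute Tcl CGIs):
#
#   #!/usr/bin/tclsh
#   puts "Content-Type: text/html\n"
#   puts "<html><body>"
#   set fd [open "/home/user/bot/data.urllog" r]
#   while {[gets $fd line] >= 0} {
#       set f [split $line " "]
#       puts "<a href=\"[lindex $f 0]\">[lindex $f 0]</a><br>"
#   }
#   close $fd
#   puts "</body></html>"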

# Filename AND FULL PATH of the html-file where the www-format log goes
# (Remember to set the permissions right after the file has been created)
set urllog_webfile "/home/niinuska/public_html/urllog.html"


###
### Search related settings
###

# 0 = No search-commands available
# 1 = Search enabled
set urllog_search 1


# Maximum number of URLs the public !urlfind command will show
set urllog_showmax_pub 3


# For private search, this is the default limit (the user can change it)
set urllog_showmax_priv 6


###
### ShortURL-settings
###

# 1 = Use ShortURLs
# 0 = Don't.
set urllog_shorturl 1

# Max length of original URL to be shown
set urllog_shorturl_orig 30

# Path to PHP/CGI-script that redirects ShortURLs
set urllog_shorturl_prefix "http://tnsp.org/u/"


###
### Message-texts
###

# No such host was found
set urlmsg_nosuchhost "No such host!"

# Could not connect to host (I/O errors etc.)
set urlmsg_ioerror "Croak, I/O error."

# HTTP timeout
set urlmsg_timeout "Can't be bothered to wait."

# No such document was found
set urlmsg_errorgettingdoc "Error getting the document."

# URL was already known (was in database)
set urlmsg_alreadyknown "Old!"
#set urlmsg_alreadyknown "Based on empirical observations, and referring to earlier contextual expressions within this application domain, I dare to state that the network address you submitted was, chronologically speaking, rather postpresentually inappropriate and already well known."

# No match was found when searching with !urlfind or another command
set urlmsg_nomatch "No matches."


###
### Things that you usually don't need to touch ...
###

# What IRC "command" should we use to send messages:
# (Valid alternatives are "PRIVMSG" and "NOTICE")
set urllog_preferredmsg "PRIVMSG"

# The valid known Top Level Domains (TLDs), but not the country code TLDs
# (Now includes the new IANA published TLDs)
set urllog_tlds "org,com,net,mil,gov,biz,edu,coop,aero,info,museum,name,pro,int"


##########################################################################
# No need to look below this line
##########################################################################
#-------------------------------------------------------------------------
set urllog_name "URLLog"
set urllog_version "1.99.12"

set urllog_tlds [split $urllog_tlds ","]
set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"] 

set urllog_html_ent [split "&#x202a;||&#x202c;||&lrm;||&aring;|å|&Aring;|Å|&eacute;|é|&#58;|:|&#xe4;|ä|&#xf6;|ö|&#228;|ä|&#246;|ö|&nbsp;| |&#45;|-|&#8221;|\"|&#8220;|\"|&raquo;|>>|&quot;|\"|&auml;|ä|&ouml;|ö|&Auml;|Ä|&Ouml;|Ö|&amp;|&|&lt;|<|&gt;|>|ä|ä|ö|ö|Ä|Ä" "|"]
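
# (The string above is split on "|" into key/value pairs for [string map]:
# each HTML entity maps to its plain-text replacement, and entries with an
# empty replacement, such as &#x202a;, are simply stripped.)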


### HTTP module initialization
package require http
::http::config -useragent "$urllog_name/$urllog_version"
if {$http_proxy != 0} {
	::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
}


### Binding initializations
if {$urllog_search != 0} {
	bind pub - !urlfind urllog_pub_urlfind
	bind msg - urlfind urllog_msg_urlfind
}

bind pubm - *.* urllog_checkmsg
bind topc - *.* urllog_checkmsg
bind msg - paska urllog_checkmsg2

if {$urllog_makewww != 0} {
	bind dcc m dowebfile urllog_dowebfile
	bind time - "*0 % % % %" urllog_timer
}
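
# (Eggdrop bind syntax above: bind <type> <flags> <mask> <proc>; "-" means
# no flags are required, "m" restricts the .dowebfile DCC command to
# masters, and the time mask "*0 % % % %" fires every ten minutes.)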


### Initialization messages
set urllog_message "$urllog_name v$urllog_version (C) 2000-2010 ccr/TNSP"
putlog "$urllog_message"

if {$http_proxy != 0} {
	putlog " (Using proxy $http_proxy_host:$http_proxy_port)"
}

if {$urllog_check != 0} {
	putlog " (Additional URL validity checks enabled)"
}

if {$urllog_verbose != 0} {
	putlog " (Verbose mode enabled)"
}

if {$urllog_makewww != 0} {
	putlog " (Creating WWW page to $urllog_webfile)"
}

if {$urllog_search != 0} {
	putlog " (Search commands enabled)"
}


#-------------------------------------------------------------------------
### Utility functions
proc urllog_log {arg} {
	global urllog_logmsg urllog_name

	if {$urllog_logmsg != 0} {
		putlog "$urllog_name: $arg"
	}
}


proc urllog_ctime { utime } {
	if {$utime == "" || $utime == "*"} {
		set utime 0
	}

	return [clock format $utime -format "%d.%m.%Y %H:%M"]
}


proc urllog_dowebfile {hand idx text} {
	global urllog_name urllog_makewww

	if {$urllog_makewww == 1} {
		urllog_log "Making webfiles..."
		urllog_make_web_file
	} else {
		urllog_log "Webfile disabled."
	}
}


proc urllog_isnumber {uarg} {
	# Return 1 if the argument consists only of digits, 0 otherwise
	if {$uarg == ""} { return 0 }

	foreach i [split $uarg {}] {
		if {![string match \[0-9\] $i]} { return 0 }
	}

	return 1
}


proc urllog_msg {apublic anick achan amsg} {
	global urllog_preferredmsg

	if {$apublic == 1} {
		putserv "$urllog_preferredmsg $achan :$amsg"
	} else {
		putserv "$urllog_preferredmsg $anick :$amsg" 
	}
}


proc urllog_verb_msg {anick achan amsg} {
	global urllog_verbose

	if {$urllog_verbose != 0} {
		urllog_msg 1 $anick $achan $amsg
	}
}


proc urllog_convert_ent {udata} {
	global urllog_html_ent
	# Collapse tabs, CR/LF and runs of spaces, then map HTML entities
	regsub -all "\t" $udata " " utmp
	regsub -all "\r" $utmp " " utmp
	regsub -all "\n" $utmp " " utmp
	regsub -all "  *" $utmp " " utmp
	return [string map -nocase $urllog_html_ent $utmp]
}


#-------------------------------------------------------------------------
proc urllog_make_web_file {} {
	global urllog_file urllog_webfile urllog_message botnick

	# Starting message
	urllog_log "Creating HTML-file for WWW..."

	# Open files
	set fd [open $urllog_webfile w]
	set fd2 [open $urllog_file r]

	# HTML headers
	puts $fd "<html><head><title>Caught URLs</title></head>"
	puts $fd "<body bgcolor=\"#FFFFFF\" text=\"#0020a0\" link=\"#0020a0\" vlink=\"#0020a0\" alink=\"#0020a0\">"
	puts $fd "<font face=\"Helvetica, Arial\">"
	puts $fd "<center><font size=\"6\">URLs caught by $botnick</font></center><hr>"
	puts $fd "<font size=\"3\">"

	# Process the database, convert records to links & info
	while {![eof $fd2]} {
		gets $fd2 foo
		if {$foo != ""} {
			regsub -all "<|>|\"" $foo "" foo
			set foo [split $foo " "]
			puts $fd "<a href=\"[lindex $foo 0]\">[lindex $foo 0]</a><br>Added on <B>[urllog_ctime [lindex $foo 1]]</B> by <B>[lindex $foo 2]</B><br><hr>"
		}
	}

	# HTML footers
	puts $fd "<center>Generated by $urllog_message<BR>"
	puts $fd "(Last updated <B>[urllog_ctime [unixtime]]</B>)</center>"
	puts $fd "</body></html>"

	# Close files
	close $fd
	close $fd2

	# OK-message
	urllog_log "HTML-file generated OK."
}


#-------------------------------------------------------------------------
proc urllog_timer {umin uhour uday umonth uyear} {
	urllog_make_web_file
}


#-------------------------------------------------------------------------
proc urllog_get_short {utime} {
	global urllog_shorturl urllog_shorturl_prefix

	# The base-62 alphabet used for encoding the URL's database ID; this
	# must match the alphabet used by the redirecting script (including
	# its unusual character order), or existing ShortURLs will break.
	set ustr "ABCDEFGHIJKLNMOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
	set ulen [string length $ustr]

	# Encode the ID as three base-62 digits, most significant first
	set u1 [expr {$utime / ($ulen * $ulen)}]
	set utmp [expr {$utime % ($ulen * $ulen)}]
	set u2 [expr {$utmp / $ulen}]
	set u3 [expr {$utmp % $ulen}]

	return "\[ $urllog_shorturl_prefix[string index $ustr $u1][string index $ustr $u2][string index $ustr $u3] \]"
}
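
# For reference, a sketch of the inverse mapping that the redirecting
# script behind urllog_shorturl_prefix would perform (illustrative only;
# this helper is not used anywhere in this script):
proc urllog_decode_short {ucode} {
	set ustr "ABCDEFGHIJKLNMOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
	set uid 0
	foreach uchar [split $ucode {}] {
		# Each character is one base-62 digit, most significant first
		set uid [expr {$uid * [string length $ustr] + [string first $uchar $ustr]}]
	}
	return $uid
}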


#-------------------------------------------------------------------------
proc urllog_chop_url {url} {
	global urllog_shorturl_orig

	# Truncate overlong URLs to at most urllog_shorturl_orig characters
	if {[string length $url] > $urllog_shorturl_orig} {
		return "[string range $url 0 [expr {$urllog_shorturl_orig - 1}]]..."
	} else {
		return $url
	}
}

#-------------------------------------------------------------------------
proc urllog_addurl {urlStr urlNick urlHost urlChan urlTitle} {
	global urlmsg_alreadyknown urllog_file urllog_shorturl

	### Make sure the database file exists
	set fd [open $urllog_file a+]
	close $fd

	### Check if we already know the URL, tracking the highest ID so far
	set fd [open $urllog_file r]
	set urlID -1
	while {![eof $fd]} {
		set qitems [split [gets $fd] " "]
		set qindex [lindex $qitems 4]
		if {$qindex != "" && $qindex > $urlID} {
			set urlID $qindex
		}
		if {[lindex $qitems 0] == $urlStr} {
			urllog_log "URL said by $urlNick ($urlStr) already known"
			if {$urllog_shorturl != 0} {
				set qstr "[urllog_get_short $urlID] "
			} else {
				set qstr ""
			}
			append qstr "([lindex $qitems 2]@[urllog_ctime [lindex $qitems 1]])"
			if {[string length $urlTitle] > 0} {
				set qstr "$urlmsg_alreadyknown - '$urlTitle' $qstr"
			} else {
				set qstr "$urlmsg_alreadyknown $qstr"
			}
			urllog_verb_msg $urlNick $urlChan $qstr
			close $fd
			return 0
		}
	}
	close $fd

	### The URL was not already known, so add it with the next free ID
	incr urlID
	set urlTime [unixtime]
	set fd [open $urllog_file a+]
	puts $fd "$urlStr $urlTime $urlNick ($urlHost) $urlID"
	close $fd
	urllog_log "Added URL ($urlNick@$urlChan): $urlStr"

	### Say something to confirm that everything went well
	if {$urllog_shorturl != 0} {
		set qstr "[urllog_get_short $urlID] "
	} else {
		set qstr ""
	}
	if {[string length $urlTitle] > 0} {
		urllog_verb_msg $urlNick $urlChan "'$urlTitle' ([urllog_chop_url $urlStr]) $qstr"
	} else {
		urllog_verb_msg $urlNick $urlChan "[urllog_chop_url $urlStr] $qstr"
	}

	return 1
}
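
# For reference, each database record is a single line of the form:
#   <url> <unixtime> <nick> (<host>) <id>
# e.g. (a purely hypothetical record):
#   http://example.com/ 1285063969 ccr (ccr@example.net) 42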


#-------------------------------------------------------------------------
proc urllog_http_handler {utoken utotal ucurrent} {
	upvar #0 $utoken state

	# Stop after about 3000 received bytes; we assume that the
	# interesting <head> section is contained in this much data.
	# Setting the status field makes the blocking ::http::geturl
	# call return, with the partial body left in state(body).
	if {$state(currentsize) >= 3000} {
		set state(status) "ok"
	}
}

#-------------------------------------------------------------------------
proc urllog_checkurl {urlStr urlNick urlHost urlChan} {
	global botnick urllog_tlds urllog_check urllog_file
	global urlmsg_nosuchhost urlmsg_ioerror urlmsg_timeout urlmsg_errorgettingdoc
	global urllog_httprep urllog_shorturl_prefix urllog_shorturl

	### Some status
	urllog_log "$urlStr ($urlNick@$urlChan)"

	### Prepend the URL protocol component, if it is missing
	set u_checktld 1

	if {[string match "*www.*" $urlStr] && ![string match "http://*" $urlStr] && ![string match "https://*" $urlStr]} {
		set urlStr "http://$urlStr"
	}

	if {[string match "*ftp.*" $urlStr] && ![string match "ftp://*" $urlStr]} {
		set urlStr "ftp://$urlStr"
	}

	if {[regexp "(ftp|http|https)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match u_prefix ni1 ni2 ni3 ni4]} {
		# Check if the IP is on a local or invalid network
		if {($ni1 == 127) || ($ni1 == 10) || ($ni1 == 192 && $ni2 == 168) || ($ni1 == 0)} {
			urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
			return 0
		}

		# Skip the TLD check for URLs with an IP address
		set u_checktld 0
	}

	if {$urllog_shorturl != 0 && [string match "*$urllog_shorturl_prefix*" $urlStr]} {
		urllog_log "Ignoring ShortURL."
		return 0
	}

	### Check the port number (if a ":" is present in the host part)
	set u_record [split $urlStr "/"]
	set u_hostname [lindex $u_record 2]
	set u_port [lindex [split $u_hostname ":"] end]

	if {![urllog_isnumber $u_port] && $u_port != "" && $u_port != $u_hostname} {
		urllog_log "Broken URL from $urlNick: ($urlStr) illegal port $u_port"
		return 0
	}

	# Default to port 80 (HTTP)
	if {![urllog_isnumber $u_port]} {
		set u_port 80
	}
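
	# For example (hypothetical URL): for "http://example.com:8080/x",
	# u_record index 2 is "example.com:8080", so u_port becomes "8080";
	# without a ":" the split yields the hostname itself, which the
	# port check above tolerates.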

	### Is it an http or ftp URL? (FIX ME!)
	if {[string range $urlStr 0 3] != "http" && [string range $urlStr 0 2] != "ftp"} {
		urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED TYPE (not HTTP or FTP)"
		return 0
	}

	### Check the Top Level Domain (TLD) validity
	if {$u_checktld != 0} {
		set u_sane [lindex [split $u_hostname "."] end]
		set u_tld [lindex [split $u_sane ":"] 0]
		set u_found 0

		if {[string length $u_tld] == 2} {
			# Assume all 2-letter domains to be valid :)
			set u_found 1
		} else {
			# Check our list of known TLDs
			foreach itld $urllog_tlds {
				if {[string match $itld $u_tld]} {
					set u_found 1
				}
			}
		}

		if {$u_found == 0} {
			urllog_log "Broken URL from $urlNick: ($urlStr) illegal TLD: $u_tld."
			return 0
		}
	}

	set urlStr [string map $urllog_httprep $urlStr]

	### Do we perform additional optional checks?
	if {$urllog_check != 0 && [string range $urlStr 0 6] == "http://"} {
		# Does the document pointed to by the URL exist?
		if {[catch {set utoken [::http::geturl $urlStr -progress urllog_http_handler -blocksize 1024 -timeout 4000]} uerrmsg]} {
			urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
			urllog_log "HTTP request failed: $uerrmsg"
			return 0
		}

		upvar #0 $utoken ustate

		if {$ustate(status) == "timeout"} {
			urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
			urllog_log "HTTP request timed out ($urlStr)"
		}

		if {$ustate(status) == "error"} {
			urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])"
			urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
		}

		# FIXME! Handle redirects!
		if {$ustate(status) == "ok"} {
			if {[::http::ncode $utoken] >= 200 && [::http::ncode $utoken] <= 309} {
				# Try to find a charset declaration in the received data
				set udata $ustate(body)
				set umatches [regexp -nocase -inline -- "<meta.\*\?content=\".\*\?charset=(\[^\"\]*)\"/>" $udata]
				set uconvert 0
				if {[llength $umatches] > 0} {
					set uencoding [lindex $umatches 1]
					if {[string length $uencoding] > 3} {
						set uconvert 1
					}
				}

				# Extract and clean up the page title, if any
				set umatches [regexp -nocase -inline -- "<title>(.\*\?)</title>" $udata]
				if {[llength $umatches] > 0} {
					set urlTitle [lindex $umatches 1]
					if {$uconvert != 0} {
						set urlTitle [encoding convertfrom $uencoding $urlTitle]
					}
					set urlTitle [urllog_convert_ent $urlTitle]
					regsub -all "(^ *| *$)" $urlTitle "" urlTitle
				} else {
					set urlTitle ""
				}
				urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle
			} else {
				urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::code $utoken])"
				urllog_log "[::http::code $utoken] - $urlStr"
			}
		}

		::http::cleanup $utoken
	} else {
		# No optional checks, just add the URL
		urllog_addurl $urlStr $urlNick $urlHost $urlChan ""
	}
}


#-------------------------------------------------------------------------
proc urllog_checkmsg {nick uhost hand chan text} {
	### Check the nick
	if {$nick == "*"} {
		urllog_log "urllog_checkmsg: nick was a wildcard, this should not happen."
		return 0
	}

	### Do the URL checking
	foreach istr [split $text " "] {
		if {[string match "*http://*" $istr] || [string match "*ftp://*" $istr] ||
		    [string match "*www.*" $istr] || [string match "*ftp.*" $istr] ||
		    [regexp "(ftp|http|https)://\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}" $istr imatch]} {
			urllog_checkurl $istr $nick $uhost $chan
		}
	}

	return 0
}


#-------------------------------------------------------------------------
### Find from database according to patterns, up to imax results.

proc urllog_urlfind {ipatterns imax} {
	global urllog_file

	### Clear the results, open the URL logfile
	set iresults {}
	set nresults 0
	set fd [open $urllog_file r]

	### Start searching...
	while {![eof $fd]} {
		# Get one URL record for inspection
		gets $fd foo
		set irecord [split [string tolower $foo] " "]
		set iurl [lindex $irecord 0]
		set iname [lindex $irecord 2]

		# Match against all given patterns and rules:
		# "+" - the URL must match the pattern
		# "-" - the URL must NOT match the pattern
		# "%" - the adder's nick must match the pattern
		set imatched 1
		foreach ipattern $ipatterns {
			set foob [split [string tolower $ipattern] " "]
			set ftoken [lindex $foob 0]
			set fparam [lindex $foob 1]
			set fmatch [string match $fparam $iurl]

			if {$ftoken == "+" && $fmatch == 0} { set imatched 0 }

			if {$ftoken == "-" && $fmatch == 1} { set imatched 0 }

			if {$ftoken == "%" && [string match $fparam $iname] == 0} { set imatched 0 }
		}

		# If all the patterns matched, add the record to the results
		if {$imatched == 1 && $foo != ""} {
			incr nresults
			lappend iresults $foo
		}
	}

	# Close file
	close $fd

	# Return only the last imax results
	return [lrange $iresults [expr {$nresults - $imax}] $nresults]
}


#-------------------------------------------------------------------------
### Parse arguments, find and show the results
proc urllog_find {unick uhand uchan utext upublic} {
	global botnick urllog_name urllog_version urllog_shorturl
	global urllog_showmax_pub urllog_showmax_priv urlmsg_nomatch

	### Parse the given command
	urllog_log "$unick/$uhand searched URL: $utext"

	set footokens [split $utext " "]
	foreach ftoken $footokens {
		set foomark [string range $ftoken 0 0]
		set foopat  [string range $ftoken 1 end]

		if {$foomark == "-" || $foomark == "+" || $foomark == "%" || $foomark == "@"} {
			lappend ipatlist "$foomark *$foopat*"
		} else {
			lappend ipatlist "+ *$ftoken*"
		}
	}

	### Get the matches from the database
	if {$upublic == 0} {
		set iresults [urllog_urlfind $ipatlist $urllog_showmax_priv]
	} else {
		set iresults [urllog_urlfind $ipatlist $urllog_showmax_pub]
	}

	### Show the results
	if {$iresults != ""} {
		set j 0
		foreach i $iresults {
			incr j
			set foo [split $i " "]
			set shortURL [lindex $foo 0]
			set shortID [lindex $foo 4]

			if {$urllog_shorturl != 0 && $shortID != ""} {
				set shortURL "$shortURL [urllog_get_short $shortID]"
			}

			urllog_msg $upublic $unick $uchan "#$j: $shortURL ([lindex $foo 2]@[urllog_ctime [lindex $foo 1]])"
		}
	} else {
		# If no URLs were found
		urllog_msg $upublic $unick $uchan $urlmsg_nomatch
	}

	return 0
}
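
# Example (hypothetical): the command "!urlfind +tcl -wiki %ccr" builds the
# pattern list {"+ *tcl*" "- *wiki*" "% *ccr*"}, i.e. matching URLs must
# contain "tcl", must not contain "wiki", and must have been added by a
# nick matching "ccr". A leading "@" is also accepted above, but is
# currently ignored by urllog_urlfind.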


#-------------------------------------------------------------------------
### Bound handlers for the search commands
proc urllog_pub_urlfind {unick uhost uhand uchan utext} {
	urllog_find $unick $uhand $uchan $utext 1
	return 0
}


proc urllog_msg_urlfind {unick uhost uhand utext} {
	urllog_find $unick $uhand "" $utext 0
	return 0
}

#-------------------------------------------------------------------------
### Private message test hook (bound to "msg paska"); checks the given
### text as a URL, with replies going to the literal placeholder target
### "#CHANNEL".
proc urllog_checkmsg2 {unick uhost uhand utext} {
	urllog_checkurl $utext $unick $uhost "#CHANNEL"
	return 0
}



# end of script