diff urllog.tcl @ 0:1c4e2814cd41
Initial import.
author    Matti Hamalainen <ccr@tnsp.org>
date      Tue, 21 Sep 2010 13:12:49 +0300
parents   (none)
children  8003090caa35
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/urllog.tcl	Tue Sep 21 13:12:49 2010 +0300
@@ -0,0 +1,711 @@
##########################################################################
#
# URLLog v1.99.12 by ccr/TNSP <ccr@tnsp.org>
# (C) Copyright 2000-2010 Tecnic Software productions (TNSP)
#
##########################################################################
#
# NOTICE! If you are upgrading to v1.90+ from any older version, you
# might want to run a conversion script against your URL-database file.
#
# It is NOT strictly necessary, but recommended especially if you
# utilize the "shorturl" functionality. The conversion script is
# available at < http://tnsp.org/egg-tcls/ >
#
##########################################################################
###
### HTTP options
###
# Set to 1 if you want to use proxy
set http_proxy 0

# Proxy host and port number (only used if enabled above)
set http_proxy_host ""
set http_proxy_port 8080


###
### General options
###

# Filename where the logged URL data goes
set urllog_file "data.urllog"


# 1 = Verbose: Say messages when URL is OK, bad, etc.
# 0 = Quiet : Be quiet (only speak if asked with !urlfind, etc)
set urllog_verbose 1


# 1 = Put some info to bot's Logfile during operation
# 0 = Don't.
set urllog_logmsg 1


# 1 = Check URLs for validity and existence before adding.
# 0 = No checks. Add _anything_ that looks like an URL to the database.
set urllog_check 1


###
### WWW page creation options
###
# 1 = Create the www pages, 0 = Don't.
set urllog_makewww 0

# TIP: If you don't want the bot to create the HTML-file, you can
# use a simple Perl/Ruby/Python/PHP/whatnot-scripted page to do that.

# Filename AND FULL PATH of the html-file where the www-format log goes
# (Remember to set the permissions right after the file has been created)
set urllog_webfile "/home/niinuska/public_html/urllog.html"


###
### Search related settings
###

# 0 = No search-commands available
# 1 = Search enabled
set urllog_search 1


# How many URL's should the !urlfind command show (maximum limit)
set urllog_showmax_pub 3


# For private-search, this is the default limit (user can change it)
set urllog_showmax_priv 6


###
### ShortURL-settings
###

# 1 = Use ShortURLs
# 0 = Don't.
set urllog_shorturl 1

# Max length of original URL to be shown
set urllog_shorturl_orig 30

# Path to PHP/CGI-script that redirects ShortURLs
set urllog_shorturl_prefix "http://tnsp.org/u/"


###
### Message-texts
###

# No such host was found
set urlmsg_nosuchhost "ei tommosta oo!"

# Could not connect host (I/O errors etc)
set urlmsg_ioerror "kraak, virhe yhdynnässä."

# HTTP timeout
set urlmsg_timeout "ei jaksa ootella"

# No such document was found
set urlmsg_errorgettingdoc "siitosvirhe"

# URL was already known (was in database)
set urlmsg_alreadyknown "wanha!"
#set urlmsg_alreadyknown "Empiiristen havaintojen perusteella ja tällä sovellutusalueella esiintyneisiin aikaisempiin kontekstuaalisiin ilmaisuihin viitaten uskallan todeta, että sovellukseen ilmoittamasi tietoverkko-osoite oli kronologisti ajatellen varsin postpresentuaalisesti sopimaton ja ennestään hyvin tunnettu."

# No match was found when searched with !urlfind or other command
set urlmsg_nomatch "Ei osumia."


###
### Things that you usually don't need to touch ...
###

# What IRC "command" should we use to send messages:
# (Valid alternatives are "PRIVMSG" and "NOTICE")
set urllog_preferredmsg "PRIVMSG"

# The valid known Top Level Domains (TLDs), but not the country code TLDs
# (Now includes the new IANA published TLDs)
set urllog_tlds "org,com,net,mil,gov,biz,edu,coop,aero,info,museum,name,pro,int"


##########################################################################
# No need to look below this line
##########################################################################
#-------------------------------------------------------------------------
set urllog_name "URLLog"
set urllog_version "1.99.12"

set urllog_tlds [split $urllog_tlds ","]
set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"]

set urllog_html_ent [split "‪||‬||‎||å|å|Å|Å|é|é|:|:|ä|ä|ö|ö|ä|ä|ö|ö| | |-|-|”|\"|“|\"|»|>>|"|\"|ä|ä|ö|ö|Ä|Ä|Ö|Ö|&|&|<|<|>|>|ä|ä|ö|ö|Ä|Ä" "|"]


### HTTP module initialization
package require http
::http::config -useragent "$urllog_name/$urllog_version"
if {$http_proxy != 0} {
    ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
}


### Binding initializations
if {$urllog_search != 0} {
    bind pub - !urlfind urllog_pub_urlfind
    bind msg - urlfind urllog_msg_urlfind
}

bind pubm - *.* urllog_checkmsg
bind topc - *.* urllog_checkmsg
bind msg - paska urllog_checkmsg2

if {$urllog_makewww != 0} {
    bind dcc m dowebfile urllog_dowebfile
    bind time - "*0 % % % %" urllog_timer
}


### Initialization messages
set urllog_message "$urllog_name v$urllog_version (C) 2000-2010 ccr/TNSP"
putlog "$urllog_message"

if {$http_proxy != 0} {
    putlog " (Using proxy $http_proxy_host:$http_proxy_port)"
}

if {$urllog_check != 0} {
    putlog " (Additional URL validity checks enabled)"
}

if {$urllog_verbose != 0} {
    putlog " (Verbose mode enabled)"
}

if {$urllog_makewww != 0} {
    putlog " (Creating WWW page to $urllog_webfile)"
}

if {$urllog_search != 0} {
    putlog " (Search commands enabled)"
}


#-------------------------------------------------------------------------
### Utility functions
proc urllog_log {arg} {
    global urllog_logmsg urllog_name

    if {$urllog_logmsg != 0} {
        putlog "$urllog_name: $arg"
    }
}


proc urllog_ctime { utime } {

    if {$utime == "" || $utime == "*"} {
        set utime 0
    }

    return [clock format $utime -format "%d.%m.%Y %H:%M"]
}


proc urllog_dowebfile {hand idx text} {
    global urllog_name urllog_makewww

    if {$urllog_makewww == 1} {
        urllog_log "Making webfiles..."
        urllog_make_web_file
    } else {
        urllog_log "Webfile disabled."
    }
}


proc urllog_isnumber {uarg} {
    set ufoo 1

    foreach i [split $uarg {}] {
        if {![string match \[0-9\] $i]} {set ufoo 0}
    }

    return $ufoo
}


proc urllog_msg {apublic anick achan amsg} {
    global urllog_preferredmsg

    if {$apublic == 1} {
        putserv "$urllog_preferredmsg $achan :$amsg"
    } else {
        putserv "$urllog_preferredmsg $anick :$amsg"
    }
}


proc urllog_verb_msg {anick achan amsg} {
    global urllog_verbose

    if {$urllog_verbose != 0} {
        urllog_msg 1 $anick $achan $amsg
    }
}


proc urllog_convert_ent {udata} {
    global urllog_html_ent
    # Normalize whitespace, then map HTML entities to plain text;
    # each regsub chains on the previous result in utmp.
    regsub -all " " $udata " " utmp
    regsub -all "\r" $utmp " " utmp
    regsub -all "\n" $utmp " " utmp
    regsub -all "  *" $utmp " " utmp
    regsub -all "\t" $utmp "" utmp
    return [string map -nocase $urllog_html_ent $utmp]
}


#-------------------------------------------------------------------------
proc urllog_make_web_file {} {
    global urllog_file urllog_webfile urllog_message botnick

    # Starting message
    urllog_log "Creating HTML-file for WWW..."

    # Open files
    set fd [open $urllog_webfile w]
    set fd2 [open $urllog_file r]

    # HTML headers
    puts $fd "<html><head><title>Caught URLs</title></head>"
    puts $fd "<body bgcolor=\"#FFFFFF\" text=\"#0020a0\" link=\"#0020a0\" vlink=\"#0020a0\" alink=\"#0020a0\">"
    puts $fd "<font face=\"Helvetica, Arial\">"
    puts $fd "<center><font size=\"6\">URLs caught by $botnick</center><hr>"
    puts $fd "<font size=\"3\">"

    # Process database, convert to links & info
    while {![eof $fd2]} {
        gets $fd2 foo
        if {$foo != ""} {
            regsub -all "<|>|\"" $foo "" foo
            set foo [split $foo " "]
            puts $fd "<a href=\"[lindex $foo 0]\">[lindex $foo 0]</a><br>Added on <B>[urllog_ctime [lindex $foo 1]]</B> by <B>[lindex $foo 2]</B><br><hr>"
        }
    }

    # HTML footers
    puts $fd "<center>Generated by $urllog_message<BR>"
    puts $fd "(Last updated <B>[urllog_ctime [unixtime]]</B>)</center>"
    puts $fd "</body></html>"

    # Close files
    close $fd
    close $fd2

    # OK-message
    urllog_log "HTML-file generated OK."
}


#-------------------------------------------------------------------------
proc urllog_timer {umin uhour uday umonth uyear} {
    urllog_make_web_file
}


#-------------------------------------------------------------------------
proc urllog_get_short {utime} {
    global urllog_shorturl urllog_shorturl_prefix
    set ustr "ABCDEFGHIJKLNMOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    set ulen [string length $ustr]

    set u1 [expr $utime / ($ulen * $ulen)]
    set utmp [expr $utime % ($ulen * $ulen)]
    set u2 [expr $utmp / $ulen]
    set u3 [expr $utmp % $ulen]

    return "\[ $urllog_shorturl_prefix[string index $ustr $u1][string index $ustr $u2][string index $ustr $u3] \]"
}


#-------------------------------------------------------------------------
proc urllog_chop_url {url} {
    global urllog_shorturl_orig
    if {[string length $url] > $urllog_shorturl_orig} {
        return "[string range $url 0 $urllog_shorturl_orig]..."
    } else {
        return $url
    }
}
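# Illustration of the ShortURL encoding used above (derived from
# urllog_get_short): the numeric URL ID is expressed as three "digits" in
# base 62 over the alphabet in $ustr, so three characters cover IDs
# 0..238327 (62^3 - 1). For example, ID 4000 gives 4000 / 3844 = 1 -> "B",
# (4000 % 3844) / 62 = 2 -> "C" and (4000 % 3844) % 62 = 32 -> "g",
# producing the link "<urllog_shorturl_prefix>BCg".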

#-------------------------------------------------------------------------
proc urllog_addurl {urlStr urlNick urlHost urlChan urlTitle} {
    global urlmsg_alreadyknown urllog_file urllog_shorturl

    ### Let's check if we already know the URL
    set fd [open $urllog_file a+]
    close $fd

    set fd [open $urllog_file r]
    set urlID -1
    while {![eof $fd]} {
        set qitems [split [gets $fd] " "]
        set qindex [lindex $qitems 4]
        if {$qindex != "" && $qindex > $urlID} {
            set urlID $qindex
        }
        if {[lindex $qitems 0] == $urlStr} {
            urllog_log "URL said by $urlNick ($urlStr) already known"
            if {$urllog_shorturl != 0} {
                set qstr "[urllog_get_short $urlID] "
            } else {
                set qstr ""
            }
            append qstr "([lindex $qitems 2]@[urllog_ctime [lindex $qitems 1]])"
            if {[string length $urlTitle] > 0} {
                set qstr "$urlmsg_alreadyknown - '$urlTitle' $qstr"
            } else {
                set qstr "$urlmsg_alreadyknown $qstr"
            }
            urllog_verb_msg $urlNick $urlChan $qstr
            return 0
        }
    }
    close $fd


    ### OK, the URL was not already known - thus we add it
    incr urlID
    set urlTime [unixtime]
    set fd [open $urllog_file a+]
    puts $fd "$urlStr $urlTime $urlNick ($urlHost) $urlID"
    close $fd
    urllog_log "Added URL ($urlNick@$urlChan): $urlStr"


    ### Let's say something, to confirm that everything went well.
    if {$urllog_shorturl != 0} {
        set qstr "[urllog_get_short $urlID] "
    } else {
        set qstr ""
    }
    if {[string length $urlTitle] > 0} {
        urllog_verb_msg $urlNick $urlChan "'$urlTitle' ([urllog_chop_url $urlStr]) $qstr"
    } else {
        urllog_verb_msg $urlNick $urlChan "[urllog_chop_url $urlStr] $qstr"
    }

    return 1
}


#-------------------------------------------------------------------------
proc urllog_http_handler {utoken utotal ucurrent} {
    upvar #0 $utoken state

    # Stop after around 3000 received bytes, typically we would assume
    # that <head> section is contained in this amount of data.
    if {$state(currentsize) >= 3000} {
        set state(status) "ok"
    }
}

#-------------------------------------------------------------------------
proc urllog_checkurl {urlStr urlNick urlHost urlChan} {
    global botnick urllog_html urllog_tlds urllog_check urllog_file
    global urlmsg_nosuchhost urlmsg_ioerror urlmsg_timeout urlmsg_errorgettingdoc
    global urllog_httprep urllog_shorturl_prefix urllog_shorturl urllog_encoding

    ### Some status
    urllog_log "$urlStr ($urlNick@$urlChan)"

    ### Try to determine the URL protocol component (if it is missing)
    set u_checktld 1

    if {[string match "*www.*" $urlStr] && ![string match "http://*" $urlStr] && ![string match "https://*" $urlStr]} {
        set urlStr "http://$urlStr"
    }

    if {[string match "*ftp.*" $urlStr] && ![string match "ftp://*" $urlStr]} {
        set urlStr "ftp://$urlStr"
    }

    if {[regexp "(ftp|http|https)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match u_prefix ni1 ni2 ni3 ni4]} {
        # Check if the IP is on local network
        if {($ni1 == 127) || ($ni1 == 10) || ($ni1 == 192 && $ni2 == 168) || ($ni1 == 0)} {
            urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
            return 0
        }

        # Skip TLD check for URLs with IP address
        set u_checktld 0
    }

    if {$urllog_shorturl != 0 && [string match "*$urllog_shorturl_prefix*" $urlStr]} {
        urllog_log "Ignoring ShortURL."
        return 0
    }

    ### Check the PORT (if the ":" is there)
    set u_record [split $urlStr "/"]
    set u_hostname [lindex $u_record 2]
    set u_port [lindex [split $u_hostname ":"] end]

    if {![urllog_isnumber $u_port] && $u_port != "" && $u_port != $u_hostname} {
        urllog_log "Broken URL from $urlNick: ($urlStr) illegal port $u_port"
        return 0
    }

    # Default to port 80 (HTTP)
    if {![urllog_isnumber $u_port]} {
        set u_port 80
    }

    ### Is it a http or ftp url? (FIX ME!)
    if {[string range $urlStr 0 3] != "http" && [string range $urlStr 0 2] != "ftp"} {
        urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED TYPE (not HTTP or FTP)"
        return 0
    }

    ### Check the Top Level Domain (TLD) validity
    if {$u_checktld != 0} {
        set u_sane [lindex [split $u_hostname "."] end]
        set u_tld [lindex [split $u_sane ":"] 0]
        set u_found 0

        if {[string length $u_tld] == 2} {
            # Assume all 2-letter domains to be valid :)
            set u_found 1
        } else {
            # Check our list of known TLDs
            foreach itld $urllog_tlds {
                if {[string match $itld $u_tld]} {
                    set u_found 1
                }
            }
        }

        if {$u_found == 0} {
            urllog_log "Broken URL from $urlNick: ($urlStr) illegal TLD: $u_tld."
            return 0
        }
    }

    set urlStr [string map $urllog_httprep $urlStr]

    ### Do we perform additional optional checks?
    if {$urllog_check != 0 && [string range $urlStr 0 6] == "http://"} {
        # Does the document pointed by the URL exist?
        if {[catch {set utoken [::http::geturl $urlStr -progress urllog_http_handler -blocksize 1024 -timeout 4000]} uerrmsg]} {
            urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
            urllog_log "HTTP request failed: $uerrmsg"
            return 0
        }

        upvar #0 $utoken ustate

        if {$ustate(status) == "timeout"} {
            urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
            urllog_log "HTTP request timed out ($urlStr)"
        }

        if {$ustate(status) == "error"} {
            urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])"
            urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
        }

        # FIXME! Handle redirects!
        if {$ustate(status) == "ok"} {
            if {[::http::ncode $utoken] >= 200 && [::http::ncode $utoken] <= 309} {
                set udata $ustate(body)
                set umatches [regexp -nocase -inline -- "<meta.\*\?content=\".\*\?charset=(\[^\"\]*)\"/>" $udata]
                set uconvert 0
                if {[llength $umatches] > 0} {
                    set uencoding [lindex $umatches 1]
                    if {[string length $uencoding] > 3} {
                        set uconvert 1
                    }
                }

                set umatches [regexp -nocase -inline -- "<title>(.\*\?)</title>" $udata]
                if {[llength $umatches] > 0} {
                    set urlTitle [lindex $umatches 1]
                    if {$uconvert != 0} {
                        set urlTitle [encoding convertfrom $uencoding $urlTitle]
                    }
                    set urlTitle [urllog_convert_ent $urlTitle]
                    regsub -all "(^ *| *$)" $urlTitle "" urlTitle
                } else {
                    set urlTitle ""
                }
                urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle
            } else {
                urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::code $utoken])"
                urllog_log "[::http::code $utoken] - $urlStr"
            }
        }

        ::http::cleanup $utoken
    } else {
        # No optional checks, just add the URL
        urllog_addurl $urlStr $urlNick $urlHost $urlChan ""
    }
}


#-------------------------------------------------------------------------
proc urllog_checkmsg {nick uhost hand chan text} {
    ### Check the nick
    if {$nick == "*"} {
        urllog_log "urllog_checkmsg: nick was wc, this should not happen."
        return 0
    }

    ### Do the URL checking
    foreach istr [split $text " "] {
        if {[string match "*http://*" $istr] || [string match "*ftp://*" $istr] ||
            [string match "*www.*" $istr] || [string match "*ftp.*" $istr] ||
            [regexp "(ftp|http|https)://\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}" $istr imatch]} {
            urllog_checkurl $istr $nick $uhost $chan
        }
    }

    return 0
}


#-------------------------------------------------------------------------
### Find from database according to patterns, up to imax results.

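# Note on the search pattern syntax handled below (urllog_urlfind and
# urllog_find): each word of a query becomes a rule. "+word" requires the
# URL to contain "word", "-word" requires the URL not to contain it, and
# "%word" matches against the nick that added the URL; a bare word acts
# like "+word". Matching is case-insensitive. For example,
# "!urlfind tnsp -ftp %ccr" lists URLs containing "tnsp" but not "ftp",
# added by nicks matching "*ccr*".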
proc urllog_urlfind {ipatterns imax} {
    global urllog_file

    ### Search the database for pattern
    ### Clear the count, open the URL logfile
    set iresults {}
    set nresults 0
    set fd [open $urllog_file r]

    ### Start searching...
    while {![eof $fd]} {

        # Get one URL for inspection
        gets $fd foo
        set irecord [split [string tolower $foo] " "]
        set iurl [lindex $irecord 0]
        set iname [lindex $irecord 2]

        # Match with all given patterns and rules
        set imatched 1
        foreach ipattern $ipatterns {
            set foob [split [string tolower $ipattern] " "]
            set ftoken [lindex $foob 0]
            set fparam [lindex $foob 1]
            set fmatch [string match $fparam $iurl]

            if {$ftoken == "+" && $fmatch == 0} { set imatched 0 }

            if {$ftoken == "-" && $fmatch == 1} { set imatched 0 }

            if {$ftoken == "%" && [string match $fparam $iname] == 0} { set imatched 0 }
        }

        # If the all patterns matched, add to the list...
        if {$imatched == 1 && $foo != ""} {
            incr nresults
            lappend iresults $foo
        }
    }

    # Close file
    close $fd

    # Take only last imax results
    return [lrange $iresults [expr $nresults-$imax] $nresults]
}


#-------------------------------------------------------------------------
### Parse arguments, find and show the results
proc urllog_find {unick uhand uchan utext upublic} {
    global botnick urllog_name urllog_version urllog_shorturl
    global urllog_showmax_pub urllog_showmax_priv urlmsg_nomatch

    ### Parse the given command
    urllog_log "$unick/$uhand searched URL: $utext"

    set footokens [split $utext " "]
    foreach ftoken $footokens {
        set foomark [string range $ftoken 0 0]
        set foopat [string range $ftoken 1 end]

        if {$foomark == "-" || $foomark == "+" || $foomark == "%" || $foomark == "@"} {
            lappend ipatlist "$foomark *$foopat*"
        } else {
            lappend ipatlist "+ *$ftoken*"
        }
    }

    ### Get the matches from database
    if {$upublic == 0} {
        set iresults [urllog_urlfind $ipatlist $urllog_showmax_priv]
    } else {
        set iresults [urllog_urlfind $ipatlist $urllog_showmax_pub]
    }

    ### Show the results
    if {$iresults != ""} {
        set j 0
        foreach i $iresults {
            incr j
            set foo [split $i " "]
            set shortURL [lindex $foo 0]
            set shortID [lindex $foo 4]

            if {$urllog_shorturl != 0 && $shortID != ""} {
                set shortURL "$shortURL [urllog_get_short $shortID]"
            }

            urllog_msg $upublic $unick $uchan "#$j: $shortURL ([lindex $foo 2]@[urllog_ctime [lindex $foo 1]])"
        }

    } else {
        # If no URLs were found
        urllog_msg $upublic $unick $uchan $urlmsg_nomatch
    }

    return 0
}


#-------------------------------------------------------------------------
### Finding binded functions
proc urllog_pub_urlfind {unick uhost uhand uchan utext} {

    urllog_find $unick $uhand $uchan $utext 1

    return 0
}


proc urllog_msg_urlfind {unick uhost uhand utext} {

    urllog_find $unick $uhand "" $utext 0

    return 0
}

#-------------------------------------------------------------------------
proc urllog_checkmsg2 {unick uhost uhand utext} {

    urllog_checkurl $utext $unick $uhost "#CHANNEL"

    return 0
}



# end of script
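A minimal usage sketch (the script path and the example record values below are
placeholders, not part of the commit): the script is loaded from the Eggdrop
configuration like any other Tcl script, and each logged URL is appended to the
file named in $urllog_file as a single line of the form
"<url> <unixtime> <nick> (<host>) <id>".

    # In eggdrop.conf, assuming the script was saved as scripts/urllog.tcl:
    source scripts/urllog.tcl

    # A data.urllog record then looks roughly like this (illustrative values):
    # http://tnsp.org/egg-tcls/ 1285063969 ccr (ccr@example.host) 0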