Mercurial > hg > egg-tcls
view urllog.tcl @ 0:1c4e2814cd41
Initial import.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Tue, 21 Sep 2010 13:12:49 +0300 |
parents | |
children | 8003090caa35 |
line wrap: on
line source
##########################################################################
#
# URLLog v1.99.12 by ccr/TNSP <ccr@tnsp.org>
# (C) Copyright 2000-2010 Tecnic Software productions (TNSP)
#
##########################################################################
#
# NOTICE! If you are upgrading to v1.90+ from any older version, you
# might want to run a conversion script against your URL-database file.
#
# It is NOT strictly necessary, but recommended especially if you
# utilize the "shorturl" functionality. The conversion script is
# available at < http://tnsp.org/egg-tcls/ >
#
##########################################################################

###
### HTTP options
###

# Set to 1 if you want HTTP requests routed through a proxy, 0 otherwise.
set http_proxy 0

# Proxy host and port number (only used if http_proxy is enabled above).
set http_proxy_host ""
set http_proxy_port 8080

###
### General options
###

# Filename of the flat-file URL database (one space-separated record per line).
set urllog_file "data.urllog"

# 1 = Verbose: announce on channel when an URL is OK, bad, already known, etc.
# 0 = Quiet  : only speak when explicitly asked (e.g. with !urlfind).
set urllog_verbose 1

# 1 = Write operational info to the bot's logfile.
# 0 = Don't.
set urllog_logmsg 1

# 1 = Check URLs for validity and existence (HTTP fetch) before adding.
# 0 = No checks; add _anything_ that looks like an URL to the database.
set urllog_check 1

###
### WWW page creation options
###

# 1 = Periodically generate a static HTML page of logged URLs, 0 = Don't.
set urllog_makewww 0
# TIP: If you don't want the bot to create the HTML-file, you can
# use a simple Perl/Ruby/Python/PHP/whatnot-scripted page to do that.
# Filename AND FULL PATH of the html-file where the www-format log goes
# (remember to set the permissions right after the file has been created).
set urllog_webfile "/home/niinuska/public_html/urllog.html"

###
### Search related settings
###

# 0 = No search commands available.
# 1 = Search enabled (!urlfind on channel, "urlfind" via private message).
set urllog_search 1

# How many URLs the public !urlfind command may show (maximum limit).
set urllog_showmax_pub 3

# Default result limit for private search.
set urllog_showmax_priv 6

###
### ShortURL-settings
###

# 1 = Generate and show ShortURLs for logged entries.
# 0 = Don't.
set urllog_shorturl 1

# Maximum length of the original URL shown before it gets truncated with "...".
set urllog_shorturl_orig 30

# URL prefix of the PHP/CGI-script that redirects ShortURLs.
set urllog_shorturl_prefix "http://tnsp.org/u/"

###
### Message-texts
###
# NOTE: the reply strings below are user-visible runtime text (in Finnish)
# and are kept verbatim; changing them only changes what the bot says.

# No such host was found.
set urlmsg_nosuchhost "ei tommosta oo!"

# Could not connect to host (I/O errors etc).
set urlmsg_ioerror "kraak, virhe yhdynnässä."

# HTTP timeout.
set urlmsg_timeout "ei jaksa ootella"

# No such document was found.
set urlmsg_errorgettingdoc "siitosvirhe"

# URL was already known (was in database).
set urlmsg_alreadyknown "wanha!"
#set urlmsg_alreadyknown "Empiiristen havaintojen perusteella ja tällä sovellutusalueella esiintyneisiin aikaisempiin kontekstuaalisiin ilmaisuihin viitaten uskallan todeta, että sovellukseen ilmoittamasi tietoverkko-osoite oli kronologisti ajatellen varsin postpresentuaalisesti sopimaton ja ennestään hyvin tunnettu."

# No match was found when searching with !urlfind or other command.
set urlmsg_nomatch "Ei osumia."

###
### Things that you usually don't need to touch ...
###

# IRC command used to send messages.
# (Valid alternatives are "PRIVMSG" and "NOTICE")
set urllog_preferredmsg "PRIVMSG"

# The valid known Top Level Domains (TLDs), excluding country-code TLDs
# (two-letter TLDs are accepted unconditionally by the checker).
set urllog_tlds "org,com,net,mil,gov,biz,edu,coop,aero,info,museum,name,pro,int"

##########################################################################
# No need to look below this line
##########################################################################
#-------------------------------------------------------------------------
set urllog_name "URLLog"
set urllog_version "1.99.12"

# Convert the comma-separated TLD string into a Tcl list for matching.
set urllog_tlds [split $urllog_tlds ","]

# URL-escape replacement pairs (char -> %XX) applied via [string map].
set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"]

# HTML entity -> character replacement pairs for page titles.
# NOTE(review): this table appears mangled by charset-lossy web rendering
# (mojibake, and what looks like a raw quote from an original &quot;
# entity that would terminate the Tcl string early); restore it from the
# pristine source file before use.
set urllog_html_ent [split "‪||‬||‎||å|å|Å|Å|é|é|:|:|ä|ä|ö|ö|ä|ä|ö|ö| | |-|-|”|\"|“|\"|»|>>|"|\"|ä|ä|ö|ö|Ä|Ä|Ö|Ö|&|&|<|<|>|>|ä|ä|ö|ö|Ä|Ä" "|"]

### HTTP module initialization
package require http
::http::config -useragent "$urllog_name/$urllog_version"
if {$http_proxy != 0} {
	::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
}

### Binding initializations
if {$urllog_search != 0} {
	bind pub - !urlfind urllog_pub_urlfind
	bind msg - urlfind urllog_msg_urlfind
}

bind pubm - *.* urllog_checkmsg
bind topc - *.* urllog_checkmsg
bind msg - paska urllog_checkmsg2

if {$urllog_makewww != 0} {
	bind dcc m dowebfile urllog_dowebfile
	# NOTE(review): eggdrop time-bind masks normally use "*" wildcards;
	# the "%" characters here may be a web-rendering artifact — confirm
	# against the pristine source.
	bind time - "*0 % % % %" urllog_timer
}

### Initialization messages
set urllog_message "$urllog_name v$urllog_version (C) 2000-2010 ccr/TNSP"
putlog "$urllog_message"

if {$http_proxy != 0} {
	putlog " (Using proxy $http_proxy_host:$http_proxy_port)"
}

if {$urllog_check != 0} {
	putlog " (Additional URL validity checks enabled)"
}

if {$urllog_verbose != 0} {
	putlog " (Verbose mode enabled)"
}

if {$urllog_makewww != 0} {
	putlog " (Creating WWW page to $urllog_webfile)"
}

if {$urllog_search != 0} {
	putlog " (Search commands enabled)"
}
#-------------------------------------------------------------------------
### Utility functions

# Write a message to the bot's logfile, prefixed with the script name,
# if logging has been enabled via urllog_logmsg.
proc urllog_log {arg} {
	global urllog_logmsg urllog_name
	if {$urllog_logmsg != 0} {
		putlog "$urllog_name: $arg"
	}
}

# Format a UNIX timestamp as "dd.mm.yyyy HH:MM" for display.
# Empty or wildcard ("*") timestamps are treated as epoch 0.
proc urllog_ctime { utime } {
	if {$utime == "" || $utime == "*"} {
		set utime 0
	}
	return [clock format $utime -format "%d.%m.%Y %H:%M"]
}

# DCC command handler: regenerate the WWW page on demand (".dowebfile").
proc urllog_dowebfile {hand idx text} {
	global urllog_name urllog_makewww
	if {$urllog_makewww == 1} {
		urllog_log "Making webfiles..."
		urllog_make_web_file
	} else {
		urllog_log "Webfile disabled."
	}
}

# Return 1 if every character of uarg is a decimal digit, else 0.
# NOTE: returns 1 for the empty string (vacuous loop); callers rely on
# checking for "" separately.
proc urllog_isnumber {uarg} {
	set ufoo 1
	foreach i [split $uarg {}] {
		if {![string match \[0-9\] $i]} {set ufoo 0}
	}
	return $ufoo
}

# Send amsg via the configured IRC command (PRIVMSG/NOTICE):
# to channel achan when apublic == 1, otherwise privately to anick.
proc urllog_msg {apublic anick achan amsg} {
	global urllog_preferredmsg
	if {$apublic == 1} {
		putserv "$urllog_preferredmsg $achan :$amsg"
	} else {
		putserv "$urllog_preferredmsg $anick :$amsg"
	}
}

# Like urllog_msg, but only speaks (publicly) when verbose mode is on.
proc urllog_verb_msg {anick achan amsg} {
	global urllog_verbose
	if {$urllog_verbose != 0} {
		urllog_msg 1 $anick $achan $amsg
	}
}

# Normalize whitespace in udata and translate HTML entities using the
# urllog_html_ent map; used to clean up fetched page titles.
proc urllog_convert_ent {udata} {
	global urllog_html_ent
	regsub -all " " $udata " " utmp
	# BUGFIX: chain each substitution from $utmp. The original re-read
	# $udata here, silently discarding the previous regsub's result.
	regsub -all "\r" $utmp " " utmp
	regsub -all "\n" $utmp " " utmp
	# BUGFIX: collapse runs of spaces to one. The original pattern " *"
	# matched the empty string at every position, which with -all would
	# insert a space between every character.
	regsub -all "  *" $utmp " " utmp
	regsub -all "\t" $utmp "" utmp
	return [string map -nocase $urllog_html_ent $utmp]
}

#-------------------------------------------------------------------------
# Generate the static HTML page listing all logged URLs.
# Reads the flat-file database and writes urllog_webfile from scratch.
proc urllog_make_web_file {} {
	global urllog_file urllog_webfile urllog_message botnick

	# Starting message
	urllog_log "Creating HTML-file for WWW..."

	# Open files (webfile truncated, database read-only)
	set fd [open $urllog_webfile w]
	set fd2 [open $urllog_file r]

	# HTML headers
	puts $fd "<html><head><title>Caught URLs</title></head>"
	puts $fd "<body bgcolor=\"#FFFFFF\" text=\"#0020a0\" link=\"#0020a0\" vlink=\"#0020a0\" alink=\"#0020a0\">"
	puts $fd "<font face=\"Helvetica, Arial\">"
	puts $fd "<center><font size=\"6\">URLs caught by $botnick</center><hr>"
	puts $fd "<font size=\"3\">"

	# Process database, convert each record (url time nick (host) id)
	# into a link plus timestamp/nick info.
	while {![eof $fd2]} {
		gets $fd2 foo
		if {$foo != ""} {
			# Strip characters that would break the generated markup
			regsub -all "<|>|\"" $foo "" foo
			set foo [split $foo " "]
			puts $fd "<a href=\"[lindex $foo 0]\">[lindex $foo 0]</a><br>Added on <B>[urllog_ctime [lindex $foo 1]]</B> by <B>[lindex $foo 2]</B><br><hr>"
		}
	}

	# HTML footers
	puts $fd "<center>Generated by $urllog_message<BR>"
	puts $fd "(Last updated <B>[urllog_ctime [unixtime]]</B>)</center>"
	puts $fd "</body></html>"

	# Close files
	close $fd
	close $fd2

	# OK-message
	urllog_log "HTML-file generated OK."
}

#-------------------------------------------------------------------------
# Periodic timer callback (eggdrop time bind): regenerate the WWW page.
proc urllog_timer {umin uhour uday umonth uyear} {
	urllog_make_web_file
}

#-------------------------------------------------------------------------
# Encode a numeric URL id into a 3-character base-62 ShortURL, bracketed
# for display. NOTE(review): the alphabet below has "LNM" transposed
# (not "LMN"); this must stay as-is to match the external redirect
# script's decoder — do not "fix" the ordering.
proc urllog_get_short {utime} {
	global urllog_shorturl urllog_shorturl_prefix
	set ustr "ABCDEFGHIJKLNMOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
	set ulen [string length $ustr]
	set u1 [expr $utime / ($ulen * $ulen)]
	set utmp [expr $utime % ($ulen * $ulen)]
	set u2 [expr $utmp / $ulen]
	set u3 [expr $utmp % $ulen]
	return "\[ $urllog_shorturl_prefix[string index $ustr $u1][string index $ustr $u2][string index $ustr $u3] \]"
}

#-------------------------------------------------------------------------
# Truncate url to urllog_shorturl_orig characters (plus "...") for display.
proc urllog_chop_url {url} {
	global urllog_shorturl_orig
	if {[string length $url] > $urllog_shorturl_orig} {
		return "[string range $url 0 $urllog_shorturl_orig]..."
	} else {
		return $url
	}
}

#-------------------------------------------------------------------------
# Add an URL to the database unless it is already known.
# Record format: "url utime nick (host) id". Returns 1 if added, 0 if
# the URL was already in the database (a notice is spoken either way,
# subject to verbose mode).
proc urllog_addurl {urlStr urlNick urlHost urlChan urlTitle} {
	global urlmsg_alreadyknown urllog_file urllog_shorturl

	### Let's check if we already know the URL.
	# Open in a+ mode first to create the file if it does not exist yet.
	set fd [open $urllog_file a+]
	close $fd
	set fd [open $urllog_file r]
	set urlID -1
	while {![eof $fd]} {
		set qitems [split [gets $fd] " "]
		# Track the highest id seen so a new entry gets id+1.
		set qindex [lindex $qitems 4]
		if {$qindex != "" && $qindex > $urlID} {
			set urlID $qindex
		}
		if {[lindex $qitems 0] == $urlStr} {
			urllog_log "URL said by $urlNick ($urlStr) already known"
			if {$urllog_shorturl != 0} {
				set qstr "[urllog_get_short $urlID] "
			} else {
				set qstr ""
			}
			append qstr "([lindex $qitems 2]@[urllog_ctime [lindex $qitems 1]])"
			if {[string length $urlTitle] > 0} {
				set qstr "$urlmsg_alreadyknown - '$urlTitle' $qstr"
			} else {
				set qstr "$urlmsg_alreadyknown $qstr"
			}
			urllog_verb_msg $urlNick $urlChan $qstr
			return 0
		}
	}
	close $fd

	### OK, the URL was not already known - thus we add it
	incr urlID
	set urlTime [unixtime]
	set fd [open $urllog_file a+]
	puts $fd "$urlStr $urlTime $urlNick ($urlHost) $urlID"
	close $fd
	urllog_log "Added URL ($urlNick@$urlChan): $urlStr"

	### Let's say something, to confirm that everything went well.
	if {$urllog_shorturl != 0} {
		set qstr "[urllog_get_short $urlID] "
	} else {
		set qstr ""
	}
	if {[string length $urlTitle] > 0} {
		urllog_verb_msg $urlNick $urlChan "'$urlTitle' ([urllog_chop_url $urlStr]) $qstr"
	} else {
		urllog_verb_msg $urlNick $urlChan "[urllog_chop_url $urlStr] $qstr"
	}
	return 1
}

#-------------------------------------------------------------------------
# ::http::geturl -progress callback: abort the transfer early once
# roughly 3000 bytes have arrived, on the assumption that the <head>
# section (and thus the <title>) fits in that much data.
proc urllog_http_handler {utoken utotal ucurrent} {
	upvar #0 $utoken state
	if {$state(currentsize) >= 3000} {
		set state(status) "ok"
	}
}

#-------------------------------------------------------------------------
# Validate an URL candidate, optionally fetch it to verify existence and
# extract its <title>, then hand it to urllog_addurl. Returns 0 when the
# URL is rejected.
proc urllog_checkurl {urlStr urlNick urlHost urlChan} {
	global botnick urllog_html urllog_tlds urllog_check urllog_file
	global urlmsg_nosuchhost urlmsg_ioerror urlmsg_timeout urlmsg_errorgettingdoc
	global urllog_httprep urllog_shorturl_prefix urllog_shorturl urllog_encoding

	### Some status
	urllog_log "$urlStr ($urlNick@$urlChan)"

	### Try to determine the URL protocol component (if it is missing)
	set u_checktld 1
	if {[string match "*www.*" $urlStr] && ![string match "http://*" $urlStr] && ![string match "https://*" $urlStr]} {
		set urlStr "http://$urlStr"
	}
	if {[string match "*ftp.*" $urlStr] && ![string match "ftp://*" $urlStr]} {
		set urlStr "ftp://$urlStr"
	}

	if {[regexp "(ftp|http|https)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match u_prefix ni1 ni2 ni3 ni4]} {
		# Check if the IP is on a local/invalid network
		if {($ni1 == 127) || ($ni1 == 10) || ($ni1 == 192 && $ni2 == 168) || ($ni1 == 0)} {
			urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
			return 0
		}
		# Skip TLD check for URLs with a literal IP address
		set u_checktld 0
	}

	if {$urllog_shorturl != 0 && [string match "*$urllog_shorturl_prefix*" $urlStr]} {
		urllog_log "Ignoring ShortURL."
		return 0
	}

	### Check the PORT (if the ":" is there)
	set u_record [split $urlStr "/"]
	set u_hostname [lindex $u_record 2]
	set u_port [lindex [split $u_hostname ":"] end]
	if {![urllog_isnumber $u_port] && $u_port != "" && $u_port != $u_hostname} {
		urllog_log "Broken URL from $urlNick: ($urlStr) illegal port $u_port"
		return 0
	}

	# Default to port 80 (HTTP)
	if {![urllog_isnumber $u_port]} {
		set u_port 80
	}

	### Is it a http or ftp url? (FIX ME!)
	if {[string range $urlStr 0 3] != "http" && [string range $urlStr 0 2] != "ftp"} {
		urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED TYPE (not HTTP or FTP)"
		return 0
	}

	### Check the Top Level Domain (TLD) validity
	if {$u_checktld != 0} {
		set u_sane [lindex [split $u_hostname "."] end]
		set u_tld [lindex [split $u_sane ":"] 0]
		set u_found 0
		if {[string length $u_tld] == 2} {
			# Assume all 2-letter domains to be valid :)
			set u_found 1
		} else {
			# Check our list of known TLDs
			foreach itld $urllog_tlds {
				if {[string match $itld $u_tld]} {
					set u_found 1
				}
			}
		}
		if {$u_found == 0} {
			urllog_log "Broken URL from $urlNick: ($urlStr) illegal TLD: $u_tld."
			return 0
		}
	}

	# Escape characters that would break the flat-file record format
	set urlStr [string map $urllog_httprep $urlStr]

	### Do we perform additional optional checks?
	if {$urllog_check != 0 && [string range $urlStr 0 6] == "http://"} {
		# Does the document pointed to by the URL exist?
		if {[catch {set utoken [::http::geturl $urlStr -progress urllog_http_handler -blocksize 1024 -timeout 4000]} uerrmsg]} {
			urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
			urllog_log "HTTP request failed: $uerrmsg"
			return 0
		}
		upvar #0 $utoken ustate
		if {$ustate(status) == "timeout"} {
			urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
			urllog_log "HTTP request timed out ($urlStr)"
		}
		if {$ustate(status) == "error"} {
			urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])"
			urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
		}
		# FIXME! Handle redirects!
		if {$ustate(status) == "ok"} {
			if {[::http::ncode $utoken] >= 200 && [::http::ncode $utoken] <= 309} {
				set udata $ustate(body)
				# Try to pick the declared charset from a <meta> tag
				set umatches [regexp -nocase -inline -- "<meta.\*\?content=\".\*\?charset=(\[^\"\]*)\"/>" $udata]
				set uconvert 0
				if {[llength $umatches] > 0} {
					set uencoding [lindex $umatches 1]
					if {[string length $uencoding] > 3} {
						set uconvert 1
					}
				}
				# Extract and clean up the page <title>, if any
				set umatches [regexp -nocase -inline -- "<title>(.\*\?)</title>" $udata]
				if {[llength $umatches] > 0} {
					set urlTitle [lindex $umatches 1]
					if {$uconvert != 0} {
						set urlTitle [encoding convertfrom $uencoding $urlTitle]
					}
					set urlTitle [urllog_convert_ent $urlTitle]
					regsub -all "(^ *| *$)" $urlTitle "" urlTitle
				} else {
					set urlTitle ""
				}
				urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle
			} else {
				urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::code $utoken])"
				urllog_log "[::http::code $utoken] - $urlStr"
			}
		}
		::http::cleanup $utoken
	} else {
		# No optional checks, just add the URL with no title
		urllog_addurl $urlStr $urlNick $urlHost $urlChan ""
	}
}

#-------------------------------------------------------------------------
# pubm/topc bind handler: scan each word of a channel message or topic
# for something URL-shaped and feed candidates to urllog_checkurl.
proc urllog_checkmsg {nick uhost hand chan text} {
	### Check the nick
	if {$nick == "*"} {
		urllog_log "urllog_checkmsg: nick was wc, this should not happen."
		return 0
	}

	### Do the URL checking
	foreach istr [split $text " "] {
		if {[string match "*http://*" $istr] || [string match "*ftp://*" $istr] || [string match "*www.*" $istr] || [string match "*ftp.*" $istr] || [regexp "(ftp|http|https)://\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}\\.\[0-9\]\{1,3\}" $istr imatch]} {
			urllog_checkurl $istr $nick $uhost $chan
		}
	}
	return 0
}

#-------------------------------------------------------------------------
# Search the database for records matching all given patterns; return up
# to imax of the most recent matching raw records. Each pattern is a
# two-element "token glob" pair: "+" must match the URL, "-" must NOT
# match the URL, "%" must match the nick.
proc urllog_urlfind {ipatterns imax} {
	global urllog_file

	### Clear the count, open the URL logfile
	set iresults {}
	set nresults 0
	set fd [open $urllog_file r]

	### Start searching...
	while {![eof $fd]} {
		# Get one record for inspection
		gets $fd foo
		set irecord [split [string tolower $foo] " "]
		set iurl [lindex $irecord 0]
		set iname [lindex $irecord 2]

		# Match with all given patterns and rules
		set imatched 1
		foreach ipattern $ipatterns {
			set foob [split [string tolower $ipattern] " "]
			set ftoken [lindex $foob 0]
			set fparam [lindex $foob 1]
			set fmatch [string match $fparam $iurl]
			if {$ftoken == "+" && $fmatch == 0} { set imatched 0 }
			if {$ftoken == "-" && $fmatch == 1} { set imatched 0 }
			if {$ftoken == "%" && [string match $fparam $iname] == 0} { set imatched 0 }
		}

		# If all patterns matched, add the record to the result list
		if {$imatched == 1 && $foo != ""} {
			incr nresults
			lappend iresults $foo
		}
	}

	# Close file
	close $fd

	# Take only the last imax results
	return [lrange $iresults [expr $nresults-$imax] $nresults]
}

#-------------------------------------------------------------------------
# Parse a search command line into patterns, query the database and speak
# the results (publicly when upublic == 1, else privately).
proc urllog_find {unick uhand uchan utext upublic} {
	global botnick urllog_name urllog_version urllog_shorturl
	global urllog_showmax_pub urllog_showmax_priv urlmsg_nomatch

	### Parse the given command
	urllog_log "$unick/$uhand searched URL: $utext"
	set footokens [split $utext " "]
	foreach ftoken $footokens {
		set foomark [string range $ftoken 0 0]
		set foopat [string range $ftoken 1 end]
		if {$foomark == "-" || $foomark == "+" || $foomark == "%" || $foomark == "@"} {
			# NOTE(review): "@" patterns are passed through but have no
			# matching rule in urllog_urlfind, so they are effectively
			# ignored — presumably a leftover; confirm intent.
			lappend ipatlist "$foomark *$foopat*"
		} else {
			lappend ipatlist "+ *$ftoken*"
		}
	}

	### Get the matches from database
	if {$upublic == 0} {
		set iresults [urllog_urlfind $ipatlist $urllog_showmax_priv]
	} else {
		set iresults [urllog_urlfind $ipatlist $urllog_showmax_pub]
	}

	### Show the results
	if {$iresults != ""} {
		set j 0
		foreach i $iresults {
			incr j
			set foo [split $i " "]
			set shortURL [lindex $foo 0]
			set shortID [lindex $foo 4]
			if {$urllog_shorturl != 0 && $shortID != ""} {
				set shortURL "$shortURL [urllog_get_short $shortID]"
			}
			urllog_msg $upublic $unick $uchan "#$j: $shortURL ([lindex $foo 2]@[urllog_ctime [lindex $foo 1]])"
		}
	} else {
		# If no URLs were found
		urllog_msg $upublic $unick $uchan $urlmsg_nomatch
	}
	return 0
}

#-------------------------------------------------------------------------
### Search bind handlers

# Public channel search (!urlfind).
proc urllog_pub_urlfind {unick uhost uhand uchan utext} {
	urllog_find $unick $uhand $uchan $utext 1
	return 0
}

# Private message search ("urlfind").
proc urllog_msg_urlfind {unick uhost uhand utext} {
	urllog_find $unick $uhand "" $utext 0
	return 0
}

#-------------------------------------------------------------------------
# Private-message debug hook ("paska"): treat the whole message as an URL
# candidate against a placeholder channel name.
proc urllog_checkmsg2 {unick uhost uhand utext} {
	urllog_checkurl $utext $unick $uhost "#CHANNEL"
	return 0
}

# end of script