Mercurial > hg > egg-tcls
view urllog.tcl @ 523:89aaf279c12b
feeds: Bump copyrights.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Tue, 07 Jul 2020 12:04:12 +0300 |
parents | fdd1f0b83685 |
children | 137d61a0b5b7 |
line wrap: on
line source
########################################################################## # # URLLog v2.4.3 by Matti 'ccr' Hamalainen <ccr@tnsp.org> # (C) Copyright 2000-2020 Tecnic Software productions (TNSP) # # This script is freely distributable under GNU GPL (version 2) license. # ########################################################################## # # NOTICE! NOTICE! This script REQUIRES tcl-tls 1.7.13+ if you wish to # support SSL/TLS https for URL checking. And you probably do. # # URL-logger script for EggDrop IRC robot, utilizing SQLite3 database # This script requires SQLite TCL extension. Under Debian, you need: # tcl8.5 libsqlite3-tcl (and eggdrop eggdrop-data, of course) # # If you are doing a fresh install, you will need to create the initial # database with the required table schemas. You can do that by running # "create_urllog_db.tcl". You also need to set up the configuration in # "config.urllog" file. # ########################################################################## ### The configuration should be in config.urllog in same directory ### as this script. Or change the line below to point where ever ### you wish. See "config.urllog.example" for an example config file. 
# Load user configuration; must happen first since everything below reads
# the urllog_* / http_* variables it defines.
source [file dirname [info script]]/config.urllog

### Required utillib.tcl (provides utl_escape, utl_ctime, utl_match_delim_list,
### utl_convert_html_ent used by the procs below)
source [file dirname [info script]]/utillib.tcl

##########################################################################
# No need to look below this line
##########################################################################
set urllog_name "URLLog"
set urllog_version "2.4.3"

# Convert the comma-separated TLD config string into a Tcl list,
# and build the char->%XX replacement map (string map pairs) used to
# escape @ { } [ ] in logged URLs.
set urllog_tld_list [split $urllog_tld_list ","]
set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"]

### Require packages
package require sqlite3
package require http

### Binding initializations
bind pub - !urlfind urllog_pub_urlfind
bind msg - !urlfind urllog_msg_urlfind
bind pubm - *.* urllog_check_line
bind topc - *.* urllog_check_line

### Initialization messages
set urllog_message "$urllog_name v$urllog_version (C) 2000-2020 ccr/TNSP"
putlog "$urllog_message"

### Miscellaneous init messages
if {$urllog_extra_checks != 0} {
  putlog " (Additional URL validity checks enabled)"
}

if {$urllog_check_tld != 0} {
  putlog " (Check TLD)"
}

if {$urllog_verbose != 0} {
  putlog " (Verbose mode enabled)"
}

### HTTP module initialization
# Use the configured user-agent if set, otherwise identify as this script.
if {[info exists http_user_agent] && $http_user_agent != ""} {
  ::http::config -useragent $http_user_agent
} else {
  ::http::config -useragent "$urllog_name/$urllog_version"
}

if {[info exists http_use_proxy] && $http_use_proxy != 0} {
  ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
  putlog " (Using proxy $http_proxy_host:$http_proxy_port)"
}

# Optional HTTPS support: registers a tls socket for the https scheme.
# Requires tcl-tls 1.7.13+ (see header notice); -autoservername enables SNI.
if {[info exists http_tls_support] && $http_tls_support != 0} {
  package require tls
  ::http::register https 443 [list ::tls::socket -request true -require true -ssl2 false -ssl3 false -tls1 true -tls1.1 true -tls1.2 true -cadir $http_tls_cadir -autoservername true]
  putlog " (TLS/SSL support enabled)"
}

### SQLite database initialization
# Opens the database as command "urldb"; the schema is created separately
# by create_urllog_db.tcl (see header). A failed open is fatal.
if {[catch {sqlite3 urldb $urllog_db_file} uerrmsg]} {
  putlog " Could not open SQLite3 database '$urllog_db_file': $uerrmsg"
  exit 2
}
#-------------------------------------------------------------------------
### Utility functions

# Log a message via eggdrop's putlog, prefixed with the script name.
# No-op unless urllog_log_enable is non-zero.
proc urllog_log {arg} {
  global urllog_log_enable urllog_name
  if {$urllog_log_enable != 0} {
    putlog "$urllog_name: $arg"
  }
}

# Return 1 if every character of uarg is a decimal digit, 0 otherwise.
# Note: returns 1 for the empty string (the foreach body never runs);
# the port check in urllog_validate_url relies on testing "" separately.
proc urllog_isnumber {uarg} {
  foreach i [split $uarg {}] {
    if {![string match \[0-9\] $i]} { return 0 }
  }
  return 1
}

# Send amsg to the channel (apublic == 1) or directly to the nick,
# using the configured message command (e.g. PRIVMSG or NOTICE).
proc urllog_msg {apublic anick achan amsg} {
  global urllog_preferredmsg
  if {$apublic == 1} {
    putserv "$urllog_preferredmsg $achan :$amsg"
  } else {
    putserv "$urllog_preferredmsg $anick :$amsg"
  }
}

# Like urllog_msg (always to channel), but only when verbose mode is on.
proc urllog_verb_msg {anick achan amsg} {
  global urllog_verbose
  if {$urllog_verbose != 0} {
    urllog_msg 1 $anick $achan $amsg
  }
}

# Normalize a character set name for Tcl's [encoding convertfrom]:
# strips a leading locale prefix ("fi_FI."), lowercases, and turns
# "iso-8859-x" into "iso8859-x" (Tcl's encoding naming).
proc urllog_sanitize_encoding {uencoding} {
  regsub -- "^\[a-z\]\[a-z\]_\[A-Z\]\[A-Z\]\." $uencoding "" uencoding
  set uencoding [string tolower $uencoding]
  regsub -- "^iso-" $uencoding "iso" uencoding
  return $uencoding
}

#-------------------------------------------------------------------------
# Alphabet for base-62 ShortURL encoding.
# NOTE(review): "KLNM" — N and M appear swapped vs. alphabetical order.
# Do NOT "fix" this: existing short URLs in the wild depend on this exact
# string. Confirm against upstream before ever touching it.
set urllog_shorturl_str "ABCDEFGHIJKLNMOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

# Encode a database row id (utime arg is actually the id, see callers)
# as a 3-character base-62 string appended to urllog_shorturl_prefix,
# wrapped in "[ ... ]". Only ids < 62^3 = 238328 encode uniquely.
proc urllog_get_short {utime} {
  global urllog_shorturl_prefix urllog_shorturl_str
  set ulen [string length $urllog_shorturl_str]
  set u1 [expr $utime / ($ulen * $ulen)]
  set utmp [expr $utime % ($ulen * $ulen)]
  set u2 [expr $utmp / $ulen]
  set u3 [expr $utmp % $ulen]
  return "\[ $urllog_shorturl_prefix[string index $urllog_shorturl_str $u1][string index $urllog_shorturl_str $u2][string index $urllog_shorturl_str $u3] \]"
}

#-------------------------------------------------------------------------
# Truncate a URL to urllog_shorturl_orig characters (plus "...") for
# display; shorter URLs are returned unchanged.
proc urllog_chop_url {url} {
  global urllog_shorturl_orig
  if {[string length $url] > $urllog_shorturl_orig} {
    return "[string range $url 0 $urllog_shorturl_orig]..."
  } else {
    return $url
  }
}

#-------------------------------------------------------------------------
# Check whether urlStr is already in the database.
# Returns 0 (and announces the earlier sighting on allowed channels) if
# found, 1 if the URL is new. Note the inverted-looking convention:
# callers treat 0 as "stop, already known".
proc urllog_exists {urlStr urlNick urlHost urlChan} {
  global urldb urlmsg_alreadyknown urllog_shorturl
  global urllog_msg_channels

  set usql "SELECT id AS uid, utime AS utime, url AS uurl, user AS uuser, host AS uhost, chan AS uchan, title AS utitle FROM urls WHERE url='[utl_escape $urlStr]'"
  # The eval body runs once per matching row; "return 0" exits the proc
  # from inside the first row's body.
  urldb eval $usql {
    urllog_log "URL said by $urlNick ($urlStr) already known"
    if {$urllog_shorturl != 0} {
      set qstr "[urllog_get_short $uid] "
    } else {
      set qstr ""
    }
    append qstr "($uuser/$uchan@[utl_ctime $utime])"
    if {[string length $utitle] > 0} {
      set qstr "$urlmsg_alreadyknown - '$utitle' $qstr"
    } else {
      set qstr "$urlmsg_alreadyknown $qstr"
    }
    if {[utl_match_delim_list $urllog_msg_channels $uchan]} {
      urllog_verb_msg $urlNick $urlChan $qstr
    }
    return 0
  }
  return 1
}

#-------------------------------------------------------------------------
# Insert a new URL row (time, url, nick, host, channel, optional title)
# and announce it. Returns 1 on success, 0 on SQL error.
# Values are escaped with utl_escape and inlined into the SQL string.
proc urllog_addurl {urlStr urlNick urlHost urlChan urlTitle} {
  global urldb urllog_shorturl

  # Empty title is stored as SQL NULL, not as an empty string.
  if {$urlTitle == ""} {
    set uins "NULL"
  } else {
    set uins "'[utl_escape $urlTitle]'"
  }
  set usql "INSERT INTO urls (utime,url,user,host,chan,title) VALUES ([unixtime], '[utl_escape $urlStr]', '[utl_escape $urlNick]', '[utl_escape $urlHost]', '[utl_escape $urlChan]', $uins)"
  if {[catch {urldb eval $usql} uerrmsg]} {
    urllog_log "$uerrmsg on SQL:\n$usql"
    return 0
  }
  # The new rowid doubles as the ShortURL id.
  set uid [urldb last_insert_rowid]
  urllog_log "Added URL ($urlNick@$urlChan): $urlStr"

  ### Let's say something, to confirm that everything went well.
  if {$urllog_shorturl != 0} {
    set qstr "[urllog_get_short $uid] "
  } else {
    set qstr ""
  }
  if {[string length $urlTitle] > 0} {
    urllog_verb_msg $urlNick $urlChan "'$urlTitle' ([urllog_chop_url $urlStr]) $qstr"
  } else {
    urllog_verb_msg $urlNick $urlChan "[urllog_chop_url $urlStr] $qstr"
  }
  return 1
}

#-------------------------------------------------------------------------
# Perform one HTTP(S) GET of urlStr with a 6 s timeout.
# urlStatus/urlSCode/urlCode/urlData/urlMeta are caller variable NAMES
# (upvar out-parameters) receiving ::http status, code string, numeric
# code, body and response-header array respectively.
# Returns 1 on success, 0 on connect error / timeout / transaction error
# (announcing the failure to the channel in each case).
# NOTE(review): ::http::cleanup is only reached on the success path; the
# token from a timed-out or failed transaction is never cleaned up and
# its state array leaks — confirm and fix upstream.
proc urllog_dorequest { urlNick urlChan urlStr urlStatus urlSCode urlCode urlData urlMeta } {
  global urlmsg_ioerror urlmsg_timeout urlmsg_errorgettingdoc
  upvar 1 $urlStatus ustatus
  upvar 1 $urlSCode uscode
  upvar 1 $urlCode ucode
  upvar 1 $urlData udata
  upvar 1 $urlMeta umeta

  # Force identity encoding so udata is the raw document (-binary 1).
  set urlHeaders {}
  lappend urlHeaders "Accept-Encoding" "identity"
  # lappend urlHeaders "Connection" "keep-alive"

  if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -binary 1 -headers $urlHeaders]} uerrmsg]} {
    urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
    urllog_log "HTTP request failed: $uerrmsg"
    return 0
  }
  set ustatus [::http::status $utoken]
  if {$ustatus == "timeout"} {
    urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
    urllog_log "HTTP request timed out ($urlStr)"
    return 0
  }
  if {$ustatus != "ok"} {
    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])"
    urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
    return 0
  }
  set ustatus [::http::status $utoken]
  set uscode [::http::code $utoken]
  set ucode [::http::ncode $utoken]
  set udata [::http::data $utoken]
  array set umeta [::http::meta $utoken]
  ::http::cleanup $utoken
  return 1
}

#-------------------------------------------------------------------------
# Validate and normalize a URL in-place.
# urlMStr/urlMProto/urlMHostName are caller variable NAMES (upvar):
# urlStr may be rewritten (protocol guessed, escapes applied), and the
# detected protocol and hostname are written back to the caller.
# Returns 1 if the URL is acceptable, 0 if rejected (logged, and for an
# unknown TLD also announced).
proc urllog_validate_url { urlNick urlChan urlMStr urlMProto urlMHostName } {
  global urllog_tld_list urlmsg_nosuchhost urllog_httprep urlmsg_unknown_tld
  global urllog_shorturl_prefix urllog_shorturl urllog_check_tld
  upvar 1 $urlMStr urlStr
  upvar 1 $urlMProto urlProto
  upvar 1 $urlMHostName urlHostName

  ### Try to guess the URL protocol component (if it is missing)
  set u_checktld 1
  if {![string match "http://*" $urlStr] && ![string match "https://*" $urlStr] && ![string match "ftp://*" $urlStr] && ![string match "*://*" $urlStr]} {
    if {[string match "*www.*" $urlStr]} {
      set urlStr "http://$urlStr"
    } elseif {[string match "*ftp.*" $urlStr]} {
      set urlStr "ftp://$urlStr"
    }
  }

  ### Handle URLs that have an IPv4-address
  if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr urlMatch urlProto ni1 ni2 ni3 ni4]} {
    # Check if the IP is on local network
    # (loopback 127.x, RFC1918 10.x and 192.168.x, and 0.x are rejected;
    # NOTE(review): 172.16.0.0/12 is not covered — confirm intent.)
    if {$ni1 == 127 || $ni1 == 10 || ($ni1 == 192 && $ni2 == 168) || $ni1 == 0} {
      urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
      return 0
    }
    # Skip TLD check for URLs with IP address
    set u_checktld 0
  }

  ### Check now if we have an ShortURL here ...
  # Don't re-log our own shortened URLs.
  if {[string match "$urllog_shorturl_prefix*" $urlStr]} {
    urllog_log "Ignoring ShortURL from $urlNick: $urlStr"
    # set uud ""
    # set usql "SELECT id AS uid, url AS uurl, user AS uuser, host AS uhost, chan AS uchan, title AS utitle FROM urls WHERE utime=$uud"
    # urldb eval $usql {
    #   urllog_verb_msg $urlNick $urlChan "'$utitle' - $uurl"
    #   return 1
    # }
    return 0
  }

  ### Get URL protocol component
  set urlProto ""
  if {[regexp "(\[a-z\]+)://" $urlStr urlMatch urlProto]} {
    ### Is it a http or ftp url?
    if {$urlProto != "http" && $urlProto != "https" && $urlProto != "ftp"} {
      urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED protocol class ($urlProto)."
      return 0
    }
  } else {
    urllog_log "Broken URL from $urlNick: ($urlStr), no protocol specifier."
    return 0
  }

  ### Check the PORT (if the ":" is there)
  # Hostname is element 2 of the "/"-split ("proto:", "", "host:port", ...).
  set urlRecord [split $urlStr "/"]
  set urlHostName [lindex $urlRecord 2]
  set urlPort [lindex [split $urlHostName ":"] end]
  # $urlPort == $urlHostName means no ":" present at all.
  if {![urllog_isnumber $urlPort] && $urlPort != "" && $urlPort != $urlHostName} {
    urllog_log "Broken URL from $urlNick: ($urlStr) illegal port $urlPort"
    return 0
  }

  ### Check the Top Level Domain (TLD) validity
  if {$urllog_check_tld != 0 && $u_checktld != 0} {
    # Last dot-component of the host, with any ":port" stripped.
    set u_sane [lindex [split $urlHostName "."] end]
    set u_tld [lindex [split $u_sane ":"] 0]
    set u_found 0
    if {[string length $u_tld] == 2} {
      # Assume all 2-letter domains to be valid :)
      set u_found 1
    } else {
      # Check our list of known TLDs
      foreach itld $urllog_tld_list {
        if {[string match $itld $u_tld]} {
          set u_found 1
        }
      }
    }
    if {$u_found == 0} {
      urllog_log "Broken URL from $urlNick: ($urlStr) unknown TLD: ${u_tld}."
      urllog_verb_msg $urlNick $urlChan $urlmsg_unknown_tld
      return 0
    }
  }

  # Escape @ { } [ ] to %XX form (map built at init in urllog_httprep).
  set urlStr [string map $urllog_httprep $urlStr]
  return 1
}

#-------------------------------------------------------------------------
# Main URL-processing pipeline for one candidate URL: duplicate check,
# validation, optional HTTP fetch (following up to two redirect levels),
# title/charset extraction, and insertion into the database.
# Always returns 1 (callers ignore the value).
proc urllog_check_url {urlStr urlNick urlHost urlChan} {
  global urllog_encoding http_tls_support urlmsg_errorgettingdoc
  global urllog_extra_checks urllog_extra_strict urllog_rasiatube_hack

  ### Does the URL already exist?
  if {![urllog_exists $urlStr $urlNick $urlHost $urlChan]} {
    return 1
  }

  ### Validate URL compoments, etc.
  set urlProto ""
  set urlHostName ""
  if {![urllog_validate_url $urlNick $urlChan urlStr urlProto urlHostName]} {
    return 1
  }

  ### Do we perform additional checks?
  # "Additional checks" = actually fetching the document. This branch is
  # the NO-fetch path: checks disabled, or the protocol is not fetchable
  # (ftp, or https without TLS support).
  if {$urllog_extra_checks == 0 || !(($http_tls_support != 0 && $urlProto == "https") || $urlProto == "http")} {
    # No optional checks, or it's not http/https.
    if {$urllog_extra_strict == 0} {
      # Strict checking disabled, so add the URL, if it does not exist already.
      urllog_addurl $urlStr $urlNick $urlHost $urlChan ""
      return 1
    } elseif {$http_tls_support == 0 && $urlProto == "https"} {
      # Strict ENABLED: If TLS support is disabled and we have https, do nothing
      return 1
    } elseif {$urlProto != "http" && $urlProto != "https"} {
      # Strict ENABLED: It's not http, or https
      return 1
    }
  }

  ### Does the document pointed by the URL exist?
  if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} {
    return 1
  }

  ### Handle redirects
  # NOTE(review): only 301/302 are followed (303/307/308 fall through to
  # the error branch), and $umeta(Location) assumes the server sent the
  # header with exactly this capitalization — verify against ::http::meta
  # behavior for this Tcl version.
  if {$ucode >= 301 && $ucode <= 302} {
    set nurlStr $umeta(Location)
    # Relative Location: rebuild an absolute URL on the original host.
    if {![regexp "\[a-z\]+://" $nurlStr]} {
      if {[string range $nurlStr 0 0] != "/"} {
        append nurlStr "/"
      }
      set nurlStr "${urlProto}://${urlHostName}${nurlStr}"
    }
    urllog_log "Redirection: $urlStr -> $nurlStr"
    set urlStr $nurlStr
    if {![urllog_validate_url $urlNick $urlChan urlStr urlProto urlHostName]} {
      return 1
    }
    if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} {
      return 1
    }
  }

  ### Handle 2nd level redirects
  if {$ucode >= 301 && $ucode <= 302} {
    set nurlStr $umeta(Location)
    if {![regexp "\[a-z\]+://" $nurlStr]} {
      if {[string range $nurlStr 0 0] != "/"} {
        append nurlStr "/"
      }
      set nurlStr "${urlProto}://${urlHostName}${nurlStr}"
    }
    urllog_log "Redirection #2: $urlStr -> $nurlStr"
    set urlStr $nurlStr
    if {![urllog_validate_url $urlNick $urlChan urlStr urlProto urlHostName]} {
      return 1
    }
    if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} {
      return 1
    }
  }

  # Final document
  if {$ucode >= 200 && $ucode <= 205} {
    set uenc_doc ""
    set uenc_http ""
    set uencoding ""

    # Get information about specified character encodings
    if {[info exists umeta(Content-Type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(Content-Type) umatches uenc_http]} {
      # Found character set encoding information in HTTP headers
    }
    if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/\?>" $udata umatches uenc_doc]} {
      # Found old style HTML meta tag with character set information
    } elseif {[regexp -nocase -- "<meta.\*\?charset=\"(\[^\"\]*)\".\*\?/\?>" $udata umatches uenc_doc]} {
      # Found HTML5 style meta tag with character set information
    }

    # Make sanitized versions of the encoding strings
    set uenc_http2 [urllog_sanitize_encoding $uenc_http]
    set uenc_doc2 [urllog_sanitize_encoding $uenc_doc]

    # Check if the document has specified encoding
    # KLUDGE! HTTP header charset wins over in-document charset;
    # with neither present, fall back to iso8859-1.
    set uencoding $uenc_http2
    if {$uencoding == "" && $uenc_doc2 != ""} {
      set uencoding $uenc_doc2
    } elseif {$uencoding == ""} {
      # If _NO_ known encoding of any kind, assume the default of iso8859-1
      set uencoding "iso8859-1"
    }
    urllog_log "Charsets: http='$uenc_http', doc='$uenc_doc' / sanitized http='$uenc_http2', doc='$uenc_doc2' -> '$uencoding'"

    # Get the document title, if any
    # (twitter:title meta tag preferred over the <title> element)
    set urlTitle ""
    if {[regexp -nocase -- "<meta name=\"twitter:title\" content=\"(.\*\?)\"\\s\*\/\?>" $udata umatches urlTitle]} {
      # ...
    } elseif {[regexp -nocase -- "<title.\*\?>(.\*\?)</title>" $udata umatches urlTitle]} {
      # ...
    }

    # If facebook, get meta info
    if {[regexp -nocase -- "(http|https):\/\/www.facebook.com" $urlStr]} {
      if {[regexp -nocase -- "<meta name=\"description\" content=\"(.\*\?)\"" $udata umatches urlTmp]} {
        if {$urlTitle != ""} {
          append urlTitle " :: "
        }
        append urlTitle $urlTmp
      }
    }

    # If character set conversion is required, do it now
    if {$urlTitle != "" && $uencoding != ""} {
      if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} {
        urllog_log "Error in charset conversion: $cerrmsg"
      }

      # Convert some HTML entities to plaintext and do some cleanup
      # (collapse whitespace runs, trim surrounding whitespace)
      set utmp [utl_convert_html_ent $urlTitle]
      regsub -all "\r|\n|\t" $utmp " " utmp
      regsub -all " *" $utmp " " utmp
      set urlTitle [string trim $utmp]
    }

    # Check if the URL already exists, just in case we had some redirects
    if {[urllog_exists $urlStr $urlNick $urlHost $urlChan]} {
      urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle
    }
    return 1
  } else {
    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ($ucode)"
    urllog_log "Error fetching document: status=$ustatus, code=$ucode, scode=$uscode, url=$urlStr"
  }
}

#-------------------------------------------------------------------------
# pubm/topc bind handler: scan each whitespace-separated word of a
# channel line (or topic) for something URL-shaped and feed it to
# urllog_check_url. Only channels in urllog_log_channels are scanned.
proc urllog_check_line {unick uhost uhand uchan utext} {
  global urllog_log_channels

  ### Check the nick
  if {$unick == "*"} {
    urllog_log "urllog_check_line: Nick was wc, this should not happen."
    return 0
  }

  ### Check the channel
  if {[utl_match_delim_list $urllog_log_channels $uchan]} {
    ### Do the URL checking
    # Matches either "proto://..." anywhere or a leading "www."/"ftp.".
    foreach str [split $utext " "] {
      if {[regexp "(\[a-z]+://\[^\[:space:\]\]+|^(www|ftp)\.\[^\[:space:\]\]+)" $str ulink]} {
        urllog_check_url $str $unick $uhost $uchan
      }
    }
  }
  return 0
}

#-------------------------------------------------------------------------
### Parse arguments, find and show the results
# Search the URL database. Each token of utext is a pattern with an
# optional prefix: "-" negates (NOT LIKE on url/title), "%" matches the
# user column, "+" or no prefix matches url/title, "@" is ignored.
# Patterns are ANDed. Results go to the channel (upublic==1) or the nick.
# NOTE(review): result limits are hard-coded (5 private / 3 public) even
# though urllog_showmax_pub/urllog_showmax_priv are declared global here
# — looks like the config values were meant to be used; confirm upstream.
proc urllog_find {unick uhand uchan utext upublic} {
  global urllog_shorturl urldb
  global urllog_showmax_pub urllog_showmax_priv urlmsg_nomatch

  if {$upublic == 0} {
    set ulimit 5
  } else {
    set ulimit 3
  }

  ### Parse the given command
  urllog_log "$unick/$uhand searched URL: $utext"

  set ftokens [split $utext " "]
  set fpatlist ""
  foreach ftoken $ftokens {
    set fprefix [string range $ftoken 0 0]
    set fpattern [string range $ftoken 1 end]
    set qpattern "'%[utl_escape $fpattern]%'"
    if {$fprefix == "-"} {
      lappend fpatlist "(url NOT LIKE $qpattern OR title NOT LIKE $qpattern)"
    } elseif {$fprefix == "%"} {
      lappend fpatlist "user LIKE $qpattern"
    } elseif {$fprefix == "@"} {
      # foo
    } elseif {$fprefix == "+"} {
      lappend fpatlist "(url LIKE $qpattern OR title LIKE $qpattern)"
    } else {
      # No prefix: the whole token (including its first char) is the pattern.
      set qpattern "'%[utl_escape $ftoken]%'"
      lappend fpatlist "(url LIKE $qpattern OR title LIKE $qpattern)"
    }
  }

  if {[llength $fpatlist] > 0} {
    set fquery "WHERE [join $fpatlist " AND "]"
  } else {
    set fquery ""
  }

  set iresults 0
  set usql "SELECT id AS uid, utime AS utime, url AS uurl, user AS uuser, host AS uhost FROM urls $fquery ORDER BY utime DESC LIMIT $ulimit"
  urldb eval $usql {
    incr iresults
    set shortURL $uurl
    if {$urllog_shorturl != 0 && $uid != ""} {
      set shortURL "$shortURL [urllog_get_short $uid]"
    }
    urllog_msg $upublic $unick $uchan "#$iresults: $shortURL ($uuser@[utl_ctime $utime])"
  }
  if {$iresults == 0} {
    # If no URLs were found
    urllog_msg $upublic $unick $uchan $urlmsg_nomatch
  }
  return 0
}

#-------------------------------------------------------------------------
### Finding binded functions

# pub bind (!urlfind on a channel): search publicly, but only on channels
# listed in urllog_search_channels.
proc urllog_pub_urlfind {unick uhost uhand uchan utext} {
  global urllog_search_channels
  if {[utl_match_delim_list $urllog_search_channels $uchan]} {
    return [urllog_find $unick $uhand $uchan $utext 1]
  }
  return 0
}

# msg bind (!urlfind via private message): search privately, no channel.
proc urllog_msg_urlfind {unick uhost uhand utext} {
  urllog_find $unick $uhand "" $utext 0
  return 0
}

# end of script