Mercurial > hg > egg-tcls
view urllog.tcl @ 675:09be9264ee79
quotedb: Bump version.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Tue, 23 Feb 2021 15:10:52 +0200 |
parents | afe4b1fe5e79 |
children | 46fc0bc95195 |
line wrap: on
line source
########################################################################## # # URLLog v2.8.0 by Matti 'ccr' Hamalainen <ccr@tnsp.org> # (C) Copyright 2000-2021 Tecnic Software productions (TNSP) # # This script is freely distributable under GNU GPL (version 2) license. # ########################################################################## # # URL-logger script for EggDrop IRC robot, utilizing TCL SQLite3 database # interface. Requirements for this script are as follows: # # - utillib.tcl (available from same repository as this script) # - tcl-tls 1.7.13+ for SSL/TLS support # - TCL 8.6 or later # - Eggdrop 1.6.20 or later # - SQLite3 and TCL bindings for it. # # On Debian: tcl8.6 tcl-tls libsqlite3-tcl eggdrop eggdrop-data # # If you are doing a fresh install, you will need to create the initial # database with the required table schemas. You can do that by running # "create_urllog_db.tcl". You also need to set up the configuration in # "config.urllog" file. # ########################################################################## ### The configuration should be in config.urllog in same directory ### as this script. Or change the line below to point where ever ### you wish. See "config.urllog.example" for an example config file. source [file dirname [info script]]/config.urllog ### Required utillib.tcl source [file dirname [info script]]/utillib.tcl ########################################################################## # No need to look below this line ########################################################################## package require sqlite3 package require http package require textutil::split set urllog_name "URLLog" set urllog_version "2.8.0" set urllog_message "$urllog_name v$urllog_version (C) 2000-2021 ccr/TNSP" set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"] set urllog_shorturl_codes "ABCDEFGHIJKLNMOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789" #------------------------------------------------------------------------- ### Utility functions proc urllog_log {umsg} { global urllog_log_enable urllog_name if {$urllog_log_enable != 0} { putlog "${urllog_name}: $umsg" } } proc urllog_qm {uid} { global urllog_messages if {[info exists urllog_messages($uid)]} { return $urllog_messages($uid) } else { return $uid } } proc urllog_smsg {apublic anick achan amsg {aargs {}}} { global urllog_preferredmsg urllog_cmd_name set amsg [string map [list "@cmd@" $urllog_cmd_name] $amsg] utl_msg_args $urllog_preferredmsg $apublic $anick $achan $amsg $aargs } proc urllog_msg {apublic anick achan aid {aargs {}}} { urllog_smsg $apublic $anick $achan [urllog_qm $aid] $aargs } proc urllog_verb_msg {apublic anick achan aid {aargs {}}} { global urllog_verbose if {$urllog_verbose != 0} { urllog_msg $apublic $anick $achan $aid $aargs } } proc urllog_isnumber {uarg} { foreach i [split $uarg {}] { if {![string match \[0-9\] $i]} { return 0 } } return 1 } proc urllog_is_enabled {uval} { if {$uval} { return "ON." } else { return "OFF." } } proc urllog_sanitize_encoding {uencoding} { regsub -- "^\[a-z\]\[a-z\]_\[A-Z\]\[A-Z\]\." $uencoding "" uencoding set uencoding [string tolower $uencoding] regsub -- "^iso-" $uencoding "iso" uencoding return $uencoding } proc urllog_get_ss {uindex} { global urllog_shorturl_codes return [string index $urllog_shorturl_codes $uindex] } proc urllog_get_short {utime} { global urllog_shorturl_prefix urllog_shorturl_codes set ulen [string length $urllog_shorturl_codes] set u1 [expr $utime / ($ulen * $ulen)] set utmp [expr $utime % ($ulen * $ulen)] set u2 [expr $utmp / $ulen] set u3 [expr $utmp % $ulen] return "\[ $urllog_shorturl_prefix[urllog_get_ss $u1][urllog_get_ss $u2][urllog_get_ss $u3] \]" } proc urllog_chop_str {ustr umax} { if {[string length $ustr] > $umax} { return "[string range $ustr 0 $umax]..." } else { return $ustr } } #------------------------------------------------------------------------- proc urllog_add_url {urlStr urlNick urlHost urlChan urlTitle} { global urllog_db urllog_shorturl_enable urllog_pub_channels global urllog_add_title_max urllog_add_url_max ### Does the URL already exist? set usql "SELECT id AS uid, utime AS utime, url AS uurl, user AS uuser, host AS uhost, chan AS uchan, title AS utitle FROM urls WHERE url='[utl_escape $urlStr]'" urllog_db eval $usql { urllog_log "URL said by $urlNick ($urlStr) already known" if {[utl_match_delim_list $urllog_pub_channels $uchan]} { if {[string length $utitle] > 0} { set stitle [utl_str_map_values [urllog_qm "url_known_has_title"] [list $utitle [urllog_chop_str $utitle $urllog_add_title_max]]] } else { set stitle [urllog_qm "url_known_no_title"] } if {$urllog_shorturl_enable != 0} { urllog_verb_msg 1 $urlNick $urlChan "url_known_short" [list $uuser $uchan $uhost [utl_ctime $utime] $stitle $uurl [urllog_get_short $uid]] } else { urllog_verb_msg 1 $urlNick $urlChan "url_known_long" [list $uuser $uchan $uhost [utl_ctime $utime] $stitle $uurl] } } return 1 } ### Validate title if {$urlTitle == ""} { set uins "NULL" } else { set uins "'[utl_escape $urlTitle]'" } ### Attempt to insert into database set usql "INSERT INTO urls (utime,url,user,host,chan,title) VALUES ([unixtime], '[utl_escape $urlStr]', '[utl_escape $urlNick]', '[utl_escape $urlHost]', '[utl_escape $urlChan]', $uins)" if {[catch {urllog_db eval $usql} uerrmsg]} { urllog_log "$uerrmsg on SQL:\n$usql" return 0 } set uid [urllog_db last_insert_rowid] urllog_log "Added URL ($urlNick@$urlChan): $urlStr" ### Let's say something, to confirm that everything went well. if {$urllog_shorturl_enable != 0} { set urlShort [urllog_get_short $uid] set ushort "short" } else { set urlShort "" set ushort "long" } if {[string length $urlTitle] > 0} { set umode "url_added_${ushort}_has_title" } else { set umode "url_added_${ushort}_no_title" } urllog_verb_msg 1 $urlNick $urlChan $umode [list $urlTitle [urllog_chop_str $urlTitle $urllog_add_title_max] $urlStr [urllog_chop_str $urlStr $urllog_add_url_max] $urlShort] return 1 } #------------------------------------------------------------------------- proc urllog_do_request { urlNick urlChan urlStr urlStatus urlSCode urlCode urlData urlMeta } { upvar $urlStatus ustatus upvar $urlSCode uscode upvar $urlCode ucode upvar $urlData udata upvar $urlMeta umeta set urlHeaders {} lappend urlHeaders "Accept-Encoding" "identity" #lappend urlHeaders "Connection" "keep-alive" set uresult [utl_http_do_request $urlHeaders $urlStr ustatus uscode ucode udata umeta] if {$uresult == -1} { urllog_verb_msg 1 $urlNick $urlChan "err_http_get" [list $urlStr $uscode] urllog_log "HTTP request failed: $uscode" return 0 } elseif {$uresult < 0} { urllog_verb_msg 1 $urlNick $urlChan "err_http_status" [list $urlStr $ustatus $uscode $ucode] urllog_log "Error in HTTP request: $ustatus / $uscode ($urlStr)" return 0 } return 1 } #------------------------------------------------------------------------- proc urllog_validate_url { urlNick urlChan urlMStr urlMProto urlMHostName } { global urllog_httprep urllog_shorturl_prefix urllog_shorturl_enable upvar $urlMStr urlStr upvar $urlMProto urlProto upvar $urlMHostName urlHostName ### Hack for removing parenthesis around an URL if {[regexp {^\((.+)\)$} $urlStr -> urlClean]} { set urlStr $urlClean } if {[regexp {^\[(.+)\]$} $urlStr -> urlClean]} { set urlStr $urlClean } ### Clean excess stuff, if any, and attempt to ### guess the URL protocol component if it is missing if {[regexp "(\[a-z\]+)://\[^ \]+" $urlStr urlMatch urlProto]} { set urlStr $urlMatch } elseif {[regexp "www\.\[^ \]+" $urlStr urlMatch]} { set urlStr "http://$urlMatch" } elseif {[regexp "ftp\.\[^ \]+" $urlStr urlMatch]} { set urlStr "ftp://$urlMatch" } ### Handle URLs that have an IPv4-address if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr -> urlProto ni1 ni2 ni3 ni4]} { # Check if the IP is on local network if {$ni1 == 127 || $ni1 == 10 || ($ni1 == 192 && $ni2 == 168)} { urllog_verb_msg 1 $urlNick $urlChan "err_url_local_net" [list $urlStr] urllog_log "URL pointing to local network, ignored (${urlStr})." return 0 } if {$ni1 == 0 || $ni1 >= 255 || $ni2 >= 255 || $ni3 >= 255 || $ni4 >= 255} { urllog_verb_msg 1 $urlNick $urlChan "err_url_invalid_net" [list $urlStr] urllog_log "URL pointing to invalid network, ignored (${urlStr})." return 0 } } ### Check now if we have an ShortURL here ... if {[string match "${urllog_shorturl_prefix}*" $urlStr]} { urllog_log "Ignoring ShortURL: ${urlStr}" # set uud "" # set usql "SELECT id AS uid, url AS uurl, user AS uuser, host AS uhost, chan AS uchan, title AS utitle FROM urls WHERE utime=$uud" # urllog_db eval $usql { # urllog_msg 1 $urlNick $urlChan "'$utitle' - $uurl" # return 1 # } return 0 } ### Get URL protocol component set urlProto "" if {[regexp "(\[a-z\]+)://" $urlStr -> urlProto]} { ### Is it a http or ftp url? if {$urlProto != "http" && $urlProto != "https" && $urlProto != "ftp"} { urllog_verb_msg 1 $urlNick $urlChan "err_url_proto_class" [list $urlStr $urlProto] urllog_log "Broken URL: ${urlStr} - unsupported protocol class (${urlProto})." return 0 } } else { urllog_verb_msg 1 $urlNick $urlChan "err_url_proto_no_class" [list $urlStr] urllog_log "Broken URL: ${urlStr} - no protocol specifier." return 0 } ### Check the PORT (if the ":" is there) set urlRecord [split $urlStr "/"] set urlHostName [lindex $urlRecord 2] set urlPort [lindex [split $urlHostName ":"] end] if {$urlPort != "" && ![urllog_isnumber $urlPort] && $urlPort != $urlHostName} { urllog_verb_msg 1 $urlNick $urlChan "err_url_invalid_port" [list $urlStr $urlPort] urllog_log "Broken URL: ${urlStr} - illegal or invalid port '${urlPort}'" return 0 } set urlStr [string map $urllog_httprep $urlStr] return 1 } #------------------------------------------------------------------------- proc urllog_handle_redirect {urlNick urlHost urlChan urlRedirLevel urlProto urlHostName urlStr urlStatus urlSCode urlCode urlData urlMeta} { upvar $urlProto uproto upvar $urlHostName uhostname upvar $urlStr ustr upvar $urlStatus ustatus upvar $urlSCode uscode upvar $urlCode ucode upvar $urlData udata upvar $urlMeta umeta ### Was result a redirect? if {$ucode >= 301 && $ucode <= 303} { ### Check that we have a location header if {![info exists umeta(location)]} { urllog_verb_msg 1 $urlNick $urlChan "err_redirect_invalid" [list $ustr $ustatus $uscode $ucode $urlRedirLevel] urllog_log "Invalid redirect without location header: status=$ustatus, code=$ucode, scode=$uscode, url=$ustr, redirLevel=$urlRedirLevel" return 0 } ### Fix up location URI set nustr $umeta(location) if {![regexp "\[a-z\]+://" $nustr]} { if {[string range $nustr 0 0] != "/"} { append nustr "/" } set nustr "${uproto}://${uhostname}${nustr}" } ### Validate the target URI urllog_log "Redirection #${urlRedirLevel}: $ustr -> $nustr" set ustr $nustr if {![urllog_validate_url $urlNick $urlChan ustr uproto uhostname]} { return 0 } ### Attempt to fetch redirection target utl_http_clear_request ustatus uscode ucode udata umeta if {![urllog_do_request $urlNick $urlChan $ustr ustatus uscode ucode udata umeta]} { urllog_verb_msg 1 $urlNick $urlChan "err_redirect_fail" [list $ustr $ustatus $uscode $ucode $urlRedirLevel] urllog_log "Error fetching redirect: status=$ustatus, code=$ucode, scode=$uscode, url=$ustr, redirLevel=$urlRedirLevel" return 0 } } return 1 } #------------------------------------------------------------------------- proc urllog_check_url {urlStr urlNick urlHost urlChan} { global urllog_encoding http_tls_support global urllog_extra_checks urllog_extra_strict ### Validate URL compoments, etc. set urlProto "" set urlHostName "" if {![urllog_validate_url $urlNick $urlChan urlStr urlProto urlHostName]} { return 1 } ### Do we perform additional checks? if {$urllog_extra_checks == 0 || !(($http_tls_support != 0 && $urlProto == "https") || $urlProto == "http")} { # No optional checks, or it's not http/https. if {$urllog_extra_strict == 0} { # Strict checking disabled, so add the URL, if it does not exist already. urllog_add_url $urlStr $urlNick $urlHost $urlChan "" return 1 } elseif {$http_tls_support == 0 && $urlProto == "https"} { # Strict ENABLED: If TLS support is disabled and we have https, do nothing return 1 } elseif {$urlProto != "http" && $urlProto != "https"} { # Strict ENABLED: It's not http, or https return 1 } } else { ### Does the document pointed by the URL exist? if {![urllog_do_request $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} { return 1 } ### Handle redirects of 3 levels if {![urllog_handle_redirect $urlNick $urlHost $urlChan 1 urlProto urlHostName urlStr ustatus uscode ucode udata umeta]} { return 1 } if {![urllog_handle_redirect $urlNick $urlHost $urlChan 2 urlProto urlHostName urlStr ustatus uscode ucode udata umeta]} { return 1 } if {![urllog_handle_redirect $urlNick $urlHost $urlChan 3 urlProto urlHostName urlStr ustatus uscode ucode udata umeta]} { return 1 } # Final document if {$ucode >= 200 && $ucode <= 205} { set uenc_doc "" set uenc_http "" set uencoding "" # Get information about specified character encodings if {[info exists umeta(content-type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(content-type) -> uenc_http]} { # Found character set encoding information in HTTP headers } if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} { # Found old style HTML meta tag with character set information } elseif {[regexp -nocase -- "<meta.\*\?charset=\"(\[^\"\]*)\".\*\?/\?>" $udata -> uenc_doc]} { # Found HTML5 style meta tag with character set information } # Make sanitized versions of the encoding strings set uenc_http2 [urllog_sanitize_encoding $uenc_http] set uenc_doc2 [urllog_sanitize_encoding $uenc_doc] # Check if the document has specified encoding # KLUDGE! set uencoding $uenc_http2 if {$uencoding == "" && $uenc_doc2 != ""} { set uencoding $uenc_doc2 } elseif {$uencoding == ""} { # If _NO_ known encoding of any kind, assume the default of iso8859-1 set uencoding "iso8859-1" } urllog_log "Charsets: http='$uenc_http', doc='$uenc_doc' / sanitized http='$uenc_http2', doc='$uenc_doc2' -> '$uencoding'" # Get the document title, if any set urlTitle "" if {[regexp -nocase -- "<meta name=\"twitter:title\" content=\"(.\*\?)\"\\s\*\/\?>" $udata -> urlTitle]} { # ... } elseif {[regexp -nocase -- "<title.\*\?>(.\*\?)</title>" $udata -> urlTitle]} { # ... } # If facebook, get meta info if {[regexp -nocase -- "(http|https):\/\/www.facebook.com" $urlStr]} { if {[regexp -nocase -- "<meta name=\"description\" content=\"(.\*\?)\"" $udata -> urlTmp]} { if {$urlTitle != ""} { append urlTitle " :: " } append urlTitle $urlTmp } } # If character set conversion is required, do it now if {$urlTitle != "" && $uencoding != ""} { if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} { urllog_verb_msg 1 $urlNick $urlChan "err_charset" [list $urlStr $urlTitle $cerrmsg] urllog_log "Error in charset conversion: $urlStr / '$urlTitle': $cerrmsg" } # Convert some HTML entities to plaintext and do some cleanup set utmp [utl_convert_html_ent $urlTitle] regsub -all "\r|\n|\t" $utmp " " utmp regsub -all " *" $utmp " " utmp set urlTitle [string trim $utmp] } # Check if the URL already exists, just in case we had some redirects urllog_add_url $urlStr $urlNick $urlHost $urlChan $urlTitle } else { urllog_verb_msg 1 $urlNick $urlChan "err_http_fail" [list $urlStr $ustatus $uscode $ucode] urllog_log "Error fetching document: status=$ustatus, code=$ucode, scode=$uscode, url=$urlStr" } } } #------------------------------------------------------------------------- proc urllog_check_line {unick uhost uhand uchan utext} { global urllog_log_channels ### Check the nick if {$unick == "*"} { return 0 } ### Check the channel if {[utl_match_delim_list $urllog_log_channels $uchan]} { ### Do the URL checking foreach str [::textutil::split::splitx $utext {\s+}] { if {[regexp "(\[a-z]+://\[^\[:space:\]\]+|^(www|ftp)\.\[^\[:space:\]\]+)" $str ulink]} { urllog_check_url $str $unick $uhost $uchan } } } return 0 } #------------------------------------------------------------------------- ### Parse arguments, find and show the results proc urllog_cmd_search {unick uhand uchan utext upublic} { global urllog_db urllog_shorturl_enable urllog_showmax_pub urllog_showmax_priv global urllog_search_title_max urllog_search_url_max if {$upublic == 0} { set ulimit 5 } else { set ulimit 3 } ### Parse the given command urllog_log "${unick}/${uhand} searched URL: ${utext}" set ftokens [::textutil::split::splitx $utext {\s+}] set fpatlist "" foreach ftoken $ftokens { set fprefix [string range $ftoken 0 0] set fpattern [string range $ftoken 1 end] set qpattern "'%[utl_escape $fpattern]%'" if {$fprefix == "-"} { lappend fpatlist "(url NOT LIKE $qpattern OR title NOT LIKE $qpattern)" } elseif {$fprefix == "%"} { lappend fpatlist "user LIKE $qpattern" } elseif {$fprefix == "@"} { # foo } elseif {$fprefix == "+"} { lappend fpatlist "(url LIKE $qpattern OR title LIKE $qpattern)" } else { set qpattern "'%[utl_escape $ftoken]%'" lappend fpatlist "(url LIKE $qpattern OR title LIKE $qpattern)" } } if {[llength $fpatlist] > 0} { set fquery "WHERE [join $fpatlist " AND "]" } else { set fquery "" } ### Perform SQL query and show results if any set nresult 0 set usql "SELECT id AS uid, utime AS utime, url AS uurl, user AS uuser, host AS uhost, title AS utitle FROM urls $fquery ORDER BY utime DESC LIMIT $ulimit" urllog_db eval $usql { incr nresult if {[string length $utitle] > 0} { set stitle [utl_str_map_values [urllog_qm "search_result_has_title"] [list $utitle [urllog_chop_str $utitle $urllog_search_title_max]]] } else { set stitle [urllog_qm "search_result_no_title"] } if {$urllog_shorturl_enable != 0 && $uid != ""} { urllog_msg $upublic $unick $uchan "search_result_short" [list $nresult $uuser [utl_ctime $utime] $stitle [urllog_chop_str $uurl $urllog_search_url_max] [urllog_get_short $uid]] } else { urllog_msg $upublic $unick $uchan "search_result_long" [list $nresult $uuser [utl_ctime $utime] $stitle $uurl] } } if {$nresult == 0} { # If no URLs were found urllog_msg $upublic $unick $uchan "search_no_match" [list $utext] } return 1 } #------------------------------------------------------------------------- ### Finding binded functions proc urllog_cmd_pub_search {unick uhost uhand uchan utext} { global urllog_search_channels if {[utl_match_delim_list $urllog_search_channels $uchan]} { return [urllog_cmd_search $unick $uhand $uchan $utext 1] } return 0 } proc urllog_cmd_msg_search {unick uhost uhand utext} { return [urllog_cmd_search $unick $uhand "" $utext 0] } #------------------------------------------------------------------------- # Script initialization #------------------------------------------------------------------------- ### Initialization messages putlog "$urllog_message" ### Miscellaneous init messages putlog " - Log messages [urllog_is_enabled $urllog_log_enable]" putlog " - Verbose mode [urllog_is_enabled $urllog_verbose]" putlog " - Additional URL validity checks [urllog_is_enabled $urllog_extra_checks]" putlog " - Strict checks [urllog_is_enabled $urllog_extra_strict]" ### HTTP module initialization if {[info exists http_user_agent] && $http_user_agent != ""} { ::http::config -useragent $http_user_agent } else { ::http::config -useragent "${urllog_name}/${urllog_version}" } if {[info exists http_use_proxy] && $http_use_proxy != 0} { ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port putlog " - Using proxy ${http_proxy_host}:${http_proxy_port}" } if {[info exists http_tls_support] && $http_tls_support != 0} { package require tls ::http::register https 443 [list ::tls::socket -request true -require true -ssl2 false -ssl3 false -tls1 true -tls1.1 true -tls1.2 true -cadir $http_tls_cadir -autoservername true] putlog " - TLS/SSL support enabled." } ### SQLite database initialization if {[catch {sqlite3 urllog_db $urllog_db_file} uerrmsg]} { putlog "Could not open SQLite3 database '${urllog_db_file}': ${uerrmsg}" exit 2 } # end of script