view urllog.tcl @ 68:3762c621d1c3

urllog: Cosmetics.
author Matti Hamalainen <ccr@tnsp.org>
date Sun, 11 Sep 2011 16:48:48 +0300
parents 31c8c4f50aa6
children 15fc72bc3f3e
line wrap: on
line source

##########################################################################
#
# URLLog v2.0.1 by ccr/TNSP <ccr@tnsp.org>
# (C) Copyright 2000-2011 Tecnic Software productions (TNSP)
#
##########################################################################
#
# URL-logger script for EggDrop IRC robot, utilizing SQLite3 database
#
# NOTICE! If you are upgrading to v2.0+ from any 1.x version, you
# may want to run a conversion script against your URL-database file,
# if you wish to preserve the old data.
#
# See convert_urllog_db.tcl for more information.
#
# If you are making a fresh install, you will need to create the
# initial SQLite3 database with the required table schemas. You
# can do that by running: create_urllog_db.tcl
#
##########################################################################

###
### HTTP options
###
# Set to 1 if you want to use proxy
set http_proxy 0

# Proxy host and port number (only used if enabled above)
set http_proxy_host ""
set http_proxy_port 8080

# 1 = Enable experimental TLS/SSL support. This may not work.
set http_tls_support 0


###
### General options
###

# Filename of the SQLite URL database file
set urllog_db_file "urllog.sqlite"


# 1 = Verbose: Say messages when URL is OK, bad, etc.
# 0 = Quiet  : Be quiet (only speak if asked with !urlfind, etc)
set urllog_verbose 1


# 1 = Enable logging of various script actions into bot's log
# 0 = Don't.
set urllog_logmsg 1


# 1 = Check URLs for validity and existence before adding.
# 0 = No checks. Add _anything_ that looks like an URL to the database.
set urllog_check 1


###
### Search related settings
###

# 0 = No search-commands available
# 1 = Search enabled
set urllog_search 1


# How many URL's should the !urlfind command show (maximum limit)
set urllog_showmax_pub 3


# For private-search, this is the default limit (user can change it)
set urllog_showmax_priv 6


###
### ShortURL-settings
###

# 1 = Use ShortURLs
# 0 = Don't.
set urllog_shorturl 1

# Max length of original URL to be shown
set urllog_shorturl_orig 30

# Path to PHP/CGI-script that redirects ShortURLs
set urllog_shorturl_prefix "http://tnsp.org/u/"


###
### Message-texts
###

# No such host was found
set urlmsg_nosuchhost "ei tommosta oo!"

# Could not connect host (I/O errors etc)
set urlmsg_ioerror "kraak, virhe yhdynnässä."

# HTTP timeout
set urlmsg_timeout "ei jaksa ootella"

# No such document was found
set urlmsg_errorgettingdoc "siitosvirhe"

# URL was already known (was in database)
set urlmsg_alreadyknown "wanha!"
#set urlmsg_alreadyknown "Empiiristen havaintojen perusteella ja tällä sovellutusalueella esiintyneisiin aikaisempiin kontekstuaalisiin ilmaisuihin viitaten uskallan todeta, että sovellukseen ilmoittamasi tietoverkko-osoite oli kronologisti ajatellen varsin postpresentuaalisesti sopimaton ja ennestään hyvin tunnettu."

# No match was found when searched with !urlfind or other command
set urlmsg_nomatch "Ei osumia."


###
### Things that you usually don't need to touch ...
###

# What IRC "command" should we use to send messages:
# (Valid alternatives are "PRIVMSG" and "NOTICE")
set urllog_preferredmsg "PRIVMSG"

# The valid known Top Level Domains (TLDs), but not the country code TLDs
# (Now includes the new IANA published TLDs)
set urllog_tlds "org,com,net,mil,gov,biz,edu,coop,aero,info,museum,name,pro,int"


##########################################################################
# No need to look below this line
##########################################################################
#-------------------------------------------------------------------------
set urllog_name "URLLog"
set urllog_version "2.0.1"

set urllog_tlds [split $urllog_tlds ","]
set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"] 

set urllog_html_ent [split "&rlm;||&#8212;|-|&#x202a;||&#x202c;||&lrm;||&aring;|å|&Aring;|Å|&eacute;|é|&#58;|:|&#xe4;|ä|&#xf6;|ö|&#228;|ä|&#246;|ö|&nbsp;| |&#45;|-|&#8221;|\"|&#8220;|\"|&raquo;|>>|&quot;|\"|&auml;|ä|&ouml;|ö|&Auml;|Ä|&Ouml;|Ö|&amp;|&|&lt;|<|&gt;|>|ä|ä|ö|ö|Ä|Ä" "|"]

### Require packages
package require sqlite3
package require http

### Binding initializations
if {$urllog_search != 0} {
  bind pub - !urlfind urllog_pub_urlfind
  bind msg - urlfind urllog_msg_urlfind
}

bind pubm - *.* urllog_checkmsg
bind topc - *.* urllog_checkmsg


### Initialization messages
set urllog_message "$urllog_name v$urllog_version (C) 2000-2011 ccr/TNSP"
putlog "$urllog_message"

### HTTP module initialization
::http::config -useragent "$urllog_name/$urllog_version"
if {$http_proxy != 0} {
  ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
}

if {$http_tls_support != 0} {
  package require tls
  ::http::register https 443 [list ::tls::socket -request 1 -require 1 -cadir "/etc/certs/"]
}

### SQLite database initialization
if {[catch {sqlite3 urldb $urllog_db_file} uerrmsg]} {
  putlog " Could not open SQLite3 database '$urllog_db_file': $uerrmsg"
  exit 2
}


if {$http_proxy != 0} {
  putlog " (Using proxy $http_proxy_host:$http_proxy_port)"
}

if {$urllog_check != 0} {
  putlog " (Additional URL validity checks enabled)"
}

if {$urllog_verbose != 0} {
  putlog " (Verbose mode enabled)"
}

if {$urllog_search != 0} {
  putlog " (Search commands enabled)"
}

#-------------------------------------------------------------------------
### Utility functions
proc urllog_log {arg} {
  global urllog_logmsg urllog_name

  if {$urllog_logmsg != 0} {
    putlog "$urllog_name: $arg"
  }
}


proc urllog_ctime { utime } {

  if {$utime == "" || $utime == "*"} {
    set utime 0
  }

  return [clock format $utime -format "%d.%m.%Y %H:%M"]
}


proc urllog_isnumber {uarg} {

  foreach i [split $uarg {}] {
    if {![string match \[0-9\] $i]} { return 0 }
  }

  return 1
}


proc urllog_msg {apublic anick achan amsg} {
  global urllog_preferredmsg

  if {$apublic == 1} {
    putserv "$urllog_preferredmsg $achan :$amsg"
  } else {
    putserv "$urllog_preferredmsg $anick :$amsg" 
  }
}


proc urllog_verb_msg {anick achan amsg} {
  global urllog_verbose

  if {$urllog_verbose != 0} {
    urllog_msg 1 $anick $achan $amsg
  }
}


proc urllog_convert_ent {udata} {
  global urllog_html_ent
  regsub -all "  " $udata " " utmp
  regsub -all "\r" $udata " " utmp
  regsub -all "\n" $utmp " " utmp
  regsub -all "  *" $utmp " " utmp
  regsub -all "\t" $utmp "" utmp
  return [string map -nocase $urllog_html_ent $utmp]
}


proc urllog_escape { str } {
  return [string map {' ''} $str]
}

#-------------------------------------------------------------------------
proc urllog_get_short {utime} {
  global urllog_shorturl_prefix

  set ustr "ABCDEFGHIJKLNMOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
  set ulen [string length $ustr]

  set u1 [expr $utime / ($ulen * $ulen)]
  set utmp [expr $utime % ($ulen * $ulen)]
  set u2 [expr $utmp / $ulen]
  set u3 [expr $utmp % $ulen]

  return "\[ $urllog_shorturl_prefix[string index $ustr $u1][string index $ustr $u2][string index $ustr $u3] \]"
}


#-------------------------------------------------------------------------
proc urllog_chop_url {url} {
  global urllog_shorturl_orig

  if {[string length $url] > $urllog_shorturl_orig} {
    return "[string range $url 0 $urllog_shorturl_orig]..."
  } else {
    return $url
  }
}

#-------------------------------------------------------------------------
proc urllog_addurl {urlStr urlNick urlHost urlChan urlTitle} {
  global urldb urlmsg_alreadyknown urllog_shorturl

   ### Let's check if we already know the URL
   set tmpURL [urllog_escape $urlStr]
  urldb eval "SELECT id AS urlID, utime AS utime, url AS uurl, user AS uuser, host AS uhost, chan AS uchan FROM urls WHERE url='$tmpURL'" {
    urllog_log "URL said by $urlNick ($urlStr) already known"
    if {$urllog_shorturl != 0} {
      set qstr "[urllog_get_short $urlID] "
    } else {
      set qstr ""
    }
    append qstr "($uuser/$uchan@[urllog_ctime $utime])"
    if {[string length $urlTitle] > 0} {
      set qstr "$urlmsg_alreadyknown - '$urlTitle' $qstr"
    } else {
      set qstr "$urlmsg_alreadyknown $qstr"
    }
    urllog_verb_msg $urlNick $urlChan $qstr
    return 0
  }


  ### OK, the URL was not already known - thus we add it
  set sql "INSERT INTO urls (utime,url,user,host,chan) VALUES ([unixtime], '[urllog_escape $urlStr]', '[urllog_escape $urlNick]', '[urllog_escape $urlHost]', '[urllog_escape $urlChan]')"
  if {[catch {urldb eval $sql} uerrmsg]} {
    urllog_log "$uerrmsg on SQL:\n$sql"
    return 0
  }
  set urlID [urldb last_insert_rowid]
  urllog_log "Added URL ($urlNick@$urlChan): $urlStr"


  ### Let's say something, to confirm that everything went well.
  if {$urllog_shorturl != 0} {
    set qstr "[urllog_get_short $urlID] "
  } else {
    set qstr ""
  }
  if {[string length $urlTitle] > 0} {
    urllog_verb_msg $urlNick $urlChan "'$urlTitle' ([urllog_chop_url $urlStr]) $qstr"
  } else {
    urllog_verb_msg $urlNick $urlChan "[urllog_chop_url $urlStr] $qstr"
  }

  return 1
}


#-------------------------------------------------------------------------
proc urllog_http_handler {utoken utotal ucurr} {
  upvar #0 $utoken state

  # Stop fetching data after 3000 bytes, this should be enough to
  # contain the head section of a HTML page.
  if {$ucurr > 64000} {
    set state(status) "ok"
  }
}

#-------------------------------------------------------------------------
proc urllog_checkurl {urlStr urlNick urlHost urlChan} {
  global urllog_tlds urllog_check urlmsg_nosuchhost urlmsg_ioerror
  global urlmsg_timeout urlmsg_errorgettingdoc urllog_httprep
  global urllog_shorturl_prefix urllog_shorturl urllog_encoding

  ### Print status to bot's log
  urllog_log "$urlStr ($urlNick@$urlChan)"

  ### Try to determine the URL protocol component (if it is missing)
  set u_checktld 1
  if {[string match "*www.*" $urlStr] && ![string match "http://*" $urlStr] && ![string match "https://*" $urlStr]} {
    set urlStr "http://$urlStr"
  } elseif {[string match "*ftp.*" $urlStr] && ![string match "ftp://*" $urlStr]} {
    set urlStr "ftp://$urlStr"
  }

  if {[regexp "(ftp|http|https)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr u_match u_prefix ni1 ni2 ni3 ni4]} {
    # Check if the IP is on local network
    if {($ni1 == 127) || ($ni1 == 10) || ($ni1 == 192 && $ni2 == 168) || ($ni1 == 0)} {
      urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
      return 0
    }

    # Skip TLD check for URLs with IP address
    set u_checktld 0
  }

  if {$urllog_shorturl != 0 && [string match "*$urllog_shorturl_prefix*" $urlStr]} {
    urllog_log "Ignoring ShortURL."
    return 0
  }

  ### Check the PORT (if the ":" is there)
  set u_record [split $urlStr "/"]
  set u_hostname [lindex $u_record 2]
  set u_port [lindex [split $u_hostname ":"] end]

  if {![urllog_isnumber $u_port] && $u_port != "" && $u_port != $u_hostname} {
    urllog_log "Broken URL from $urlNick: ($urlStr) illegal port $u_port"
    return 0
  }

  # Default to port 80 (HTTP)
  if {![urllog_isnumber $u_port]} {
    set u_port 80
  }

  ### Is it a http or ftp url? (FIX ME!)
  if {[string range $urlStr 0 3] != "http" && [string range $urlStr 0 2] != "ftp"} {
    urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED TYPE (not HTTP or FTP)"
    return 0
  }

  ### Check the Top Level Domain (TLD) validity
  if {$u_checktld != 0} {
    set u_sane [lindex [split $u_hostname "."] end]
    set u_tld [lindex [split $u_sane ":"] 0]
    set u_found 0

    if {[string length $u_tld] == 2} {
      # Assume all 2-letter domains to be valid :)
      set u_found 1
    } else {
      # Check our list of known TLDs
      foreach itld $urllog_tlds {
        if {[string match $itld $u_tld]} {
          set u_found 1
        }
      }
    }

    if {$u_found == 0} {
      urllog_log "Broken URL from $urlNick: ($urlStr) illegal TLD: $u_tld."
      return 0
    }
  }

  set urlStr [string map $urllog_httprep $urlStr]


  ### Do we perform additional optional checks?
  if {$urllog_check == 0 || [string range $urlStr 0 4] != "http:"} {
    # No optional checks, just add the URL
    urllog_addurl $urlStr $urlNick $urlHost $urlChan ""
    return 1
  }

  ### Does the document pointed by the URL exist?
  if {[catch {set utoken [::http::geturl $urlStr -progress urllog_http_handler -blocksize 1024 -timeout 3000]} uerrmsg]} {
    urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
    urllog_log "HTTP request failed: $uerrmsg"
    return 0
  }

  if {[::http::status $utoken] == "timeout"} {
    urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
    urllog_log "HTTP request timed out ($urlStr)"
    return 0
  }

  if {[::http::status $utoken] != "ok"} {
    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::error $utoken])"
    urllog_log "Error in HTTP transaction: [::http::error $utoken] ($urlStr)"
    return 0
  }

  # Fixme! Handle redirects!
  set ucode [::http::ncode $utoken]
  if {$ucode >= 200 && $ucode <= 309} {
    set udata [::http::data $utoken]
    set umatches [regexp -nocase -inline -- "<meta.\*\?content=\".\*\?charset=(\[^\"\]*)\"/>" $udata]
    set uconvert 0
    if {[llength $umatches] > 0} {
      set uencoding [lindex $umatches 1]
      if {[string length $uencoding] > 3} {
        set uconvert 1
      }
    }

    set umatches [regexp -nocase -inline -- "<title>(.\*\?)</title>" $udata]
    if {[llength $umatches] > 0} {
      set urlTitle [lindex $umatches 1]
      if {$uconvert != 0} {
        if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} {
          urllog_log "Error in charset conversion: $cerrmsg"
        }
      }
      set urlTitle [urllog_convert_ent $urlTitle]
      regsub -all "(^ *| *$)" $urlTitle "" urlTitle
    } else {
      set urlTitle ""
    }

    # Rasiatube hack
    if {[string match "*/rasiatube/view*" $urlStr]} {
      set rasia 0
      set umatches [regexp -nocase -inline -- "<link rel=\"video_src\"\.\*\?file=(http://\[^&\]+)&" $udata]
      if {[llength $umatches] > 0} {
        set urlStr [lindex $umatches 1]
        regsub -all "\/v\/" $urlStr "\/watch\?v=" urlStr
        set rasia 1
      } else {
        set umatches [regexp -nocase -inline -- "SWFObject.\"(\[^\"\]+)\", *\"flashvideo" $udata]
        if {[llength $umatches] > 0} {
          set urlStr [lindex $umatches 1]
          regsub "http:\/\/www.dailymotion.com\/swf\/" $urlStr "http:\/\/www.dailymotion.com\/video\/" urlStr
          set rasia 1
        }
      }
      
      if {$rasia != 0} {
        urllog_log "RasiaTube mangler: $urlStr"
        urllog_verb_msg $urlNick $urlChan "Korjataan haiseva rasiatube-linkki: $urlStr"
      }
    }

    urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle
    return 1
  } else {
    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ([::http::code $utoken])"
    urllog_log "[::http::code $utoken] - $urlStr"
  }

  ::http::cleanup $utoken
}


#-------------------------------------------------------------------------
proc urllog_checkmsg {nick uhost hand chan text} {
  ### Check the nick
  if {$nick == "*"} {
    urllog_log "urllog_checkmsg: nick was wc, this should not happen."
    return 0
  }

  ### Do the URL checking
  foreach istr [split $text " "] {
    if {[regexp "(ftp|http|https)://|www\..+|ftp\..*" $istr]} {
      urllog_checkurl $istr $nick $uhost $chan
    }
  }

  return 0
}


#-------------------------------------------------------------------------
### Parse arguments, find and show the results
proc urllog_find {unick uhand uchan utext upublic} {
  global urllog_shorturl urldb
  global urllog_showmax_pub urllog_showmax_priv urlmsg_nomatch

  if {$upublic == 0} {
    set ulimit 5
  } else {
    set ulimit 3
  }

  ### Parse the given command
  urllog_log "$unick/$uhand searched URL: $utext"

  set ftokens [split $utext " "]
  set fpatlist ""
  foreach ftoken $ftokens {
    set fprefix [string range $ftoken 0 0]
    set fpattern [string range $ftoken 1 end]

    if {$fprefix == "-"} {
      lappend fpatlist "url NOT LIKE '%[urllog_escape $fpattern]%'"
    } elseif {$fprefix == "%"} {
      lappend fpatlist "user='[urllog_escape $fpattern]'"
    } elseif {$fprefix == "@"} {
      # foo
    } else {
      lappend fpatlist "url LIKE '%[urllog_escape $ftoken]%'"
    }
  }

  if {[llength $fpatlist] > 0} {
    set fquery "WHERE [join $fpatlist " AND "]"
  } else {
    set fquery ""
  }

  set iresults 0
  set usql "SELECT id AS urlID, utime AS utime, url AS uurl, user AS uuser, host AS uhost FROM urls $fquery ORDER BY utime DESC LIMIT $ulimit"
  urldb eval $usql {
    incr iresults
    set shortURL $uurl
    if {$urllog_shorturl != 0 && $urlID != ""} {
      set shortURL "$shortURL [urllog_get_short $urlID]"
    }
    urllog_msg $upublic $unick $uchan "#$iresults: $shortURL ($uuser@[urllog_ctime $utime])"
  }
  
  if {$iresults == 0} {
    # If no URLs were found
    urllog_msg $upublic $unick $uchan $urlmsg_nomatch
  }

  return 0
}


#-------------------------------------------------------------------------
### Finding binded functions
proc urllog_pub_urlfind {unick uhost uhand uchan utext} {
  urllog_find $unick $uhand $uchan $utext 1
  return 0
}


proc urllog_msg_urlfind {unick uhost uhand utext} {
  urllog_find $unick $uhand "" $utext 0
  return 0
}


# end of script