view urllog.tcl @ 456:102dc89488af

urllog: Improve how http headers are formed.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 16 Oct 2017 11:28:56 +0300
parents e5810c52d376
children a7029d65796b
line wrap: on
line source

##########################################################################
#
# URLLog v2.4.3 by Matti 'ccr' Hamalainen <ccr@tnsp.org>
# (C) Copyright 2000-2017 Tecnic Software productions (TNSP)
#
# This script is freely distributable under GNU GPL (version 2) license.
#
##########################################################################
#
# URL-logger script for EggDrop IRC robot, utilizing SQLite3 database
# This script requires SQLite TCL extension. Under Debian, you need:
# tcl8.5 libsqlite3-tcl (and eggdrop eggdrop-data, of course)
#
# NOTICE! If you are upgrading to URLLog v2.0+ from any 1.x version, you
# may want to run a conversion script against your URL-database file,
# if you wish to preserve the old data.
#
# See convert_urllog_db.tcl for more information.
#
# If you are doing a fresh install, you will need to create the
# initial SQLite3 database with the required table schemas. You
# can do that by running: create_urllog_db.tcl
#
##########################################################################

### The configuration should be in config.urllog in same directory
### as this script. Or change the line below to point where ever
### you wish. See "config.urllog.example" for an example config file.
source [file dirname [info script]]/config.urllog

### Required utillib.tcl (provides utl_escape, utl_ctime,
### utl_match_delim_list, utl_convert_html_ent used below)
source [file dirname [info script]]/utillib.tcl


##########################################################################
# No need to look below this line
##########################################################################
set urllog_name "URLLog"
set urllog_version "2.4.3"

# Convert the comma-separated TLD string from the config into a Tcl list
set urllog_tld_list [split $urllog_tld_list ","]
# Alternating from|to pairs used with [string map] to percent-encode
# special characters in URLs before storing/displaying them
set urllog_httprep [split "\@|%40|{|%7B|}|%7D|\[|%5B|\]|%5D" "|"]


### Require packages
package require sqlite3
package require http

### Binding initializations
# Public and private "!urlfind" search commands, plus passive URL
# harvesting from all channel messages (pubm) and topic changes (topc).
bind pub - !urlfind urllog_pub_urlfind
bind msg - !urlfind urllog_msg_urlfind
bind pubm - *.* urllog_check_line
bind topc - *.* urllog_check_line


### Initialization messages
set urllog_message "$urllog_name v$urllog_version (C) 2000-2017 ccr/TNSP"
putlog "$urllog_message"

### Miscellaneous init messages
if {$urllog_extra_checks != 0} {
  putlog " (Additional URL validity checks enabled)"
}

if {$urllog_check_tld != 0} {
  putlog " (Check TLD)"
}

if {$urllog_verbose != 0} {
  putlog " (Verbose mode enabled)"
}

### HTTP module initialization
# Use the User-Agent from the config when set, otherwise identify
# as "URLLog/<version>".
if {[info exists http_user_agent] && $http_user_agent != ""} {
  ::http::config -useragent $http_user_agent
} else {
  ::http::config -useragent "$urllog_name/$urllog_version"
}

if {[info exists http_use_proxy] && $http_use_proxy != 0} {
  ::http::config -proxyhost $http_proxy_host -proxyport $http_proxy_port
  putlog " (Using proxy $http_proxy_host:$http_proxy_port)"
}

# NOTE(review): "-tls1 1" enables TLS 1.0 only; many modern servers
# require TLS 1.2+ -- confirm against the installed tls package.
if {[info exists http_tls_support] && $http_tls_support != 0} {
  package require tls
  ::http::register https 443 [list ::tls::socket -request 1 -require 1 -tls1 1 -cadir $http_tls_cadir]
  putlog " (TLS/SSL support enabled)"
}

### SQLite database initialization
# Open the URL database; "urldb" becomes the database command object
# used by all procs below. Hard exit when the database cannot be opened.
if {[catch {sqlite3 urldb $urllog_db_file} uerrmsg]} {
  putlog " Could not open SQLite3 database '$urllog_db_file': $uerrmsg"
  exit 2
}


#-------------------------------------------------------------------------
### Utility functions
# Write a message to the bot log, prefixed with the script name.
# Does nothing unless logging is enabled in the configuration.
proc urllog_log {arg} {
  global urllog_log_enable urllog_name

  if {$urllog_log_enable == 0} {
    return
  }
  putlog "$urllog_name: $arg"
}


# Return 1 when uarg consists solely of ASCII digits, 0 otherwise.
# The empty string yields 1, matching the behavior of the original
# per-character loop (callers treat "" as "no port given").
proc urllog_isnumber {uarg} {
  # Anchored regexp replaces the old [split]/[string match] loop;
  # {^[0-9]*$} accepts "" exactly like the original did.
  return [regexp {^[0-9]*$} $uarg]
}


# Send amsg either to the channel (apublic == 1) or privately to the
# nick, using the message command configured in urllog_preferredmsg.
proc urllog_msg {apublic anick achan amsg} {
  global urllog_preferredmsg

  set target [expr {$apublic == 1 ? $achan : $anick}]
  putserv "$urllog_preferredmsg $target :$amsg"
}


# Like urllog_msg, but only speaks when verbose mode is enabled in the
# configuration; always sends as a public (channel) message.
proc urllog_verb_msg {anick achan amsg} {
  global urllog_verbose

  if {$urllog_verbose == 0} {
    return
  }
  urllog_msg 1 $anick $achan $amsg
}


# Normalize a character set name: strip a possible "xx_YY." locale
# prefix (e.g. "fi_FI.UTF-8" -> "UTF-8"), lowercase the result and
# fold a leading "iso-" into "iso" (so "ISO-8859-1" -> "iso8859-1").
proc urllog_sanitize_encoding {uencoding} {
  regsub -- {^[a-z][a-z]_[A-Z][A-Z]\.} $uencoding {} enc
  set enc [string tolower $enc]
  regsub -- {^iso-} $enc "iso" enc
  return $enc
}


#-------------------------------------------------------------------------
# Alphabet used for encoding short-URL identifiers.
# NOTE(review): "...KLNM..." has N/M transposed from alphabetical order.
# Do NOT "fix" it -- that would remap every previously issued short URL.
set urllog_shorturl_str "ABCDEFGHIJKLNMOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"

# Encode a database row id (utime) into a three-character base-62 token
# and return it wrapped as "[ <prefix><token> ]". Covers ids 0..62^3-1.
proc urllog_get_short {utime} {
  global urllog_shorturl_prefix urllog_shorturl_str

  set ulen [string length $urllog_shorturl_str]

  # Braced expr arguments: byte-compiled and immune to double
  # substitution (the originals were unbraced).
  set u1 [expr {$utime / ($ulen * $ulen)}]
  set utmp [expr {$utime % ($ulen * $ulen)}]
  set u2 [expr {$utmp / $ulen}]
  set u3 [expr {$utmp % $ulen}]

  return "\[ $urllog_shorturl_prefix[string index $urllog_shorturl_str $u1][string index $urllog_shorturl_str $u2][string index $urllog_shorturl_str $u3] \]"
}


#-------------------------------------------------------------------------
# Shorten an URL for display: anything longer than the configured
# urllog_shorturl_orig limit is cut and suffixed with "...".
proc urllog_chop_url {url} {
  global urllog_shorturl_orig

  set maxLen $urllog_shorturl_orig
  if {[string length $url] <= $maxLen} {
    return $url
  }
  return "[string range $url 0 $maxLen]..."
}


#-------------------------------------------------------------------------
# Check whether urlStr is already stored in the database. On a match,
# log it and (for channels matching urllog_msg_channels) announce the
# earlier sighting. Returns 0 when the URL is already known, 1 when it
# is new -- note the inverted sense: callers add the URL only on 1.
proc urllog_exists {urlStr urlNick urlHost urlChan} {
  global urldb urlmsg_alreadyknown urllog_shorturl
  global urllog_msg_channels

  # utl_escape quotes the value for embedding into the SQL literal
  set usql "SELECT id AS uid, utime AS utime, url AS uurl, user AS uuser, host AS uhost, chan AS uchan, title AS utitle FROM urls WHERE url='[utl_escape $urlStr]'"
  urldb eval $usql {
    urllog_log "URL said by $urlNick ($urlStr) already known"
    # Prepend the short-URL form when that feature is enabled
    if {$urllog_shorturl != 0} {
      set qstr "[urllog_get_short $uid] "
    } else {
      set qstr ""
    }
    append qstr "($uuser/$uchan@[utl_ctime $utime])"
    if {[string length $utitle] > 0} {
      set qstr "$urlmsg_alreadyknown - '$utitle' $qstr"
    } else {
      set qstr "$urlmsg_alreadyknown $qstr"
    }

    if {[utl_match_delim_list $urllog_msg_channels $uchan]} {
      urllog_verb_msg $urlNick $urlChan $qstr
    }
    # "return" inside the sqlite eval script returns from this proc
    # on the first matching row
    return 0
  }
  return 1
}


#-------------------------------------------------------------------------
# Insert a new URL row into the database and, in verbose mode, announce
# it on the channel (with short-URL id when enabled). An empty urlTitle
# is stored as SQL NULL. Returns 1 on success, 0 on a database error.
proc urllog_addurl {urlStr urlNick urlHost urlChan urlTitle} {
  global urldb urllog_shorturl

  # Empty title -> NULL, otherwise an escaped, quoted SQL string literal
  if {$urlTitle == ""} {
    set uins "NULL"
  } else {
    set uins "'[utl_escape $urlTitle]'"
  }
  # [unixtime] is the eggdrop built-in returning the current UNIX time
  set usql "INSERT INTO urls (utime,url,user,host,chan,title) VALUES ([unixtime], '[utl_escape $urlStr]', '[utl_escape $urlNick]', '[utl_escape $urlHost]', '[utl_escape $urlChan]', $uins)"
  if {[catch {urldb eval $usql} uerrmsg]} {
    urllog_log "$uerrmsg on SQL:\n$usql"
    return 0
  }
  # Row id doubles as the short-URL identifier
  set uid [urldb last_insert_rowid]
  urllog_log "Added URL ($urlNick@$urlChan): $urlStr"


  ### Let's say something, to confirm that everything went well.
  if {$urllog_shorturl != 0} {
    set qstr "[urllog_get_short $uid] "
  } else {
    set qstr ""
  }
  if {[string length $urlTitle] > 0} {
    urllog_verb_msg $urlNick $urlChan "'$urlTitle' ([urllog_chop_url $urlStr]) $qstr"
  } else {
    urllog_verb_msg $urlNick $urlChan "[urllog_chop_url $urlStr] $qstr"
  }

  return 1
}


#-------------------------------------------------------------------------
# Perform one HTTP GET of urlStr with a 6 second timeout. Results are
# delivered into the caller's variables named by urlStatus/urlSCode/
# urlCode/urlData/urlMeta (status string, human-readable code, numeric
# code, response body, response headers array). Returns 1 on success,
# 0 on connect failure, timeout or transaction error (after notifying
# the channel/log).
proc urllog_dorequest { urlNick urlChan urlStr urlStatus urlSCode urlCode urlData urlMeta } {
  global urlmsg_ioerror urlmsg_timeout urlmsg_errorgettingdoc

  upvar 1 $urlStatus ustatus
  upvar 1 $urlSCode uscode
  upvar 1 $urlCode ucode
  upvar 1 $urlData udata
  upvar 1 $urlMeta umeta

  # Ask for an un-compressed response; keep-alive for redirect chains
  set urlHeaders {}
  lappend urlHeaders "Accept-Encoding" "identity"
  lappend urlHeaders "Connection" "keep-alive"

  if {[catch {set utoken [::http::geturl $urlStr -timeout 6000 -binary 1 -headers $urlHeaders]} uerrmsg]} {
    urllog_verb_msg $urlNick $urlChan "$urlmsg_ioerror ($uerrmsg)"
    urllog_log "HTTP request failed: $uerrmsg"
    return 0
  }

  set ustatus [::http::status $utoken]
  if {$ustatus == "timeout"} {
    urllog_verb_msg $urlNick $urlChan "$urlmsg_timeout"
    urllog_log "HTTP request timed out ($urlStr)"
    # BUGFIX: the token was previously leaked on the timeout path
    ::http::cleanup $utoken
    return 0
  }

  if {$ustatus != "ok"} {
    set uerror [::http::error $utoken]
    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ($uerror)"
    urllog_log "Error in HTTP transaction: $uerror ($urlStr)"
    # BUGFIX: the token was previously leaked on the error path
    ::http::cleanup $utoken
    return 0
  }

  set uscode [::http::code $utoken]
  set ucode [::http::ncode $utoken]
  set udata [::http::data $utoken]
  # BUGFIX: "array set" merges into an existing array; clear it first so
  # stale headers (e.g. Location from a previous redirect) cannot linger
  array unset umeta
  array set umeta [::http::meta $utoken]
  ::http::cleanup $utoken

  return 1
}

#-------------------------------------------------------------------------
# Validate and normalize an URL candidate. The urlMStr/urlMProto/
# urlMHostName arguments are caller variable NAMES (upvar): on return
# urlStr may have a guessed protocol prefix added and percent-encoded
# characters mapped, and urlProto/urlHostName hold the parsed protocol
# and host components. Returns 1 when the URL looks acceptable, 0 when
# it should be ignored (local IP, own short URL, bad port, unsupported
# protocol or unknown TLD).
proc urllog_validate_url { urlNick urlChan urlMStr urlMProto urlMHostName } {
  global urllog_tld_list urlmsg_nosuchhost urllog_httprep urlmsg_unknown_tld
  global urllog_shorturl_prefix urllog_shorturl urllog_check_tld
  upvar 1 $urlMStr urlStr
  upvar 1 $urlMProto urlProto
  upvar 1 $urlMHostName urlHostName

  ### Try to guess the URL protocol component (if it is missing)
  set u_checktld 1
  if {![string match "http://*" $urlStr] && ![string match "https://*" $urlStr] && ![string match "ftp://*" $urlStr]} {
    if {[string match "*www.*" $urlStr]} {
      set urlStr "http://$urlStr"
    } elseif {[string match "*ftp.*" $urlStr]} {
      set urlStr "ftp://$urlStr"
    }
  }

  ### Handle URLs that have an IPv4-address
  if {[regexp "(\[a-z\]+)://(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})\\.(\[0-9\]{1,3})" $urlStr urlMatch urlProto ni1 ni2 ni3 ni4]} {
    # Check if the IP is on local network (loopback, RFC1918 10/8 and
    # 192.168/16, or the invalid 0/8 range)
    if {$ni1 == 127 || $ni1 == 10 || ($ni1 == 192 && $ni2 == 168) || $ni1 == 0} {
      urllog_log "URL pointing to local or invalid network, ignored ($urlStr)."
      return 0
    }
    # Skip TLD check for URLs with IP address
    set u_checktld 0
  }

  ### Check now if we have an ShortURL here ...
  # Re-announcing our own short URLs would loop, so they are dropped.
  if {[string match "$urllog_shorturl_prefix*" $urlStr]} {
    urllog_log "Ignoring ShortURL from $urlNick: $urlStr"
#    set uud ""
#    set usql "SELECT id AS uid, url AS uurl, user AS uuser, host AS uhost, chan AS uchan, title AS utitle FROM urls WHERE utime=$uud"
#    urldb eval $usql {
#      urllog_verb_msg $urlNick $urlChan "'$utitle' - $uurl"
#      return 1
#    }
    return 0
  }

  ### Get URL protocol component
  set urlProto ""
  regexp "(\[a-z\]+)://" $urlStr urlMatch urlProto

  ### Check the PORT (if the ":" is there)
  # Host is the third "/"-separated field of proto://host[:port]/path;
  # when no ":" is present, the port equals the whole hostname and the
  # check below deliberately passes.
  set urlRecord [split $urlStr "/"]
  set urlHostName [lindex $urlRecord 2]
  set urlPort [lindex [split $urlHostName ":"] end]

  if {![urllog_isnumber $urlPort] && $urlPort != "" && $urlPort != $urlHostName} {
    urllog_log "Broken URL from $urlNick: ($urlStr) illegal port $urlPort"
    return 0
  }

  ### Is it a http or ftp url?
  if {$urlProto != "http" && $urlProto != "https" && $urlProto != "ftp"} {
    urllog_log "Broken URL from $urlNick: ($urlStr) UNSUPPORTED protocol class ($urlProto)."
    return 0
  }

  ### Check the Top Level Domain (TLD) validity
  if {$urllog_check_tld != 0 && $u_checktld != 0} {
    # Last dot-separated component, with any ":port" suffix removed
    set u_sane [lindex [split $urlHostName "."] end]
    set u_tld [lindex [split $u_sane ":"] 0]
    set u_found 0

    if {[string length $u_tld] == 2} {
      # Assume all 2-letter domains to be valid :)
      set u_found 1
    } else {
      # Check our list of known TLDs (glob-style match per entry)
      foreach itld $urllog_tld_list {
        if {[string match $itld $u_tld]} {
          set u_found 1
        }
      }
    }

    if {$u_found == 0} {
      urllog_log "Broken URL from $urlNick: ($urlStr) unknown TLD: ${u_tld}."
      urllog_verb_msg $urlNick $urlChan $urlmsg_unknown_tld
      return 0
    }
  }

  # Percent-encode @ { } [ ] via the urllog_httprep map table
  set urlStr [string map $urllog_httprep $urlStr]
  return 1
}


#-------------------------------------------------------------------------
# Resolve a HTTP redirect "Location" value against the current protocol
# and hostname. Absolute URLs are returned as-is; relative ones get a
# leading "/" (when missing) and "<proto>://<host>" prepended.
proc urllog_resolve_location {urlProto urlHostName urlLoc} {
  if {![regexp "\[a-z\]+://" $urlLoc]} {
    if {[string range $urlLoc 0 0] != "/"} {
      # BUGFIX: the old code APPENDED "/" to the end of a relative
      # path ("foo/bar" -> "foo/bar/"); the path separator must be
      # prepended instead ("foo/bar" -> "/foo/bar").
      set urlLoc "/$urlLoc"
    }
    set urlLoc "${urlProto}://${urlHostName}${urlLoc}"
  }
  return $urlLoc
}


# Main URL processing: skip duplicates, validate the URL, optionally
# fetch the document (following up to two redirect levels), extract the
# title, convert its character set, and store the result. Always called
# per candidate token from urllog_check_line.
proc urllog_check_url {urlStr urlNick urlHost urlChan} {
  global urllog_encoding http_tls_support urlmsg_errorgettingdoc
  global urllog_extra_checks urllog_extra_strict urllog_rasiatube_hack

  ### Does the URL already exist?
  if {![urllog_exists $urlStr $urlNick $urlHost $urlChan]} {
    return 1
  }

  ### Validate URL compoments, etc.
  set urlProto ""
  set urlHostName ""
  if {![urllog_validate_url $urlNick $urlChan urlStr urlProto urlHostName]} {
    return 1
  }

  ### Do we perform additional checks?
  if {$urllog_extra_checks == 0 || !(($http_tls_support != 0 && $urlProto == "https") || $urlProto == "http")} {
    # No optional checks, or it's not http/https.
    if {$urllog_extra_strict == 0} {
      # Strict checking disabled, so add the URL, if it does not exist already.
      urllog_addurl $urlStr $urlNick $urlHost $urlChan ""
      return 1
    } elseif {$http_tls_support == 0 && $urlProto == "https"} {
      # Strict ENABLED: If TLS support is disabled and we have https, do nothing
      return 1
    } elseif {$urlProto != "http" && $urlProto != "https"} {
      # Strict ENABLED: It's not http, or https
      return 1
    }
  }

  ### Does the document pointed by the URL exist?
  if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} {
    return 1
  }

  ### Follow HTTP redirects, at most two levels (previously this logic
  ### was duplicated inline for the "1st" and "2nd" level redirects)
  for {set uredir 0} {$uredir < 2 && $ucode >= 301 && $ucode <= 302} {incr uredir} {
    if {![info exists umeta(Location)]} {
      # Redirect status without a Location header; nothing to follow
      urllog_log "Redirect ($ucode) without Location header: $urlStr"
      break
    }
    set nurlStr [urllog_resolve_location $urlProto $urlHostName $umeta(Location)]
    if {$uredir == 0} {
      urllog_log "Redirection: $urlStr -> $nurlStr"
    } else {
      urllog_log "Redirection #2: $urlStr -> $nurlStr"
    }
    set urlStr $nurlStr

    if {![urllog_validate_url $urlNick $urlChan urlStr urlProto urlHostName]} {
      return 1
    }

    if {![urllog_dorequest $urlNick $urlChan $urlStr ustatus uscode ucode udata umeta]} {
      return 1
    }
  }

  # Final document
  if {$ucode >= 200 && $ucode <= 205} {
    set uenc_doc ""
    set uenc_http ""
    set uencoding ""

    # Get information about specified character encodings
    if {[info exists umeta(Content-Type)] && [regexp -nocase {charset\s*=\s*([a-z0-9._-]+)} $umeta(Content-Type) umatches uenc_http]} {
      # Found character set encoding information in HTTP headers
    }

    if {[regexp -nocase -- "<meta.\*\?content=\"text/html.\*\?charset=(\[^\"\]*)\".\*\?/\?>" $udata umatches uenc_doc]} {
      # Found old style HTML meta tag with character set information
    } elseif {[regexp -nocase -- "<meta.\*\?charset=\"(\[^\"\]*)\".\*\?/\?>" $udata umatches uenc_doc]} {
      # Found HTML5 style meta tag with character set information
    }

    # Make sanitized versions of the encoding strings
    set uenc_http2 [urllog_sanitize_encoding $uenc_http]
    set uenc_doc2 [urllog_sanitize_encoding $uenc_doc]

    # Prefer the HTTP header charset, then the document-declared one,
    # finally fall back to the HTTP/1.1 default of iso8859-1
    set uencoding $uenc_http2
    if {$uencoding == "" && $uenc_doc2 != ""} {
      set uencoding $uenc_doc2
    } elseif {$uencoding == ""} {
      # If _NO_ known encoding of any kind, assume the default of iso8859-1
      set uencoding "iso8859-1"
    }

    urllog_log "Charsets: http='$uenc_http', doc='$uenc_doc' / sanitized http='$uenc_http2', doc='$uenc_doc2' -> '$uencoding'"

    # Get the document title, if any
    set urlTitle ""
    set tmpRes [regexp -nocase -- "<title.\*\?>(.\*\?)</title>" $udata umatches urlTitle]

    # If facebook, get meta info
    if {[regexp -nocase -- "(http|https):\/\/www.facebook.com" $urlStr]} {
      if {[regexp -nocase -- "<meta name=\"description\" content=\"(.\*\?)\"" $udata umatches urlTmp]} {
        if {$urlTitle != ""} { append urlTitle " :: " }
        append urlTitle $urlTmp
      }
    }

    # If character set conversion is required, do it now
    if {$urlTitle != "" && $uencoding != ""} {
      if {[catch {set urlTitle [encoding convertfrom $uencoding $urlTitle]} cerrmsg]} {
        urllog_log "Error in charset conversion: $cerrmsg"
      }

      # Convert some HTML entities to plaintext and do some cleanup
      set utmp [utl_convert_html_ent $urlTitle]
      regsub -all "\r|\n|\t" $utmp " " utmp
      regsub -all "  *" $utmp " " utmp
      set urlTitle [string trim $utmp]
    }

    # Rasiatube hack
    if {$urllog_rasiatube_hack != 0 && [string match "*/rasiatube/view*" $urlStr]} {
      set rasia 0
      if {[regexp -nocase -- "<link rel=\"video_src\"\.\*\?file=(http://\[^&\]+)&" $udata umatches utmp]} {
        regsub -all "\/v\/" $utmp "\/watch\?v=" urlStr
        set rasia 1
      } else {
        if {[regexp -nocase -- "SWFObject.\"(\[^\"\]+)\", *\"flashvideo" $udata umatches utmp]} {
          regsub "http:\/\/www.dailymotion.com\/swf\/" $utmp "http:\/\/www.dailymotion.com\/video\/" urlStr
          set rasia 1
        }
      }
      if {$rasia != 0} {
        urllog_log "RasiaTube mangler: $urlStr"
        urllog_verb_msg $urlNick $urlChan "Korjataan haiseva rasiatube-linkki: $urlStr"
      }
    }

    # Check if the URL already exists, just in case we had some redirects
    if {[urllog_exists $urlStr $urlNick $urlHost $urlChan]} {
      urllog_addurl $urlStr $urlNick $urlHost $urlChan $urlTitle
    }
    return 1
  } else {
    urllog_verb_msg $urlNick $urlChan "$urlmsg_errorgettingdoc ($ucode)"
    urllog_log "Error fetching document: status=$ustatus, code=$ucode, scode=$uscode, url=$urlStr"
  }
}


#-------------------------------------------------------------------------
# Line handler bound to pubm/topc events: scan each whitespace-separated
# token for URL-looking strings and pass them to urllog_check_url.
proc urllog_check_line {unick uhost uhand uchan utext} {
  global urllog_log_channels

  ### Check the nick
  if {$unick == "*"} {
    urllog_log "urllog_check_line: Nick was wc, this should not happen."
    return 0
  }

  ### Only process channels configured for logging
  if {![utl_match_delim_list $urllog_log_channels $uchan]} {
    return 0
  }

  ### Do the URL checking for each token of the line
  foreach utoken [split $utext " "] {
    if {[regexp "((ftp|http|https)://\[^\[:space:\]\]+|^(www|ftp)\.\[^\[:space:\]\]+)" $utoken ulink]} {
      urllog_check_url $utoken $unick $uhost $uchan
    }
  }

  return 0
}


#-------------------------------------------------------------------------
### Parse arguments, find and show the results
# Search the URL database. Each space-separated token of utext may carry
# a prefix: "-" excludes the pattern, "%" matches the user field, "+" or
# no prefix matches url/title. Results are sent publicly or privately
# depending on upublic.
proc urllog_find {unick uhand uchan utext upublic} {
  global urllog_shorturl urldb
  global urllog_showmax_pub urllog_showmax_priv urlmsg_nomatch

  # NOTE(review): the limits are hard-coded even though the
  # urllog_showmax_pub/priv config globals are imported above; they
  # look intended here -- confirm against the config before changing.
  if {$upublic == 0} {
    set ulimit 5
  } else {
    set ulimit 3
  }

  ### Parse the given command
  urllog_log "$unick/$uhand searched URL: $utext"

  set ftokens [split $utext " "]
  set fpatlist ""
  foreach ftoken $ftokens {
    set fprefix [string range $ftoken 0 0]
    set fpattern [string range $ftoken 1 end]
    # Escaped and wrapped for use as a SQL LIKE pattern literal
    set qpattern "'%[utl_escape $fpattern]%'"

    if {$fprefix == "-"} {
      lappend fpatlist "(url NOT LIKE $qpattern OR title NOT LIKE $qpattern)"
    } elseif {$fprefix == "%"} {
      lappend fpatlist "user LIKE $qpattern"
    } elseif {$fprefix == "@"} {
      # NOTE(review): "@" prefix is an unimplemented stub -- such
      # tokens are silently ignored
    } elseif {$fprefix == "+"} {
      lappend fpatlist "(url LIKE $qpattern OR title LIKE $qpattern)"
    } else {
      # No recognized prefix: match the whole token against url/title
      set qpattern "'%[utl_escape $ftoken]%'"
      lappend fpatlist "(url LIKE $qpattern OR title LIKE $qpattern)"
    }
  }

  if {[llength $fpatlist] > 0} {
    set fquery "WHERE [join $fpatlist " AND "]"
  } else {
    set fquery ""
  }

  # Newest matches first, capped at ulimit rows
  set iresults 0
  set usql "SELECT id AS uid, utime AS utime, url AS uurl, user AS uuser, host AS uhost FROM urls $fquery ORDER BY utime DESC LIMIT $ulimit"
  urldb eval $usql {
    incr iresults
    set shortURL $uurl
    if {$urllog_shorturl != 0 && $uid != ""} {
      set shortURL "$shortURL [urllog_get_short $uid]"
    }
    urllog_msg $upublic $unick $uchan "#$iresults: $shortURL ($uuser@[utl_ctime $utime])"
  }

  if {$iresults == 0} {
    # If no URLs were found
    urllog_msg $upublic $unick $uchan $urlmsg_nomatch
  }

  return 0
}


#-------------------------------------------------------------------------
### Finding binded functions
# Public "!urlfind" handler; only reacts on configured search channels.
proc urllog_pub_urlfind {unick uhost uhand uchan utext} {
  global urllog_search_channels

  if {![utl_match_delim_list $urllog_search_channels $uchan]} {
    return 0
  }
  return [urllog_find $unick $uhand $uchan $utext 1]
}


# Private-message "!urlfind" handler; results are sent back privately
# (empty channel, upublic = 0).
proc urllog_msg_urlfind {unick uhost uhand utext} {
  urllog_find $unick $uhand "" $utext 0
  return 0
}

# end of script