view parsedata.pl @ 181:9c3100ab29cc

Cosmetics.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 08 Sep 2015 10:48:48 +0300
parents eaac0a84a7d2
children 9f7eb4db99b4
line wrap: on
line source

#!/usr/bin/perl -w
#
# Fetch and parse HTML format class timetable into more sane formats
# (C) Copyright 2010-2015 Matti Hämäläinen <ccr@tnsp.org>
#
use strict;
use Data::Dumper;
use HTML::Entities;

###
### Some globals
###

# Command line settings: the alternation of accepted output mode names,
# the currently selected mode, the dump flag and the input/output filenames.
my $modes = "php|xml";
my $opt_mode = "php";
my $opt_dump = 0;
my ($opt_filename, $opt_outfile);

# Running counter used to hand out IDs to class/hour cells
my $cid = 0;

# Containers that get filled in while the timetable is being parsed
my ($hourTimes, $hourDefs, $hourTable, $hourFillTable) = ([], {}, {}, {});

# Numeric summary values, all starting from zero
my ($maxDays, $firstHour, $lastHour, $totalHours) = (0, 0, 0, 0);


# Strip leading and trailing whitespace from a string.
# An undefined argument is passed through unchanged.
sub str_trim($)
{
  my ($text) = @_;
  return $text unless defined($text);

  $text =~ s/^\s*//;
  $text =~ s/\s*$//;
  return $text;
}


# Remove and return the first element of the token list (an array
# reference), exactly as stored. Returns undef once the list is empty.
sub pop_token_a($)
{
  my ($list) = @_;
  return shift(@{$list});
}


# Remove the first token from the list and return it with surrounding
# whitespace trimmed. Returns undef once the list is empty.
sub pop_token($)
{
  my ($list) = @_;
  my $raw = pop_token_a($list);
  return str_trim($raw);
}


# Parse a double-quoted string that may be spread over several tokens.
#
# Takes a reference to the token list, consuming tokens from its front.
# The first token must begin with a double quote; tokens are then
# concatenated until one ends with a double quote (ignoring trailing
# whitespace). Returns the string without the surrounding quotes, or
# undef if the list is empty, the first token does not start with a
# quote, or the tokens run out before a closing quote is found.
sub parse_html_str($)
{
  my $tokens = $_[0];
  my $token = pop_token($tokens);
  my $str = "";

  # Nothing left to parse: bail out before touching an undefined value
  return undef unless defined($token);

  # pop_token() has already trimmed the token, so the quote must come first
  return undef unless (substr($token, 0, 1) eq '"');
  $token = substr($token, 1);
  
  while (defined($token))
  {
    my $tmp = $token;
    $tmp =~ s/\s*$//;
    # Check the length first, as substr() with a negative offset on an
    # empty string is outside of the string
    if (length($tmp) > 0 && substr($tmp, -1) eq '"')
    {
      $str .= substr($tmp, 0, -1);
      return $str;
    }
    else
    {
      # No closing quote yet, keep the token as-is and fetch the next one
      $str .= $token;
    }
    $token = shift(@$tokens);
  }
  return undef;
}


# Forward declaration, needed because the prototyped sub calls itself
sub parse_html_tree($$);

# Recursively build a node tree out of a flat list of HTML tokens.
#
#   $tokens - reference to the token list; tokens are consumed from the front
#   $tree   - hash reference of the node receiving the parsed children, which
#             are appended to its "nodes" array
#
# Each element becomes { "name", "args", "nodes" }, each non-empty text run
# becomes { "name" => "text", "args" => "", "text" }. Tags are lowercased
# (attributes included). Returns $tree when a closing tag is met or the
# tokens run out. Dies on a tag-like token that matches none of the
# recognized forms.
sub parse_html_tree($$)
{
  my ($tokens, $tree) = @_;

  # Test for definedness instead of truth, so that a text token consisting
  # of just "0" does not end the parsing prematurely.
  while (defined(my $token = pop_token($tokens)))
  {
    if ($token =~ /^<[!\/]?[a-zA-Z]+/)
    {
      $token = lc($token);
      if ($token =~ /^<\!.*>$/)
      {
        # Ignore comments etc.
      }
      elsif ($token =~ /^<([a-z][a-z0-9]*)(.*)>$/)
      {
        # Element names may contain digits after the first letter (h1, h2 ...)
        my ($name, $args) = ($1, $2);
        if ($name eq "meta" || $name eq "img")
        {
          # These elements have no closing tag and get no node of their own:
          # the rest of the input is parsed into the current node instead.
          parse_html_tree($tokens, $tree);
          return $tree;
        }
        else
        {
          my $tmp = { "name" => $name, "args" => str_trim($args) };
          parse_html_tree($tokens, $tmp);
          push(@{$$tree{"nodes"}}, $tmp);
        }
      }
      elsif ($token =~ /^<\/([a-z][a-z0-9]*)>$/)
      {
        # Closing tag ends the current nesting level
        return $tree;
      }
      else
      {
        die("HORROR TERROR ELITE: $token\n");
      }
    }
    else
    {
      # Plain text: decode entities and keep it only if something is left
      $token = str_trim(decode_entities($token));
      push(@{$$tree{"nodes"}}, { "name" => "text", "args" => "", "text" => $token }) if length($token) > 0;
    }
  }
  
  return $tree;
}


# Turn a string of HTML into a node tree.
# Returns the root node (a hash reference with empty "name" and "args"),
# or undef when the argument is undefined.
sub parse_html($)
{
  my ($str) = @_;
  return undef unless defined($str);

  # Carriage returns and newlines are replaced by plain spaces
  $str =~ tr/\r\n/  /;

  # Split into tags and the text between them, dropping blank pieces
  my @tokens = grep { $_ !~ /^\s*$/ } split(/(<\/?[a-zA-Z]+.*?>)/, $str);

  my $root = { "name" => "", "args" => "" };
  parse_html_tree(\@tokens, $root);
  return $root;
}

# Forward declaration, needed because the prototyped sub calls itself
sub html_find_node($$$);

# Depth-first search for the first node with the given name.
#
#   $node - a node (hash reference), a list of nodes (array reference) or undef
#   $name - node name to look for
#   $args - when not an empty string, used as a regular expression that the
#           node's "args" must also match
#
# Returns the matching node, or undef if there is none.
sub html_find_node($$$)
{
  my ($node, $name, $args) = @_;
  return undef unless defined($node);

  my $kind = ref($node);
  if ($kind eq "ARRAY")
  {
    # Return from inside the loop as soon as any element yields a result
    foreach my $child (@$node)
    {
      my $found = html_find_node($child, $name, $args);
      return $found if defined($found);
    }
    return undef;
  }

  return undef unless ($kind eq "HASH" && defined($$node{"name"}));

  # The node itself qualifies if the name matches and, when an argument
  # pattern was given, the arguments match as well
  my $hit = ($$node{"name"} eq $name);
  if ($hit && $args ne "")
  {
    $hit = (defined($$node{"args"}) && $$node{"args"} =~ /$args/);
  }

  # Otherwise continue the search among the children
  return $hit ? $node : html_find_node($$node{"nodes"}, $name, $args);
}


# Find the first node with the given name, ignoring its arguments.
sub fnode($$)
{
  my ($node, $name) = @_;
  return html_find_node($node, $name, "");
}


# Find the first node with the given name whose arguments match the
# given pattern.
sub fnodea($$$)
{
  my ($node, $name, $args) = @_;
  return html_find_node($node, $name, $args);
}


# Escape a string for embedding inside a quoted string literal in the
# generated PHP output. Returns the escaped copy.
#
# Besides single and double quotes, backslashes and dollar signs are
# escaped as well: inside a PHP double-quoted string a trailing backslash
# would swallow the closing quote, and "$name" would be interpolated as a
# variable.
# NOTE(review): PHP does not treat \' as an escape sequence inside double
# quotes, so the backslash in front of a single quote is kept verbatim.
# It is still added here to keep the existing output unchanged - confirm
# whether the consumers rely on it.
sub escape($)
{
  my $s = $_[0];
  $s =~ s/([\\'"\$])/\\$1/g;
  return $s;
}


# Forward declaration, needed because the prototyped sub calls itself
sub html_collapse($$);

# Flatten a node tree back into a string.
#
#   $node  - node (hash reference) to flatten
#   $strip - when true only the text is returned; otherwise every element
#            is wrapped in bare <name> ... </name> tags (without arguments)
sub html_collapse($$)
{
  my ($node, $strip) = @_;
  my $tag = $$node{"name"};

  # Text nodes contribute just their text
  return "".$$node{"text"} if ($tag eq "text");

  my @parts = ();
  push(@parts, "<".$tag.">") unless ($strip);
  foreach my $child (@{$$node{"nodes"}})
  {
    push(@parts, html_collapse($child, $strip));
  }
  push(@parts, "</".$tag.">") unless ($strip);

  return join("", @parts);
}


# Process one class/hour cell of the timetable and record it in the
# global tables.
#
#   $l       - array reference of the nodes found inside the cell
#   $rowspan - rowspan of the cell; halved to get the length in hours
#
# Updates $cid, $hourFillTable, $hourDefs, $hourTable, $maxDays and
# $totalHours. The cell is placed on the first day (0..6) that is still
# unoccupied at row $lastHour in $hourFillTable. Returns nothing useful.
sub parse_hour_data($$)
{
  my ($l, $rowspan) = @_;
  my $chours = $rowspan / 2; # The table is actually in half cells
  my $cdata = [];
  my $cturns = 0;
  my $cgrouped = 0;

  # Pull in data for the class/hour cell
  foreach my $h (@{$l})
  {
    if (defined($$h{"nodes"}))
    {
      foreach my $b (@{$$h{"nodes"}})
      {
        if (defined($$b{"nodes"}))
        {
          # Text of the first child, without a trailing period
          my $text = $$b{"nodes"}[0]{"text"};
          $text =~ s/\.$//;

          # A text starting with "vuorov" sets the "turns" flag instead of
          # being stored as data (presumably marks alternating turns - confirm)
          if ($text =~ /^vuorov/i)
          {
            $cturns = 1;
          }
          else
          {
            push(@$cdata, $text);
          }
        }
      }
    }
  }

  # Increase ID if there is data in this class/hour cell
  my $tid;
  if (scalar(@$cdata) > 0) {
    $cid++;
    $tid = $cid;
  } else {
    $tid = 0;
  }

  # Determine current day
  my $cday = 0;
  for (my $x = 0; $x < 7; $x++)
  {
    if (!defined($$hourFillTable{$lastHour}{$x}))
    {
      $cday = $x;
      last;
    }
  }

  # Mark every row covered by this cell as occupied for that day
  for (my $t = 0; $t < $chours; $t++)
  {
    $$hourFillTable{$lastHour + $t}{$cday} = $tid;
  }
  
  if ($tid)
  {
    $maxDays = $cday + 1 if ($cday + 1 > $maxDays);

    # Grouped, if there is another class ID in second slot
    $cgrouped = 1 if (defined($$cdata[1]) && $$cdata[1] =~ /^([A-Z]\d{5,6}[A-Z]*|[A-Z0-9]{6,8})$/);
    if ($cgrouped)
    {
      # Split the interleaved entries into two separate lists. The loop
      # must run over the number of elements in the array, not over the
      # length of the stringified reference.
      my $cdata1 = [];
      my $cdata2 = [];
      for (my $i = 0; $i < scalar(@$cdata); $i += 2)
      {
        push(@$cdata1, $$cdata[$i]) if defined($$cdata[$i]);
        push(@$cdata2, $$cdata[$i+1]) if defined($$cdata[$i+1]);
      }
      $$hourDefs{$cid} = { "turns" => $cturns, "grouped" => $cgrouped, "start" => $lastHour, "hours" => $chours, "data" => [ $cdata1, $cdata2 ] };
    }
    else
    {
      $$hourDefs{$cid} = { "turns" => $cturns, "grouped" => $cgrouped, "start" => $lastHour, "hours" => $chours, "data" => [ $cdata ] };
    }

    push(@{$$hourTable{$cday}}, $tid);
    $totalHours += $chours;
  }
}

# Convert the first "H:MM" style time found in the string into seconds
# since midnight. Returns undef if the string contains no such time.
sub parse_hour_header($)
{
  my ($hours, $minutes) = ($_[0] =~ /(\d+):(\d+)/);
  return undef unless defined($hours);

  my $seconds = (int($hours) * 60 + int($minutes)) * 60;
  return $seconds;
}


# Forward declaration, needed because the prototyped sub calls itself
sub get_hour_data_struct($$);

# Serialize a data structure into the output format selected by $opt_mode.
#
#   $tmp   - array reference, hash reference or plain scalar to serialize
#   $first - in XML mode, a true value makes a scalar come out bare instead
#            of being wrapped in <item> ... </item>
#
# PHP mode: arrays become "array(...)", hashes become a comma separated
# list of '"key" => value' pairs (the caller supplies the surrounding
# "array(...)"), all-digit scalars are emitted bare and other scalars as
# double-quoted strings.
# XML mode: arrays become <group> ... </group>, hashes become one
# <key> ... </key> element per key.
#
# Hash keys are emitted in sorted order so that the output is the same on
# every run, and string values are escaped for the target format.
sub get_hour_data_struct($$)
{
  my @out = ();
  my ($tmp, $first) = @_;

  if (ref($tmp) eq "ARRAY")
  {
    my @str = ();
    foreach my $item (@{$tmp})
    {
      push(@str, get_hour_data_struct($item, 0));
    }
    if (scalar(@str) > 0)
    {
      push(@out, "array(".join(", ", @str).")") if ($opt_mode eq "php");
      push(@out, "<group>".join("", @str)."</group>") if ($opt_mode eq "xml");
    }
  }
  elsif (ref($tmp) eq "HASH")
  {
    foreach my $key (sort keys %{$tmp})
    {
      push(@out, "\"".$key."\" => ".get_hour_data_struct($$tmp{$key}, 1)) if ($opt_mode eq "php");
      push(@out, "<".$key.">".get_hour_data_struct($$tmp{$key}, 1)."</".$key.">") if ($opt_mode eq "xml");
    }
  }
  elsif ($tmp =~ /^\d+$/)
  {
    # Plain numbers need no quoting or escaping
    if ($opt_mode eq "php")
    {
      push(@out, $tmp);
    }
    else
    {
      push(@out, $first ? $tmp : "<item>".$tmp."</item>");
    }
  }
  else
  {
    if ($opt_mode eq "php")
    {
      # Escape the characters that are special in a PHP double-quoted string
      (my $quoted = $tmp) =~ s/([\\"\$])/\\$1/g;
      push(@out, "\"".$quoted."\"");
    }
    else
    {
      # Only markup characters are encoded; anything else is left to the
      # UTF-8 output encoding
      my $text = encode_entities($tmp, '<>&');
      push(@out, $first ? $text : "<item>".$text."</item>");
    }
  }

  return join(", ", @out) if ($opt_mode eq "php");
  return join("", @out);
}


###
### Main program
###

# Go through the command line arguments one by one
while (defined(my $arg = shift))
{
  # Anything not starting with a dash is taken as the input filename
  unless (substr($arg, 0, 1) eq "-")
  {
    $opt_filename = $arg;
    next;
  }

  if ($arg eq "-dump")
  {
    $opt_dump = 1;
  }
  elsif ($arg eq "-o")
  {
    # The output filename is the next argument
    $opt_outfile = shift or die("Output filename option -o requires an argument.\n");
  }
  elsif ($arg =~ /^-($modes)$/o)
  {
    # One of the output mode names
    $opt_mode = $1;
  }
  else
  {
    die("Invalid option '$arg'.\n");
  }
}

# Without an input filename there is nothing to do, show the usage
unless (defined($opt_filename))
{
  die("Usage: $0 [options] <filename>

  -php               Output a PHP include file with data in arrays (default)
  -xml               Output a simple XML file.

  -o <filename>	     Set output filename. Default is to use stdout.

  -dump	             Dump HTML tree to stdout and quit.

");
}


# Read the whole input file in one go, decoding it from ISO-8859-1
open(my $fh, '<:encoding(iso-8859-1)', $opt_filename) or die("Error opening '$opt_filename': $!\n");
my $data = do { local $/; <$fh> };
close($fh);

if (!defined($data) || $data eq "")
{
  die("No data in input.\n");
}


# Remove tags that are not needed, in this order: font (opening and
# closing), center, br
foreach my $junk (qr/<font[^>]*>/i, qr/<\/font>/i, qr/<\/?center>/i, qr/<br>/i)
{
  $data =~ s/$junk//g;
}

# Non-breaking space entities become ordinary spaces
$data =~ s/&nbsp;/ /ig;

### Get some general information
my $otree = parse_html($data);

# With -dump, just show the tree under the html element and stop here
if ($opt_dump)
{
  my $html = fnode($otree, "html");
  print Dumper($html);
  exit;
}

# Gather the direct children of the body element: texts are stored under
# "info", bold (b) nodes under "data"
my %class = ();
my $body = fnode($otree, "body");
if (defined($body) && defined($$body{"nodes"}))
{
  foreach my $child (@{$$body{"nodes"}})
  {
    my $kind = $$child{"name"};
    push(@{$class{"info"}}, $$child{"text"}) if ($kind eq "text");
    push(@{$class{"data"}}, $child) if ($kind eq "b");
  }
}

# Drop the bold tags too and parse again; the table is easier to walk
# through without them
$data =~ s/<\/?b>//ig;
my $tree = parse_html($data);

# The timetable is the table under body having these exact attributes
my $tbody = fnode($tree, "body");
my $node = fnodea($tbody, "table", "border=\"3\" rules=\"all\" cellpadding=\"1\" cellspacing=\"1\"");
unless (defined($node))
{
  die("No table element found in document. Perhaps the format has changed? :(\n");
}


### Parse through the HTML document node tree to find the data we need

# Take a copy of the table's child list and drop the entry at position
# zero, which is not processed
my $q = $$node{"nodes"};
my @rows = @{$q};
shift(@rows);

foreach my $row (@rows)
{
  # Entries without children do not count as an hour row
  my $d = $$row{"nodes"};
  next unless defined($d);

  foreach my $n (@{$d})
  {
    # Content list of the first child of this entry, if it has one
    my $l = $$n{"nodes"}[0]{"nodes"};
    next unless defined($l);

    if ($$n{"args"} =~ /colspan=12\s+rowspan=(\d+)/)
    {
      # Class/hour cell; the rowspan tells its length
      parse_hour_data($l, $1);
    }
    elsif ($$n{"args"} =~ /rowspan=2\s+align/)
    {
      # Hour header cell: the first two entries hold start and end time
      my $qstart = parse_hour_header($$l[0]{"nodes"}[0]{"nodes"}[0]{"text"});
      my $qend = parse_hour_header($$l[1]{"nodes"}[0]{"nodes"}[0]{"text"});
      if (defined($qstart) && defined($qend))
      {
        push(@$hourTimes, {"start" => $qstart, "end" => $qend});
      }
    }
  }

  $lastHour++;
}


### Go through hour table, find last day and hour of the week, crop

# Tells whether any day has a non-zero class ID on the given row
my $rowInUse = sub
{
  my ($y) = @_;
  for (my $x = 0; $x < $maxDays; $x++)
  {
    return 1 if (defined($$hourFillTable{$y}{$x}) && $$hourFillTable{$y}{$x} != 0);
  }
  return 0;
};

# Skip the unused rows at the beginning ...
while ($firstHour < $lastHour && !$rowInUse->($firstHour))
{
  $firstHour++;
}

# ... and cut off the unused rows at the end
while ($lastHour > 0 && !$rowInUse->($lastHour - 1))
{
  $lastHour--;
}


### Redirect standard output into the output file, if one was specified
if (defined($opt_outfile))
{
  open(STDOUT, '>', $opt_outfile)
    or die("Could not open output file '$opt_outfile'.\n");
}

# Everything printed from here on gets encoded as UTF-8
binmode(STDOUT, ':encoding(utf-8)');

### Output data in desired format
if ($opt_mode eq "php")
{
  # The full "<?php" opening tag is used, because the short "<?" form is
  # only recognized when PHP's short_open_tag setting is enabled.
  print "<?php\n".
  "\$classInfo = array(\n".
  "  \"general\" => array(".join(", ", map { "\"".escape($_)."\""; } @{$class{"info"}})."),\n".
  "  \"info\" => array(".join(", ", map { "\"".escape(html_collapse($_, 1))."\""; } @{$class{"data"}})."),\n".
  "  \"tags\" => array(".join(", ", map { "\"".escape(html_collapse($_, 0))."\""; } @{$class{"data"}})."),\n".
  "  \"maxDays\" => $maxDays,\n".
  "  \"firstHour\" => $firstHour,\n".
  "  \"lastHour\" => $lastHour,\n".
  "  \"totalHours\" => $totalHours\n".
  ");\n\n";

  # Start and end times of the hours
  print "\$classHourTimes = array(\n";
  foreach my $chour (@$hourTimes)
  {
    print "  array(\"start\" => ".$$chour{"start"}.", \"end\" => ".$$chour{"end"}."),\n";
  }
  print ");\n\n";

  # Class definitions in numeric ID order
  print "\$classHourDefs = array(\n";
  foreach my $defId (sort { $a <=> $b } keys %{$hourDefs})
  {
    print "  $defId => array(".get_hour_data_struct($$hourDefs{$defId}, 0)."),\n";
  }
  print ");\n\n";

  # Class IDs listed per day
  print
  "\$classDayTable = array(\n";
  for (my $y = 0; $y < $maxDays; $y++)
  {
    if (defined($$hourTable{$y}))
    {
      print "  $y => array(".join(", ", @{$$hourTable{$y}})."),\n";
    }
  }
  print ");\n?>\n";
}

elsif ($opt_mode eq "xml")
{
  # Text content has to have its markup characters encoded, or the
  # result would not be well-formed XML
  my $xmlText = sub { return encode_entities($_[0], '<>&'); };

  print "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n".
  "<timetable>\n".
  " <class>\n".
  "  <general>".join("", map { "<item>".$xmlText->($_)."</item>"; } @{$class{"info"}})."</general>\n".
  "  <info>".join("", map { "<item>".$xmlText->(html_collapse($_, 1))."</item>"; } @{$class{"data"}})."</info>\n".
  "  <maxdays>$maxDays</maxdays>\n".
  "  <firsthour>$firstHour</firsthour>\n".
  "  <lasthour>$lastHour</lasthour>\n".
  "  <totalhours>$totalHours</totalhours>\n".
  " </class>\n";

  # Start and end times of the hours, numbered from zero
  print " <hours>\n";
  my $hourId = 0;
  foreach my $chour (@$hourTimes)
  {
    print "  <hour id=\"".$hourId."\"><start>".$$chour{"start"}."</start><end>".$$chour{"end"}."</end></hour>\n";
    $hourId++;
  }
  print " </hours>\n\n";

  # Class definitions in numeric ID order
  print " <classes>\n";
  foreach my $defId (sort { $a <=> $b } keys %{$hourDefs})
  {
    print "  <class id=\"$defId\">".get_hour_data_struct($$hourDefs{$defId}, 0)."</class>\n";
  }
  print " </classes>\n\n";

  # Class IDs listed per day
  print " <days>\n";
  for (my $y = 0; $y < $maxDays; $y++)
  {
    if (defined($$hourTable{$y}))
    {
      print "  <day id=\"$y\">".join("", map { "<class>".$_."</class>" } @{$$hourTable{$y}})."</day>\n";
    }
  }
  print " </days>\n";

  print "</timetable>\n";
}

# Errors from buffered writes (e.g. a full disk) only show up when the
# handle is closed, so the result has to be checked
close(STDOUT) or die("Error closing output: $!\n");