view docs/normalizeml.pl @ 1627:a3c3591f9a74

Clean up this script.
author Matti Hamalainen <ccr@tnsp.org>
date Fri, 04 May 2018 20:24:01 +0300
parents 2871db57f976
children ea96ce334a5c
line wrap: on
line source

#!/usr/bin/perl -w
#
# Utility for "normalizing" XML/SGML files
# Programmed by Matti 'ccr' Hamalainen <ccr@tnsp.org>
# (C) Copyright 2007,2009 Tecnic Software productions (TNSP)
#
use utf8;
use strict;
use warnings;


# Expand positional placeholders in an entity template.
#
# Arguments:
#   $_[0] - template text; every "$N" (N a decimal number) is a placeholder
#   $_[1] - argument string, split on single spaces into values 1..n
#
# Returns the template with each "$N" replaced by the N-th value.
# A placeholder without a matching value ("$0", or N beyond the last
# value) expands to the empty string.
sub dorep($$)
{
  my ($str, $args) = @_;
  my @vals = split(/ /, $args);

  # Range-check the index: an unchecked "$0" would become index -1 (the
  # last value), and an index past the end would substitute undef and
  # raise an "uninitialized value" warning.
  $str =~ s/\$(\d+)/($1 >= 1 && $1 <= scalar(@vals)) ? $vals[$1 - 1] : ""/eg;
  return $str;
}


# Entity table: entity name => replacement text, filled from the
# <!ENTITY ...> declarations encountered in the input.
my %xmlentities = ();

# State for an entity declaration whose quoted value spans several lines:
# $entMode is 1 while inside such a value; $entName and $entData hold the
# entity name and the text collected so far.
my $entMode = 0;
my $entData;
my $entName;

binmode(STDIN, ":utf8");
binmode(STDOUT, ":utf8");

# Read STDIN line by line. Entity declarations are recorded (and not
# printed); every other line is printed with known entities expanded.
while (defined(my $line = <STDIN>))
{
  if ($line =~ /<!ENTITY ([A-Za-z][A-Za-z0-9_]+) +SYSTEM +\"([^\"]*)\">/)
  {
    # Handle external entities: the whole file becomes the entity value.
    my $name = $1;
    my $extfname = $2;
    # Decode the file as UTF-8, like STDIN, so its text is not encoded a
    # second time when written to the UTF-8 STDOUT.
    open(my $infile, "<:utf8", $extfname) or die("Could not open entity file '".$extfname."': $!\n");
    # Undefined $/ makes the read return the entire file at once.
    $xmlentities{$name} = do { local $/; <$infile> };
    close($infile);
  }
  elsif ($line =~ /<!ENTITY ([A-Za-z][A-Za-z0-9_]+) \"(.*?)\">/)
  {
    # One-line entities
    $xmlentities{$1} = $2;
  }
  elsif ($line =~ /<!ENTITY ([A-Za-z][A-Za-z0-9_]+) \"(.*)\z/s)
  {
    # Multi-line entities: start collecting. /s lets the capture keep the
    # line's trailing newline, so the first and second line of the value
    # are not glued together.
    $entName = $1;
    $entData = $2;
    $entMode = 1;
  }
  elsif ($entMode == 1)
  {
    if ($line =~ /^(.*)\">/)
    {
      # Closing line of a multi-line entity: store the collected value.
      $entData .= $1;
      $xmlentities{$entName} = $entData;
      $entMode = 0;
    }
    else
    {
      # Intermediate line: append it verbatim (including its newline).
      $entData .= $line;
    }
  }
  else
  {
    # Expand entities for five levels at most
    my $str = $line;
    for my $pass (1 .. 5)
    {
      while (my ($k, $v) = each(%xmlentities))
      {
        # Plain reference: &name;
        $str =~ s/&$k;/$v/g;
        # Parameterized reference: &name arg1 arg2;  ($N in the value
        # is replaced by the N-th argument, see dorep()).
        $str =~ s/&$k\s+([A-Za-z0-9 ]+);/dorep($v,$1)/eg;
      }
      # Stop early once no ampersand is left, since nothing more can expand.
      last unless ($str =~ /&/);
    }
    print $str;
  }
}