annotate docs/normalizeml.pl @ 1628:ea96ce334a5c

Fix document normalization.
author Matti Hamalainen <ccr@tnsp.org>
date Sat, 07 Jul 2018 01:46:47 +0300
parents a3c3591f9a74
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
619
3ab1d39cac73 Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff changeset
1 #!/usr/bin/perl -w
1046
8474bece3c4b Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents: 1042
diff changeset
2 #
619
3ab1d39cac73 Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff changeset
3 # Utility for "normalizing" XML/SGML files
1042
f212163bac42 Make use strict; clean.
Matti Hamalainen <ccr@tnsp.org>
parents: 1041
diff changeset
4 # Programmed by Matti 'ccr' Hamalainen <ccr@tnsp.org>
839
9493bb609fd3 More comments; Handle external entities (e.g. SYSTEM entities), but probably not in conforming manner.
Matti Hamalainen <ccr@tnsp.org>
parents: 623
diff changeset
5 # (C) Copyright 2007,2009 Tecnic Software productions (TNSP)
1046
8474bece3c4b Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents: 1042
diff changeset
6 #
1627
a3c3591f9a74 Clean up this script.
Matti Hamalainen <ccr@tnsp.org>
parents: 1229
diff changeset
7 use utf8;
1042
f212163bac42 Make use strict; clean.
Matti Hamalainen <ccr@tnsp.org>
parents: 1041
diff changeset
8 use strict;
1046
8474bece3c4b Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents: 1042
diff changeset
9 use warnings;
1042
f212163bac42 Make use strict; clean.
Matti Hamalainen <ccr@tnsp.org>
parents: 1041
diff changeset
10
1627
a3c3591f9a74 Clean up this script.
Matti Hamalainen <ccr@tnsp.org>
parents: 1229
diff changeset
11
1059
c4a4c72c247a Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents: 1046
diff changeset
# Expand positional placeholders in the replacement text of a
# parametrized entity.
#
#   $_[0]  template text; "$1", "$2", ... refer to the arguments
#   $_[1]  argument string; arguments are separated by single spaces
#
# Returns the template with every "$N" replaced by the Nth argument
# (1-based). A placeholder that has no matching argument expands to the
# empty string. That includes "$0", which used to wrap around to index -1
# and silently pick up the LAST argument, and out-of-range numbers, which
# used to substitute undef and trigger an "uninitialized value" warning.
sub dorep($$)
{
    my ($str, $args) = @_;
    my @vals = split(/ /, $args);

    # [0-9] rather than \d: only ASCII digits form a usable index.
    $str =~ s/\$([0-9]+)/($1 >= 1 && $1 <= @vals) ? $vals[$1 - 1] : ""/eg;

    return $str;
}
c4a4c72c247a Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents: 1046
diff changeset
19
1627
a3c3591f9a74 Clean up this script.
Matti Hamalainen <ccr@tnsp.org>
parents: 1229
diff changeset
20
1046
8474bece3c4b Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents: 1042
diff changeset
# Table of known entities: entity name => replacement text.
my %xmlentities;

# State for an <!ENTITY ...> declaration that spans several input lines:
# $entMode is 1 while such a declaration is open, $entName / $entData
# hold its name and the text gathered so far.
my ($entData, $entName);
my $entMode = 0;

# Both the document read from STDIN and the result written to STDOUT
# are treated as UTF-8.
binmode(STDOUT, ":utf8");
binmode(STDIN, ":utf8");
c4a4c72c247a Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents: 1046
diff changeset
28
1627
a3c3591f9a74 Clean up this script.
Matti Hamalainen <ccr@tnsp.org>
parents: 1229
diff changeset
# Main pass over the document on STDIN.
#
# Lines holding an <!ENTITY ...> declaration are consumed: the entity is
# recorded in %xmlentities and nothing is printed for that line. Every
# other line is written to STDOUT with references to the known entities
# ("&name;" and the parametrized form "&name arg1 arg2;") expanded.
while (defined(my $line = <STDIN>))
{
    if ($line =~ /<!ENTITY ([A-Za-z][A-Za-z0-9_]*) +SYSTEM +\"([^\"]*)\">/)
    {
        # Handle external entities: the replacement text is the complete
        # contents of the referenced file.
        my ($name, $extfname) = ($1, $2);

        # Decode the file as UTF-8, like STDIN. Reading it as raw bytes
        # and printing it through the UTF-8 output layer would encode
        # every non-ASCII character a second time.
        open(my $infile, "<:encoding(UTF-8)", $extfname)
            or die("Could not open entity file '".$extfname."'.\n");
        $xmlentities{$name} = do { local $/; <$infile> };
        close($infile);
    }
    elsif ($line =~ /<!ENTITY ([A-Za-z][A-Za-z0-9_]*) \"(.*?)\">/)
    {
        # One-line entities
        $xmlentities{$1} = $2;
    }
    elsif ($line =~ /<!ENTITY ([A-Za-z][A-Za-z0-9_]*) \"(.*)$/)
    {
        # Multi-line entities: remember the name and start collecting.
        # The capture stops before the line terminator, so put the
        # newline back; continuation lines below keep theirs, and
        # without it the first two lines would be glued together.
        $entName = $1;
        $entData = $2 . "\n";
        $entMode = 1;
    }
    elsif ($entMode == 1)
    {
        if ($line =~ /^(.*)\">/)
        {
            # Closing line of a multi-line entity: store the result.
            $entData .= $1;
            $xmlentities{$entName} = $entData;
            $entMode = 0;
        }
        else
        {
            # Continuation line, kept verbatim including its newline.
            $entData .= $line;
        }
    }
    else
    {
        # Expand entities for five levels at most, so that entities
        # which reference other entities are resolved while a
        # self-referencing definition cannot loop forever.
        my $str = $line;
        for my $depth (1 .. 5)
        {
            my $before = $str;

            # Sorted keys make the expansion order, and therefore the
            # output, the same on every run.
            for my $k (sort keys %xmlentities)
            {
                my $v = $xmlentities{$k};
                $str =~ s/&\Q$k\E;/$v/g;
                $str =~ s/&\Q$k\E\s+([A-Za-z0-9 ]+);/dorep($v,$1)/eg;
            }

            # Nothing left to expand, or this pass changed nothing.
            last unless ($str =~ /&/);
            last if ($str eq $before);
        }
        print $str;
    }
}

# A multi-line declaration that never reached its closing '">' was not
# stored; say so instead of dropping it silently.
warn("Unterminated entity declaration '".$entName."' at end of input.\n")
    if ($entMode == 1);