Mercurial > hg > batmud > ggrtf
annotate docs/normalizeml.pl @ 1628:ea96ce334a5c
Fix document normalization.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Sat, 07 Jul 2018 01:46:47 +0300 |
parents | a3c3591f9a74 |
children |
rev | line source |
---|---|
619
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
1 #!/usr/bin/perl -w |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
2 # |
619
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
3 # Utility for "normalizing" XML/SGML files |
1042 | 4 # Programmed by Matti 'ccr' Hamalainen <ccr@tnsp.org> |
839
9493bb609fd3
More comments; Handle external entities (e.g. SYSTEM entities), but probably not in conforming manner.
Matti Hamalainen <ccr@tnsp.org>
parents:
623
diff
changeset
|
5 # (C) Copyright 2007,2009 Tecnic Software productions (TNSP) |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
6 # |
1627 | 7 use utf8; |
1042 | 8 use strict; |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
9 use warnings; |
1042 | 10 |
1627 | 11 |
1059
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
12 sub dorep($$) |
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
13 { |
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
14 my $str = $_[0]; |
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
15 my @vals = split(/ /, $_[1]); |
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
16 $str =~ s/\$(\d+)/$vals[$1 - 1]/eg; |
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
17 return $str; |
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
18 } |
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
19 |
1627 | 20 |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
21 my %xmlentities = (); |
619
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
22 my $entMode = 0; |
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
23 my $entData; |
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
24 my $entName; |
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
25 |
1059
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
26 binmode(STDIN, ":utf8"); |
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
27 binmode(STDOUT, ":utf8"); |
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
28 |
1627 | 29 while (defined(my $line = <STDIN>)) |
30 { | |
31 if ($line =~ /<!ENTITY ([A-Za-z][A-Za-z0-9_]+) +SYSTEM +\"([^\"]*)\">/) | |
32 { | |
839
9493bb609fd3
More comments; Handle external entities (e.g. SYSTEM entities), but probably not in conforming manner.
Matti Hamalainen <ccr@tnsp.org>
parents:
623
diff
changeset
|
33 # Handle external entities |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
34 my $name = $1; |
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
35 my $extfname = $2; |
1229
2871db57f976
Fix inclusion and expanding of SYSTEM entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1059
diff
changeset
|
36 local($/, *INFILE); |
1627 | 37 open(INFILE, "<", $extfname) or die("Could not open entity file '".$extfname."'.\n"); |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
38 $xmlentities{$name} = <INFILE>; |
839
9493bb609fd3
More comments; Handle external entities (e.g. SYSTEM entities), but probably not in conforming manner.
Matti Hamalainen <ccr@tnsp.org>
parents:
623
diff
changeset
|
39 close(INFILE); |
1627 | 40 } |
41 elsif ($line =~ /<!ENTITY ([A-Za-z][A-Za-z0-9_]+) \"(.*?)\">/) | |
42 { | |
839
9493bb609fd3
More comments; Handle external entities (e.g. SYSTEM entities), but probably not in conforming manner.
Matti Hamalainen <ccr@tnsp.org>
parents:
623
diff
changeset
|
43 # One-line entities |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
44 $xmlentities{$1} = $2; |
1627 | 45 } |
46 elsif ($line =~ /<!ENTITY ([A-Za-z][A-Za-z0-9_]+) \"(.*)$/) | |
47 { | |
839
9493bb609fd3
More comments; Handle external entities (e.g. SYSTEM entities), but probably not in conforming manner.
Matti Hamalainen <ccr@tnsp.org>
parents:
623
diff
changeset
|
48 # Multi-line entities |
619
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
49 $entName = $1; |
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
50 $entData = $2; |
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
51 $entMode = 1; |
1627 | 52 } |
53 elsif ($entMode == 1) | |
54 { | |
55 if ($line =~ /^(.*)\">/) | |
56 { | |
619
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
57 $entData .= $1; |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
58 $xmlentities{$entName} = $entData; |
619
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
59 $entMode = 0; |
1627 | 60 } |
61 else | |
62 { | |
1628
ea96ce334a5c
Fix document normalization.
Matti Hamalainen <ccr@tnsp.org>
parents:
1627
diff
changeset
|
63 $entData .= $line; |
619
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
64 } |
1627 | 65 } |
66 else | |
67 { | |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
68 # Expand entities in at most four passes (the loop below runs depth 1..4) |
1627 | 69 my $str = $line; |
70 for (my $depth = 1; $depth < 5; $depth++) | |
71 { | |
72 while (my ($k, $v) = each(%xmlentities)) | |
73 { | |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
74 $str =~ s/&$k;/$v/g; |
1059
c4a4c72c247a
Input and output UTF-8; Add parametrized entities.
Matti Hamalainen <ccr@tnsp.org>
parents:
1046
diff
changeset
|
75 $str =~ s/&$k\s+([A-Za-z0-9 ]+);/dorep($v,$1)/eg; |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
76 } |
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
77 last unless ($str =~ /&/); |
619
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
78 } |
1046
8474bece3c4b
Cleanup normalizeml.pl a bit.
Matti Hamalainen <ccr@tnsp.org>
parents:
1042
diff
changeset
|
79 print $str; |
619
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
80 } |
3ab1d39cac73
Added very simple utility for normalizing SGML/XML files, kludged in Perl.
Matti Hamalainen <ccr@tnsp.org>
parents:
diff
changeset
|
81 } |