changeset 6:34a89d61dbe7

Merge and cleanup.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 10 May 2021 21:43:00 +0300
parents 274b2091137c (diff) 7ce08dea935b (current diff)
children 4b4299b62f7f
files lxmldump.py
diffstat 1 files changed, 58 insertions(+), 49 deletions(-) [+]
line wrap: on
line diff
--- a/lxmldump.py	Mon May 10 12:24:10 2021 +0300
+++ b/lxmldump.py	Mon May 10 21:43:00 2021 +0300
@@ -1,13 +1,18 @@
 #!/usr/bin/python3 -B
 # coding=utf-8
 ###
-### ISO/FDIS 1951 lxmldump
+### lxmldump - Dump ISO/FDIS 1951 XML file data
+### Programmed and designed by Matti 'ccr' Hämäläinen <ccr@tnsp.org>
+### (C) Copyright 2021 Tecnic Software productions (TNSP)
+###
+### Python 3.7+ required!
 ###
 import sys
 import signal
 import re
 from pathlib import Path
 import xml.etree.ElementTree as xmlET
+import unicodedata
 
 assert sys.version_info >= (3, 7)
 
@@ -16,8 +21,8 @@
 ### Default settings
 ###
 pkk_cfg = {
-    "verbosity": 1,
     "dump": False,
+    "normalize": False,
 }
 
 
@@ -29,9 +34,11 @@
 
 
 ## Wrapper for print()
-def pkk_print(level, smsg):
-    if pkk_cfg["verbosity"] >= level:
-        print(smsg)
+def pkk_print(smsg):
+    # Write smsg to stdout as-is (no newline appended); NFC-normalize if set
+    if pkk_cfg["normalize"]:
+        smsg = unicodedata.normalize("NFC", smsg)
+    sys.stdout.write(smsg)
 
 
 ## Fatal error handler
@@ -47,6 +54,46 @@
     sys.exit(1)
 
 
+## Dump an element subtree as an indented outline (4 spaces per level)
+def pkk_dump_recursive(lnode, indent):
+    pad = "    " * indent
+    if lnode.tag == "Example":
+        # Join the text of the element and all its descendants; children are
+        # not dumped. Goes via pkk_print() so the "normalize" setting applies.
+        stmp = "".join(lnode.itertext()).strip()
+        pkk_print("{}{} \"{}\"\n".format(pad, lnode.tag, stmp))
+        return
+
+    # Quote the node's own text only when it is present and non-blank
+    tmp = str(lnode.text).strip() if lnode.text is not None else ""
+    stmp = " \"" + tmp + "\"" if tmp else ""
+
+    # Attributes, if any, are shown as the str() form of the mapping
+    atmp = " " + str(lnode.attrib) if lnode.attrib else ""
+
+    pkk_print("{}{}{}{}\n".format(pad, lnode.tag, atmp, stmp))
+    for qnode in lnode.findall("./*"):
+        pkk_dump_recursive(qnode, indent + 1)
+
+
+## Print one entry as "<search forms> : <definitions>" on a single line
+def pkk_output_node(dnode):
+    # Own text of every match; missing text (None) becomes "" rather than
+    # the literal "None" that str(None) would yield.
+    def texts(pnode, path):
+        return [(qnode.text or "").strip() for qnode in pnode.findall(path)]
+
+    wlist = []
+    dlist = []
+    for wnode in dnode.findall("./HeadwordCtn"):
+        wlist += texts(wnode, "./SearchForm")
+        dlist += texts(wnode, "./Definition")
+    for wnode in dnode.findall("./SenseGrp"):
+        dlist += texts(wnode, "./Definition")
+
+    pkk_print("{} : {}\n".format(", ".join(wlist), " ; ".join(dlist)))
+
+
 ###
 ### Main program starts
 ###
@@ -75,9 +122,8 @@
             pkk_show_help = True
         elif arg == "dump" or arg == "d":
             pkk_cfg["dump"] = True
-        elif arg == "v" or arg == "verbosity":
-            needs_param = True
-            pkk_cfg["verbosity"] = param
+        elif arg == "normalize" or arg == "n":
+            pkk_cfg["normalize"] = True
         else:
             pkk_fatal(u"Invalid option argument '{0}'.".format(oarg))
 
@@ -100,38 +146,13 @@
         format(str(Path(sys.argv[0]).name)))
     print(u"")
     print(u"       --help              Show this help")
-#    print(u"  -v,  --verbosity <0-3>   Set verbosity")
     print(u"  -d,  --dump              Dump mode")
+    print(u"  -n,  --normalize         Output NFC normalized Unicode")
     print(u"")
     sys.exit(0)
 
 
-
-
-###
-### Main
-###
-def pkk_recursive_dump(lnode, indent):
-    if lnode.tag == "Example":
-        stmp = "".join(lnode.itertext()).strip()
-        print("{}{} \"{}\"".format("    " * indent, lnode.tag, stmp))
-    else:
-        stmp = ""
-        if lnode.text != None:
-            tmp = str(lnode.text).strip()
-            if tmp != "":
-                stmp = " \""+ tmp +"\""
-
-        if len(lnode.attrib) > 0:
-            atmp = " "+str(lnode.attrib)
-        else:
-            atmp = ""
-
-        print("{}{}{}{}".format("    " * indent, lnode.tag, atmp, stmp))
-        for qnode in lnode.findall("./*"):
-            pkk_recursive_dump(qnode, indent + 1)
-
-
+### Handle each input file
 for filename in pkk_filenames:
     # Parse XML file into element tree
     try:
@@ -144,22 +165,10 @@
         xroot = uxml.getroot()
         for dnode in xroot.findall("./DictionaryEntry"):
             if pkk_cfg["dump"]:
-                pkk_recursive_dump(dnode, 0)
+                pkk_dump_recursive(dnode, 0)
                 print("\n\n")
             else:
-                wlist = []
-                dlist = []
-                for wnode in dnode.findall("./HeadwordCtn"):
-                    for qnode in wnode.findall("./SearchForm"):
-                        wlist.append(str(qnode.text).strip())
-                    for qnode in wnode.findall("./Definition"):
-                        dlist.append(str(qnode.text).strip())
-
-                for wnode in dnode.findall("./SenseGrp"):
-                    for qnode in wnode.findall("./Definition"):
-                        dlist.append(str(qnode.text).strip())
-
-                print("{} : {}".format(", ".join(wlist), " ; ".join(dlist)))
+                pkk_output_node(dnode)
 
     except (BrokenPipeError, IOError) as e:
         sys.stderr.close()