changeset 6:34a89d61dbe7

Merge and cleanup.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 10 May 2021 21:43:00 +0300
parents 274b2091137c (current diff) 7ce08dea935b (diff)
children 4b4299b62f7f
files lxmldump.py
diffstat 1 files changed, 44 insertions(+), 34 deletions(-) [+]
line wrap: on
line diff
--- a/lxmldump.py	Mon May 10 21:38:11 2021 +0300
+++ b/lxmldump.py	Mon May 10 21:43:00 2021 +0300
@@ -54,6 +54,46 @@
     sys.exit(1)
 
 
+## Recursively dump an element subtree, one indented line per element
+def pkk_dump_recursive(lnode, indent):
+    """Print lnode and its descendants, four spaces of indent per level."""
+    prefix = "    " * indent
+    if lnode.tag == "Example":
+        # Examples: join all nested text into one quoted string
+        stmp = "".join(lnode.itertext()).strip()
+        # NOTE(review): uses print(), not pkk_print() as below -- confirm
+        print("{}{} \"{}\"".format(prefix, lnode.tag, stmp))
+        return
+
+    # Own text is shown quoted, and only when non-blank
+    tmp = str(lnode.text).strip() if lnode.text is not None else ""
+    stmp = " \"" + tmp + "\"" if tmp else ""
+    # Attributes are shown only when present
+    atmp = " " + str(lnode.attrib) if len(lnode.attrib) > 0 else ""
+
+    pkk_print("{}{}{}{}\n".format(prefix, lnode.tag, atmp, stmp))
+    for qnode in lnode.findall("./*"):
+        pkk_dump_recursive(qnode, indent + 1)
+
+
+## Print one entry as "searchform, ... : definition ; ..."
+def pkk_output_node(dnode):
+    """Print the search forms and definitions found under dnode on one line."""
+    wlist = []
+    dlist = []
+    for wnode in dnode.findall("./HeadwordCtn"):
+        for qnode in wnode.findall("./SearchForm"):
+            # Element text is None when absent; avoid emitting "None"
+            wlist.append((qnode.text or "").strip())
+        for qnode in wnode.findall("./Definition"):
+            dlist.append((qnode.text or "").strip())
+
+    for wnode in dnode.findall("./SenseGrp"):
+        for qnode in wnode.findall("./Definition"):
+            dlist.append((qnode.text or "").strip())
+    pkk_print("{} : {}\n".format(", ".join(wlist), " ; ".join(dlist)))
+
+
 ###
 ### Main program starts
 ###
@@ -107,42 +147,12 @@
     print(u"")
     print(u"       --help              Show this help")
     print(u"  -d,  --dump              Dump mode")
+    print(u"  -n,  --normalize         Output NFC normalized Unicode")
     print(u"")
     sys.exit(0)
 
 
-###
-### Main
-###
-def pkk_dump_simple_node(lnode, indent):
-    stmp = ""
-    if lnode.text != None:
-        tmp = str(lnode.text).strip()
-        if tmp != "":
-            stmp = " \""+ tmp +"\""
-
-    pkk_print("{}{} {}{}".format("    " * indent, lnode.tag, lnode.attrib, stmp))
-    for qnode in lnode.findall("./*"):
-        pkk_dump_simple_node(qnode, indent + 1)
-
-
-def pkk_dump_node(dnode):
-    wlist = []
-    dlist = []
-    for wnode in dnode.findall("./HeadwordCtn"):
-        for qnode in wnode.findall("./SearchForm"):
-            wlist.append(str(qnode.text).strip())
-
-        for qnode in wnode.findall("./Definition"):
-            dlist.append(str(qnode.text).strip())
-
-    for wnode in dnode.findall("./SenseGrp"):
-        for qnode in wnode.findall("./Definition"):
-            dlist.append(str(qnode.text).strip())
-
-    pkk_print("{} : {}".format(", ".join(wlist), " ; ".join(dlist)))
-
-
+### Handle each input file
 for filename in pkk_filenames:
     # Parse XML file into element tree
     try:
@@ -155,10 +165,10 @@
         xroot = uxml.getroot()
         for dnode in xroot.findall("./DictionaryEntry"):
             if pkk_cfg["dump"]:
-                pkk_dump_simple_node(dnode, 0)
+                pkk_dump_recursive(dnode, 0)
                 print("\n\n")
             else:
-                pkk_dump_node(dnode)
+                pkk_output_node(dnode)
 
     except (BrokenPipeError, IOError) as e:
         sys.stderr.close()