changeset 7:4b4299b62f7f

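More work: add a debug entry filter (option "p"), an indented print helper and text extraction with Fragment markers, and per-headword/sense/example output.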
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 11 May 2021 01:26:57 +0300
parents 34a89d61dbe7
children ce07bb2a247b
files lxmldump.py
diffstat 1 files changed, 91 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/lxmldump.py	Mon May 10 21:43:00 2021 +0300
+++ b/lxmldump.py	Tue May 11 01:26:57 2021 +0300
@@ -23,9 +23,25 @@
 pkk_cfg = {
     "dump": False,
     "normalize": False,
+
+    "debug": False,
 }
 
 
+pkk_str_fmap = {
+    "Fragment" : ["<", ">"],
+}
+
+
+pkk_debug_list = [
+    "ahas",
+    "ahavakkaine",
+    "ahavakala",
+    "ahavakoittuo",
+    "ahvaliha",
+]
+
+
 ###
 ### Misc. helper functions, etc
 ###
@@ -40,6 +56,9 @@
     else:
         sys.stdout.write(smsg)
 
+def pkk_printi(indent, smsg):
+    pkk_print(("    " * indent) + smsg)
+
 
 ## Fatal error handler
 def pkk_fatal(smsg):
@@ -54,44 +73,83 @@
     sys.exit(1)
 
 
+def pkk_get_text(lnode):
+    stmp = ""
+    for pnode in lnode.iter():
+        if isinstance(pnode.text, str):
+            if isinstance(pnode.tag, str) and pnode.tag in pkk_str_fmap:
+                stmp += pkk_str_fmap[pnode.tag][0] + pnode.text + pkk_str_fmap[pnode.tag][1]
+            else:
+                stmp += pnode.text
+
+        if isinstance(pnode.tail, str):
+            stmp += pnode.tail
+
+    return stmp.strip()
+
+
 ##
-def pkk_dump_recursive(lnode, indent):
-    if lnode.tag == "Example":
-        stmp = "".join(lnode.itertext()).strip()
-        print("{}{} \"{}\"".format("    " * indent, lnode.tag, stmp))
+def pkk_dump_recursive(indent, lnode):
+    if lnode.tag in ["Example"]:
+        stmp = pkk_get_text(lnode)
+        pkk_printi(indent, "{} \"{}\"".format(lnode.tag, stmp))
     else:
-        stmp = ""
-        if lnode.text != None:
-            tmp = str(lnode.text).strip()
-            if tmp != "":
-                stmp = " \""+ tmp +"\""
+        if isinstance(lnode.text, str):
+            stmp = lnode.text.strip()
+            if stmp != "":
+                stmp = " \""+ stmp +"\""
+        else:
+            stmp = ""
 
         if len(lnode.attrib) > 0:
             atmp = " "+str(lnode.attrib)
         else:
             atmp = ""
 
-        pkk_print("{}{}{}{}\n".format("    " * indent, lnode.tag, atmp, stmp))
+        pkk_printi(indent, "{}{}{}\n".format(lnode.tag, atmp, stmp))
         for qnode in lnode.findall("./*"):
-            pkk_dump_recursive(qnode, indent + 1)
+            pkk_dump_recursive(indent + 1, qnode)
 
 
 ##
-def pkk_output_node(dnode):
-    wlist = []
-    dlist = []
-    for wnode in dnode.findall("./HeadwordCtn"):
-        for qnode in wnode.findall("./SearchForm"):
-            wlist.append(str(qnode.text).strip())
+def pkk_output_one(indent, dnode, dsub):
+    for qnode in dnode.findall(dsub):
+        pkk_printi(indent, "{}\n".format(pkk_get_text(qnode)))
+
+def pkk_output_subs(indent, dnode, dsub, dname):
+    for qnode in dnode.findall(dsub):
+        pkk_printi(indent, "{} \"{}\"\n".format(dname, pkk_get_text(qnode)))
+
+
+def pkk_output_sense(indent, dnode):
+    pkk_output_subs(indent, dnode, "./SearchForm", "srch")
+    pkk_output_subs(indent, dnode, "./Definition", "defn")
+
+    for wnode in dnode.findall("./ExampleBlock/ExampleCtn"):
+        sstr = pkk_get_text(wnode.find("./Example"))
+        ltmp = []
+        for qnode in wnode.findall("./FreeTopic[@type='levikki']/GeographicalUsage"):
+            ltmp.append("{} [{}]".format(pkk_get_text(qnode), qnode.attrib["class"]))
 
-        for qnode in wnode.findall("./Definition"):
-            dlist.append(str(qnode.text).strip())
+        if len(ltmp) > 0:
+            lstr = " ({})".format(", ".join(ltmp))
+        else:
+            lstr = ""
+
+        pkk_printi(indent + 1, "{} \"{}\"{}\n".format("exmp", sstr, lstr))
+
+
+def pkk_output_node(indent, dnode):
 
-    for wnode in dnode.findall("./SenseGrp"):
-        for qnode in wnode.findall("./Definition"):
-            dlist.append(str(qnode.text).strip())
+    for wnode in dnode.findall("./HeadwordCtn"):
+        pkk_output_one (indent, wnode, "./Headword")
+        pkk_output_sense(indent + 1, wnode)
 
-    pkk_print("{} : {}\n".format(", ".join(wlist), " ; ".join(dlist)))
+        index = 1
+        for wnode in dnode.findall("./SenseGrp"):
+            pkk_printi(indent + 2, "sense #{}\n".format(index))
+            pkk_output_sense(indent + 2, wnode)
+            index += 1
 
 
 ###
@@ -124,6 +182,8 @@
             pkk_cfg["dump"] = True
         elif arg == "normalize" or arg == "n":
             pkk_cfg["normalize"] = True
+        elif arg == "p":
+            pkk_cfg["debug"] = True
         else:
             pkk_fatal(u"Invalid option argument '{0}'.".format(oarg))
 
@@ -164,11 +224,16 @@
     try:
         xroot = uxml.getroot()
         for dnode in xroot.findall("./DictionaryEntry"):
+
+            if pkk_cfg["debug"] and dnode.attrib["identifier"] not in pkk_debug_list:
+                continue
+
             if pkk_cfg["dump"]:
-                pkk_dump_recursive(dnode, 0)
-                print("\n\n")
+                pkk_dump_recursive(0, dnode)
             else:
-                pkk_output_node(dnode)
+                pkk_output_node(0, dnode)
+
+            print("\n")
 
     except (BrokenPipeError, IOError) as e:
         sys.stderr.close()