changeset 43:8ed576574712

More improvements to output flexibility.
author Matti Hamalainen <ccr@tnsp.org>
date Wed, 26 May 2021 13:12:02 +0300
parents 508de0f6836b
children d7b4b2fb0214
files lxmldump.py
diffstat 1 files changed, 66 insertions(+), 31 deletions(-) [+]
line wrap: on
line diff
--- a/lxmldump.py	Wed May 26 12:23:30 2021 +0300
+++ b/lxmldump.py	Wed May 26 13:12:02 2021 +0300
@@ -48,7 +48,7 @@
     },
 
     "word_fmt": {
-        PKK_MODE_NORMAL: "\"{word}\"{attr}\n",
+        PKK_MODE_NORMAL: "\"{word}\"{search}{attr}\n",
         PKK_MODE_ANKI: "{word}{attr}\n",
     },
     "word_attr_list": {
@@ -57,21 +57,32 @@
     "word_attr_list_empty": {
         PKK_MODE_NORMAL: " ",
     },
+    "word_attr_list_item": {
+        PKK_MODE_NORMAL: "{text}",
+    },
     "word_attr_list_sep": {
         PKK_MODE_NORMAL: " ; ",
         PKK_MODE_ANKI: " : ",
     },
 
+    "search_list": {
+        PKK_MODE_NORMAL: ", {alist}",
+    },
+    "search_list_empty": {
+        PKK_MODE_NORMAL: "",
+    },
+    "search_list_item": {
+        PKK_MODE_NORMAL: "\"{text}\"",
+    },
+    "search_list_sep": {
+        PKK_MODE_NORMAL: ", ",
+    },
+
     "sense_index": {
         PKK_MODE_NORMAL: "{indent}sense #{index}\n",
         PKK_MODE_ANKI: "[{index}]:\n",
     },
 
-    "search_fmt": {
-        PKK_MODE_NORMAL: "{indent}srch \"{text}\"\n",
-        PKK_MODE_ANKI: "{text}\n",
-    },
-
     "definition_fmt": {
         PKK_MODE_NORMAL: "{indent}defn \"{text}\"\n",
         PKK_MODE_ANKI: "? {text}\n",
@@ -82,11 +93,14 @@
         PKK_MODE_ANKI: "- {text}{geostr}\n",
     },
     "example_geo_list": {
-        PKK_MODE_NORMAL: " ({glist})",
+        PKK_MODE_NORMAL: " ({alist})",
     },
     "example_geo_list_empty": {
         PKK_MODE_NORMAL: "",
     },
+    "example_geo_list_item": {
+        PKK_MODE_NORMAL: "{text} [{tclass}]",
+    },
     "example_geo_list_sep": {
         PKK_MODE_NORMAL: ", ",
     },
@@ -300,26 +314,36 @@
             indent=pkk_geti(indent)))
 
 
-## Output a main "Headword" or "Sense" node under it
+def pkk_get_list_str(dlist, dprefix, dfilter):
+    if len(dlist) > 0:
+        if dfilter:
+            tfmt = pkk_get_fmt(dprefix + "_list_item")
+            tlist = [tfmt.format(text=i) for i in dlist]
+        else:
+            tlist = dlist
+
+        return pkk_get_fmt(dprefix + "_list").format(
+            alist=pkk_get_fmt(dprefix + "_list_sep").join(tlist))
+    else:
+        return pkk_get_fmt(dprefix + "_list_empty")
+
+
+## Output a main "Headword" or "Sense" node
 def pkk_output_sense(indent, dnode):
-    # Search form and definition
-    pkk_output_subs(indent, dnode, "./SearchForm", "search_fmt")
+    # Definition for this sense
     pkk_output_subs(indent, dnode, "./Definition", "definition_fmt")
 
-    # Examples
+    # Examples for this sense
     for wnode in dnode.findall("./ExampleBlock/ExampleCtn"):
         geolist = []
         for qnode in wnode.findall("./FreeTopic[@type='levikki']/GeographicalUsage"):
-            geolist.append("{} [{}]".format(pkk_node_to_text(qnode), qnode.attrib["class"]))
-
-        if len(geolist) > 0:
-            geostr = pkk_get_fmt("example_geo_list").format(glist=pkk_get_fmt("example_geo_list_sep").join(geolist))
-        else:
-            geostr = pkk_get_fmt("example_geo_list_empty")
+            geolist.append(pkk_get_fmt("example_geo_list_item").format(
+                text=pkk_node_to_text(qnode),
+                tclass=qnode.attrib["class"]))
 
         pkk_print(pkk_get_fmt("example_fmt").format(
             text=pkk_node_to_text(wnode.find("./Example")),
-            geostr=geostr,
+            geostr=pkk_get_list_str(geolist, "example_geo", False),
             indent=pkk_geti(indent + 1)))
 
 
@@ -327,28 +351,39 @@
 def pkk_output_node(indent, dnode):
 
     for wnode in dnode.findall("./HeadwordCtn"):
+        # Get head word
+        headword = pkk_node_to_text(wnode.find("./Headword"))
+
+        # Collect search forms
+        srchlist = []
+        for qnode in wnode.findall("./SearchForm"):
+            srchlist.append(pkk_node_to_text(qnode))
+
+        # Remove dupe if headword is also in srchlist
+        if headword in srchlist:
+            srchlist.remove(headword)
+
+        # Remove other duplicates and sort
+        srchlist = list(set(srchlist))
+        srchlist.sort(reverse=False, key=lambda attr: (attr, len(attr)))
+
         # Create list with grammatical attributes (noun, verb, etc.)
-        tmpl = []
+        attrlist = []
         for pnode in wnode.findall("./PartOfSpeechCtn/PartOfSpeech"):
-            tmpl.append(pnode.attrib["freeValue"])
+            attrlist.append(pnode.attrib["freeValue"])
 
         for pnode in wnode.findall("./GrammaticalNote"):
-            tmpl.append(pkk_node_to_text(pnode))
+            attrlist.append(pkk_node_to_text(pnode))
 
         # Remove duplicates and sort the list
-        tmpl = list(set(tmpl))
-        tmpl.sort(reverse=False, key=lambda attr: (attr, len(attr)))
-
-        if len(tmpl) > 0:
-            astr = pkk_get_fmt("word_attr_list").format(
-                alist=pkk_get_fmt("word_attr_list_sep").join(tmpl))
-        else:
-            astr = pkk_get_fmt("word_attr_list_empty")
+        attrlist = list(set(attrlist))
+        attrlist.sort(reverse=False, key=lambda attr: (attr, len(attr)))
 
         # Print the headword and attributes if any
         pkk_print(pkk_get_fmt("word_fmt").format(
-            word=pkk_node_to_text(wnode.find("./Headword")),
-            attr=astr,
+            word=headword,
+            attr=pkk_get_list_str(attrlist, "word_attr", True),
+            search=pkk_get_list_str(srchlist, "search", True),
             indent=pkk_geti(indent)))
 
         # Print main "sense"