# HG changeset patch # User Matti Hamalainen # Date 1622023922 -10800 # Node ID 8ed576574712e28f438e0f06c25d98c66b8195a9 # Parent 508de0f6836be183604015be717d859f1b011fd4 More improvements to output flexibility. diff -r 508de0f6836b -r 8ed576574712 lxmldump.py --- a/lxmldump.py Wed May 26 12:23:30 2021 +0300 +++ b/lxmldump.py Wed May 26 13:12:02 2021 +0300 @@ -48,7 +48,7 @@ }, "word_fmt": { - PKK_MODE_NORMAL: "\"{word}\"{attr}\n", + PKK_MODE_NORMAL: "\"{word}\"{search}{attr}\n", PKK_MODE_ANKI: "{word}{attr}\n", }, "word_attr_list": { @@ -57,21 +57,32 @@ "word_attr_list_empty": { PKK_MODE_NORMAL: " ", }, + "word_attr_list_item": { + PKK_MODE_NORMAL: "{text}", + }, "word_attr_list_sep": { PKK_MODE_NORMAL: " ; ", PKK_MODE_ANKI: " : ", }, + "search_list": { + PKK_MODE_NORMAL: ", {alist}", + }, + "search_list_empty": { + PKK_MODE_NORMAL: "", + }, + "search_list_item": { + PKK_MODE_NORMAL: "\"{text}\"", + }, + "search_list_sep": { + PKK_MODE_NORMAL: ", ", + }, + "sense_index": { PKK_MODE_NORMAL: "{indent}sense #{index}\n", PKK_MODE_ANKI: "[{index}]:\n", }, - "search_fmt": { - PKK_MODE_NORMAL: "{indent}srch \"{text}\"\n", - PKK_MODE_ANKI: "{text}\n", - }, - "definition_fmt": { PKK_MODE_NORMAL: "{indent}defn \"{text}\"\n", PKK_MODE_ANKI: "? {text}\n", @@ -82,11 +93,14 @@ PKK_MODE_ANKI: "- {text}{geostr}\n", }, "example_geo_list": { - PKK_MODE_NORMAL: " ({glist})", + PKK_MODE_NORMAL: " ({alist})", }, "example_geo_list_empty": { PKK_MODE_NORMAL: "", }, + "example_geo_list_item": { + PKK_MODE_NORMAL: "{text} [{tclass}]", + }, "example_geo_list_sep": { PKK_MODE_NORMAL: ", ", }, @@ -300,26 +314,36 @@ indent=pkk_geti(indent))) -## Output a main "Headword" or "Sense" node under it +def pkk_get_list_str(dlist, dprefix, dfilter): + if len(dlist) > 0: + if dfilter: + tfmt = pkk_get_fmt(dprefix + "_list_item") + tlist = [tfmt.format(text=i) for i in dlist] + else: + tlist = dlist + + return pkk_get_fmt(dprefix + "_list").format( + alist=pkk_get_fmt(dprefix + "_list_sep").join(tlist)) + else: + return pkk_get_fmt(dprefix + "_list_empty") + + +## Output a main "Headword" or "Sense" node def pkk_output_sense(indent, dnode): - # Search form and definition - pkk_output_subs(indent, dnode, "./SearchForm", "search_fmt") + # Definition for this sense pkk_output_subs(indent, dnode, "./Definition", "definition_fmt") - # Examples + # Examples for this sense for wnode in dnode.findall("./ExampleBlock/ExampleCtn"): geolist = [] for qnode in wnode.findall("./FreeTopic[@type='levikki']/GeographicalUsage"): - geolist.append("{} [{}]".format(pkk_node_to_text(qnode), qnode.attrib["class"])) - - if len(geolist) > 0: - geostr = pkk_get_fmt("example_geo_list").format(glist=pkk_get_fmt("example_geo_list_sep").join(geolist)) - else: - geostr = pkk_get_fmt("example_geo_list_empty") + geolist.append(pkk_get_fmt("example_geo_list_item").format( + text=pkk_node_to_text(qnode), + tclass=qnode.attrib["class"])) pkk_print(pkk_get_fmt("example_fmt").format( text=pkk_node_to_text(wnode.find("./Example")), - geostr=geostr, + geostr=pkk_get_list_str(geolist, "example_geo", False), indent=pkk_geti(indent + 1))) @@ -327,28 +351,39 @@ def pkk_output_node(indent, dnode): for wnode in dnode.findall("./HeadwordCtn"): + # Get head word + headword = pkk_node_to_text(wnode.find("./Headword")) + + # Collect search forms + srchlist = [] + for qnode in wnode.findall("./SearchForm"): + srchlist.append(pkk_node_to_text(qnode)) + + # Remove dupe if headword is also in srchlist + if headword in srchlist: + srchlist.remove(headword) + + # Remove other duplicates and sort + srchlist = list(set(srchlist)) + srchlist.sort(reverse=False, key=lambda attr: (attr, len(attr))) + # Create list with grammatical attributes (noun, verb, etc.) - tmpl = [] + attrlist = [] for pnode in wnode.findall("./PartOfSpeechCtn/PartOfSpeech"): - tmpl.append(pnode.attrib["freeValue"]) + attrlist.append(pnode.attrib["freeValue"]) for pnode in wnode.findall("./GrammaticalNote"): - tmpl.append(pkk_node_to_text(pnode)) + attrlist.append(pkk_node_to_text(pnode)) # Remove duplicates and sort the list - tmpl = list(set(tmpl)) - tmpl.sort(reverse=False, key=lambda attr: (attr, len(attr))) - - if len(tmpl) > 0: - astr = pkk_get_fmt("word_attr_list").format( - alist=pkk_get_fmt("word_attr_list_sep").join(tmpl)) - else: - astr = pkk_get_fmt("word_attr_list_empty") + attrlist = list(set(attrlist)) + attrlist.sort(reverse=False, key=lambda attr: (attr, len(attr))) # Print the headword and attributes if any pkk_print(pkk_get_fmt("word_fmt").format( - word=pkk_node_to_text(wnode.find("./Headword")), - attr=astr, + word=headword, + attr=pkk_get_list_str(attrlist, "word_attr", True), + search=pkk_get_list_str(srchlist, "search", True), indent=pkk_geti(indent))) # Print main "sense"