comparison lxmldump.py @ 43:8ed576574712

More improvements to output flexibility.
author Matti Hamalainen <ccr@tnsp.org>
date Wed, 26 May 2021 13:12:02 +0300
parents 508de0f6836b
children d7b4b2fb0214
comparison
equal deleted inserted replaced
42:508de0f6836b 43:8ed576574712
46 PKK_MODE_NORMAL: u"<PTR:{href}>{text}</PTR>", 46 PKK_MODE_NORMAL: u"<PTR:{href}>{text}</PTR>",
47 PKK_MODE_ANKI: u"<a href='https://kaino.kotus.fi/cgi-bin/kks/karjala.cgi?a={href}'>{text}</a>", 47 PKK_MODE_ANKI: u"<a href='https://kaino.kotus.fi/cgi-bin/kks/karjala.cgi?a={href}'>{text}</a>",
48 }, 48 },
49 49
50 "word_fmt": { 50 "word_fmt": {
51 PKK_MODE_NORMAL: "\"{word}\"{attr}\n", 51 PKK_MODE_NORMAL: "\"{word}\"{search}{attr}\n",
52 PKK_MODE_ANKI: "{word}{attr}\n", 52 PKK_MODE_ANKI: "{word}{attr}\n",
53 }, 53 },
54 "word_attr_list": { 54 "word_attr_list": {
55 PKK_MODE_NORMAL: " ({alist}) ", 55 PKK_MODE_NORMAL: " ({alist}) ",
56 }, 56 },
57 "word_attr_list_empty": { 57 "word_attr_list_empty": {
58 PKK_MODE_NORMAL: " ", 58 PKK_MODE_NORMAL: " ",
59 },
60 "word_attr_list_item": {
61 PKK_MODE_NORMAL: "{text}",
59 }, 62 },
60 "word_attr_list_sep": { 63 "word_attr_list_sep": {
61 PKK_MODE_NORMAL: " ; ", 64 PKK_MODE_NORMAL: " ; ",
62 PKK_MODE_ANKI: " : ", 65 PKK_MODE_ANKI: " : ",
63 }, 66 },
64 67
68 "search_list": {
69 PKK_MODE_NORMAL: ", {alist}",
70 },
71 "search_list_empty": {
72 PKK_MODE_NORMAL: "",
73 },
74 "search_list_item": {
75 PKK_MODE_NORMAL: "\"{text}\"",
76 },
77 "search_list_sep": {
78 PKK_MODE_NORMAL: ", ",
79 },
80
65 "sense_index": { 81 "sense_index": {
66 PKK_MODE_NORMAL: "{indent}sense #{index}\n", 82 PKK_MODE_NORMAL: "{indent}sense #{index}\n",
67 PKK_MODE_ANKI: "[{index}]:\n", 83 PKK_MODE_ANKI: "[{index}]:\n",
68 }, 84 },
69 85
70 "search_fmt": {
71 PKK_MODE_NORMAL: "{indent}srch \"{text}\"\n",
72 PKK_MODE_ANKI: "{text}\n",
73 },
74
75 "definition_fmt": { 86 "definition_fmt": {
76 PKK_MODE_NORMAL: "{indent}defn \"{text}\"\n", 87 PKK_MODE_NORMAL: "{indent}defn \"{text}\"\n",
77 PKK_MODE_ANKI: "? {text}\n", 88 PKK_MODE_ANKI: "? {text}\n",
78 }, 89 },
79 90
80 "example_fmt": { 91 "example_fmt": {
81 PKK_MODE_NORMAL: "{indent}exmp \"{text}\"{geostr}\n", 92 PKK_MODE_NORMAL: "{indent}exmp \"{text}\"{geostr}\n",
82 PKK_MODE_ANKI: "- {text}{geostr}\n", 93 PKK_MODE_ANKI: "- {text}{geostr}\n",
83 }, 94 },
84 "example_geo_list": { 95 "example_geo_list": {
85 PKK_MODE_NORMAL: " ({glist})", 96 PKK_MODE_NORMAL: " ({alist})",
86 }, 97 },
87 "example_geo_list_empty": { 98 "example_geo_list_empty": {
88 PKK_MODE_NORMAL: "", 99 PKK_MODE_NORMAL: "",
100 },
101 "example_geo_list_item": {
102 PKK_MODE_NORMAL: "{text} [{tclass}]",
89 }, 103 },
90 "example_geo_list_sep": { 104 "example_geo_list_sep": {
91 PKK_MODE_NORMAL: ", ", 105 PKK_MODE_NORMAL: ", ",
92 }, 106 },
93 107
298 pkk_print(dfmt.format( 312 pkk_print(dfmt.format(
299 text=pkk_node_to_text(qnode), 313 text=pkk_node_to_text(qnode),
300 indent=pkk_geti(indent))) 314 indent=pkk_geti(indent)))
301 315
302 316
303 ## Output a main "Headword" or "Sense" node under it 317 def pkk_get_list_str(dlist, dprefix, dfilter):
318 if len(dlist) > 0:
319 if dfilter:
320 tfmt = pkk_get_fmt(dprefix + "_list_item")
321 tlist = [tfmt.format(text=i) for i in dlist]
322 else:
323 tlist = dlist
324
325 return pkk_get_fmt(dprefix + "_list").format(
326 alist=pkk_get_fmt(dprefix + "_list_sep").join(tlist))
327 else:
328 return pkk_get_fmt(dprefix + "_list_empty")
329
330
331 ## Output a main "Headword" or "Sense" node
304 def pkk_output_sense(indent, dnode): 332 def pkk_output_sense(indent, dnode):
305 # Search form and definition 333 # Definition for this sense
306 pkk_output_subs(indent, dnode, "./SearchForm", "search_fmt")
307 pkk_output_subs(indent, dnode, "./Definition", "definition_fmt") 334 pkk_output_subs(indent, dnode, "./Definition", "definition_fmt")
308 335
309 # Examples 336 # Examples for this sense
310 for wnode in dnode.findall("./ExampleBlock/ExampleCtn"): 337 for wnode in dnode.findall("./ExampleBlock/ExampleCtn"):
311 geolist = [] 338 geolist = []
312 for qnode in wnode.findall("./FreeTopic[@type='levikki']/GeographicalUsage"): 339 for qnode in wnode.findall("./FreeTopic[@type='levikki']/GeographicalUsage"):
313 geolist.append("{} [{}]".format(pkk_node_to_text(qnode), qnode.attrib["class"])) 340 geolist.append(pkk_get_fmt("example_geo_list_item").format(
314 341 text=pkk_node_to_text(qnode),
315 if len(geolist) > 0: 342 tclass=qnode.attrib["class"]))
316 geostr = pkk_get_fmt("example_geo_list").format(glist=pkk_get_fmt("example_geo_list_sep").join(geolist))
317 else:
318 geostr = pkk_get_fmt("example_geo_list_empty")
319 343
320 pkk_print(pkk_get_fmt("example_fmt").format( 344 pkk_print(pkk_get_fmt("example_fmt").format(
321 text=pkk_node_to_text(wnode.find("./Example")), 345 text=pkk_node_to_text(wnode.find("./Example")),
322 geostr=geostr, 346 geostr=pkk_get_list_str(geolist, "example_geo", False),
323 indent=pkk_geti(indent + 1))) 347 indent=pkk_geti(indent + 1)))
324 348
325 349
326 ## Output one "DictionaryEntry" node 350 ## Output one "DictionaryEntry" node
327 def pkk_output_node(indent, dnode): 351 def pkk_output_node(indent, dnode):
328 352
329 for wnode in dnode.findall("./HeadwordCtn"): 353 for wnode in dnode.findall("./HeadwordCtn"):
354 # Get head word
355 headword = pkk_node_to_text(wnode.find("./Headword"))
356
357 # Collect search forms
358 srchlist = []
359 for qnode in wnode.findall("./SearchForm"):
360 srchlist.append(pkk_node_to_text(qnode))
361
362 # Remove dupe if headword is also in srchlist
363 if headword in srchlist:
364 srchlist.remove(headword)
365
366 # Remove other duplicates and sort
367 srchlist = list(set(srchlist))
368 srchlist.sort(reverse=False, key=lambda attr: (attr, len(attr)))
369
330 # Create list with grammatical attributes (noun, verb, etc.) 370 # Create list with grammatical attributes (noun, verb, etc.)
331 tmpl = [] 371 attrlist = []
332 for pnode in wnode.findall("./PartOfSpeechCtn/PartOfSpeech"): 372 for pnode in wnode.findall("./PartOfSpeechCtn/PartOfSpeech"):
333 tmpl.append(pnode.attrib["freeValue"]) 373 attrlist.append(pnode.attrib["freeValue"])
334 374
335 for pnode in wnode.findall("./GrammaticalNote"): 375 for pnode in wnode.findall("./GrammaticalNote"):
336 tmpl.append(pkk_node_to_text(pnode)) 376 attrlist.append(pkk_node_to_text(pnode))
337 377
338 # Remove duplicates and sort the list 378 # Remove duplicates and sort the list
339 tmpl = list(set(tmpl)) 379 attrlist = list(set(attrlist))
340 tmpl.sort(reverse=False, key=lambda attr: (attr, len(attr))) 380 attrlist.sort(reverse=False, key=lambda attr: (attr, len(attr)))
341
342 if len(tmpl) > 0:
343 astr = pkk_get_fmt("word_attr_list").format(
344 alist=pkk_get_fmt("word_attr_list_sep").join(tmpl))
345 else:
346 astr = pkk_get_fmt("word_attr_list_empty")
347 381
348 # Print the headword and attributes if any 382 # Print the headword and attributes if any
349 pkk_print(pkk_get_fmt("word_fmt").format( 383 pkk_print(pkk_get_fmt("word_fmt").format(
350 word=pkk_node_to_text(wnode.find("./Headword")), 384 word=headword,
351 attr=astr, 385 attr=pkk_get_list_str(attrlist, "word_attr", True),
386 search=pkk_get_list_str(srchlist, "search", True),
352 indent=pkk_geti(indent))) 387 indent=pkk_geti(indent)))
353 388
354 # Print main "sense" 389 # Print main "sense"
355 pkk_output_sense(indent + 1, wnode) 390 pkk_output_sense(indent + 1, wnode)
356 391