comparison lxmldump.py @ 61:9c36574199f5

Enhancements to the output flexibility.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 01 Jun 2021 13:24:34 +0300
parents cbed8ee15701
children 1932f588743f
comparison
equal deleted inserted replaced
60:cbed8ee15701 61:9c36574199f5
46 PKK_MODE_NORMAL: "<PTR:{href}>{text}</PTR>", 46 PKK_MODE_NORMAL: "<PTR:{href}>{text}</PTR>",
47 PKK_MODE_ANKI: "<a href='https://kaino.kotus.fi/cgi-bin/kks/karjala.cgi?a={href}'>{text}</a>", 47 PKK_MODE_ANKI: "<a href='https://kaino.kotus.fi/cgi-bin/kks/karjala.cgi?a={href}'>{text}</a>",
48 }, 48 },
49 49
50 "word_item": { 50 "word_item": {
51 PKK_MODE_NORMAL: "\"{word}\"{search}{attr}\n{hyphenation}{main_sense}{other_senses}\n", 51 PKK_MODE_NORMAL: "\"{word}\"{search}{attr}{hyphenation}{main_sense}{other_senses}\n",
52 PKK_MODE_ANKI: "\"{word}\"{search}{attr}{hyphenation};{main_sense}{other_senses}\n", 52 PKK_MODE_ANKI: "\"{word}\"{search}{attr}{hyphenation};{main_sense}{other_senses}\n",
53 }, 53 },
54 "word_attr_list": { 54 "word_attr_list": {
55 PKK_MODE_NORMAL: " ({alist}) ", 55 PKK_MODE_NORMAL: "{indent}attr \"({alist})\"\n",
56 PKK_MODE_NORMAL: " ({alist})", 56 PKK_MODE_ANKI: " ({alist})",
57 }, 57 },
58 "word_attr_list_empty": { 58 "word_attr_list_empty": {
59 PKK_MODE_NORMAL: " ", 59 PKK_MODE_NORMAL: " ",
60 PKK_MODE_ANKI: "", 60 PKK_MODE_ANKI: "",
61 }, 61 },
65 "word_attr_list_sep": { 65 "word_attr_list_sep": {
66 PKK_MODE_NORMAL: " ; ", 66 PKK_MODE_NORMAL: " ; ",
67 PKK_MODE_ANKI: " : ", 67 PKK_MODE_ANKI: " : ",
68 }, 68 },
69 69
70 "search_list": {
71 PKK_MODE_NORMAL: ", {alist}",
72 },
73 "search_list_empty": {
74 PKK_MODE_NORMAL: "",
75 },
76 "search_list_item": {
77 PKK_MODE_NORMAL: "\"{text}\"",
78 },
79 "search_list_sep": {
80 PKK_MODE_NORMAL: ", ",
81 },
82
83 "hyphenation": { 70 "hyphenation": {
84 PKK_MODE_NORMAL: "{indent}hyph \"{text}\"\n", 71 PKK_MODE_NORMAL: "{indent}hyph \"{text}\"\n",
85 PKK_MODE_ANKI: " [hyph: {text}]", 72 PKK_MODE_ANKI: " [hyph: {text}]",
86 }, 73 },
87 "no_hyphenation": { 74 "no_hyphenation": {
88 PKK_MODE_NORMAL: "", 75 PKK_MODE_NORMAL: "",
89 }, 76 },
90 77
78
79 "search_list": {
80 PKK_MODE_NORMAL: ", {alist}\n",
81 PKK_MODE_ANKI: ", {alist}",
82 },
83 "search_list_empty": {
84 PKK_MODE_NORMAL: "",
85 },
86 "search_list_item": {
87 PKK_MODE_NORMAL: "\"{text}\"",
88 },
89 "search_list_sep": {
90 PKK_MODE_NORMAL: ", ",
91 },
92
93 "main_sense_item": {
94 PKK_MODE_NORMAL: "{definition}{example_list}",
95 },
91 "sense_list": { 96 "sense_list": {
92 PKK_MODE_NORMAL: " | {alist}", 97 PKK_MODE_NORMAL: "{alist}",
98 PKK_MODE_ANKI: " | {alist}",
93 }, 99 },
94 "sense_list_empty": { 100 "sense_list_empty": {
95 PKK_MODE_NORMAL: "", 101 PKK_MODE_NORMAL: "",
96 }, 102 },
97 "sense_list_item": { 103 "sense_list_item": {
98 PKK_MODE_NORMAL: "{indent}sense #{index}\n{text}", 104 PKK_MODE_NORMAL: "{indent}sense #{index}:\n{definition}{example_list}",
99 PKK_MODE_ANKI: "#{index}:{text}", 105 PKK_MODE_ANKI: "#{index}:{definition}{example_list}",
100 }, 106 },
101 "sense_list_sep": { 107 "sense_list_sep": {
102 PKK_MODE_NORMAL: "", 108 PKK_MODE_NORMAL: "",
103 PKK_MODE_NORMAL: " | ", 109 PKK_MODE_NORMAL: " | ",
104 }, 110 },
110 116
111 "example_item": { 117 "example_item": {
112 PKK_MODE_NORMAL: "{indent}exmp \"{text}\"{geo_list}\n", 118 PKK_MODE_NORMAL: "{indent}exmp \"{text}\"{geo_list}\n",
113 PKK_MODE_ANKI: " * \"{text}\"{geo_list}", 119 PKK_MODE_ANKI: " * \"{text}\"{geo_list}",
114 }, 120 },
121 "example_item_sep": {
122 PKK_MODE_NORMAL: "",
123 PKK_MODE_ANKI: "",
124 },
125
115 "example_geo_list": { 126 "example_geo_list": {
116 PKK_MODE_NORMAL: " ({alist})", 127 PKK_MODE_NORMAL: " ({alist})",
117 }, 128 },
118 "example_geo_list_empty": { 129 "example_geo_list_empty": {
119 PKK_MODE_NORMAL: "", 130 PKK_MODE_NORMAL: "",
330 text=pkk_node_to_text(qnode), 341 text=pkk_node_to_text(qnode),
331 indent=pkk_geti(indent)) 342 indent=pkk_geti(indent))
332 return ostr 343 return ostr
333 344
334 345
335 def pkk_get_list_str(dlist, dprefix, dfilter): 346 def pkk_get_list_str(dindent, dlist, dprefix, dfilter):
336 if len(dlist) > 0: 347 if len(dlist) > 0:
337 if dfilter: 348 if dfilter:
338 tfmt = pkk_get_fmt(dprefix + "_list_item") 349 tfmt = pkk_get_fmt(dprefix + "_list_item")
339 tlist = [tfmt.format(text=i) for i in dlist] 350 tlist = [tfmt.format(text=i) for i in dlist]
340 else: 351 else:
341 tlist = dlist 352 tlist = dlist
342 353
343 return pkk_get_fmt(dprefix + "_list").format( 354 return pkk_get_fmt(dprefix + "_list").format(
344 alist=pkk_get_fmt(dprefix + "_list_sep").join(tlist)) 355 alist=pkk_get_fmt(dprefix + "_list_sep").join(tlist),
356 indent=pkk_geti(dindent))
345 else: 357 else:
346 return pkk_get_fmt(dprefix + "_list_empty") 358 return pkk_get_fmt(dprefix + "_list_empty").format(
347 359 indent=pkk_geti(dindent))
348 360
349 ## Output a main "Headword" or "Sense" node 361
350 def pkk_get_sense(indent, dnode): 362 ## Get definition and examples from node
351 # Definition for this sense 363 def pkk_get_sense(indent, dnode, dname, dindex):
352 ostr = pkk_get_subs(indent, dnode, "./Definition", "definition_item") 364 exlist = []
353 365 index = 1
354 # Examples for this sense
355 for wnode in dnode.findall("./ExampleBlock/ExampleCtn"): 366 for wnode in dnode.findall("./ExampleBlock/ExampleCtn"):
356 geolist = [] 367 geolist = []
357 for qnode in wnode.findall("./FreeTopic[@type='levikki']/GeographicalUsage"): 368 for qnode in wnode.findall("./FreeTopic[@type='levikki']/GeographicalUsage"):
358 geolist.append(pkk_get_fmt("example_geo_list_item").format( 369 geolist.append(pkk_get_fmt("example_geo_list_item").format(
359 text=pkk_node_to_text(qnode), 370 text=pkk_node_to_text(qnode),
360 tclass=qnode.attrib["class"])) 371 tclass=qnode.attrib["class"],
361 372 indent=pkk_geti(indent + 2)))
362 ostr += pkk_get_fmt("example_item").format( 373
374 exlist.append(pkk_get_fmt("example_item").format(
363 text=pkk_node_to_text(wnode.find("./Example")), 375 text=pkk_node_to_text(wnode.find("./Example")),
364 geo_list=pkk_get_list_str(geolist, "example_geo", False), 376 geo_list=pkk_get_list_str(indent + 1, geolist, "example_geo", False),
365 indent=pkk_geti(indent + 1)) 377 indent=pkk_geti(indent + 1),
366 378 index=index))
367 return ostr 379 index += 1
380
381 return pkk_get_fmt(dname).format(
382 definition=pkk_get_subs(indent, dnode, "./Definition", "definition_item"),
383 example_list=pkk_get_fmt("example_item_sep").join(exlist),
384 indent=pkk_geti(indent),
385 index=dindex)
368 386
369 387
370 ## Output one "DictionaryEntry" node 388 ## Output one "DictionaryEntry" node
371 def pkk_output_node(indent, dnode): 389 def pkk_output_node(indent, dnode):
372 390
408 # Remove duplicates and sort the list 426 # Remove duplicates and sort the list
409 attrlist = list(set(attrlist)) 427 attrlist = list(set(attrlist))
410 attrlist.sort(reverse=False, key=lambda attr: (attr, len(attr))) 428 attrlist.sort(reverse=False, key=lambda attr: (attr, len(attr)))
411 429
412 # Get main "sense" 430 # Get main "sense"
413 msense = pkk_get_sense(indent + 1, wnode) 431 msense = pkk_get_sense(indent + 1, wnode, "main_sense_item", 0)
414 432
415 # Print any other "senses" 433 # Print any other "senses"
416 index = 1 434 index = 1
417 senselist = [] 435 senselist = []
418 for znode in dnode.findall("./SenseGrp"): 436 for znode in dnode.findall("./SenseGrp"):
419 senselist.append(pkk_get_fmt("sense_list_item").format( 437 senselist.append(pkk_get_sense(indent + 1, znode, "sense_list_item", index))
420 index=index,
421 text=pkk_get_sense(indent + 2, znode),
422 indent=pkk_geti(indent + 1)))
423 index += 1 438 index += 1
424 439
425 # Print the headword and attributes if any 440 # Print the headword and attributes if any
426 pkk_print(pkk_get_fmt("word_item").format( 441 pkk_print(pkk_get_fmt("word_item").format(
427 word=headword, 442 word=headword,
428 attr=pkk_get_list_str(attrlist, "word_attr", True), 443 attr=pkk_get_list_str(indent + 1, attrlist, "word_attr", True),
429 search=pkk_get_list_str(srchlist, "search", True), 444 search=pkk_get_list_str(indent + 1, srchlist, "search", True),
430 hyphenation=hyphenation, 445 hyphenation=hyphenation,
431 main_sense=msense, 446 main_sense=msense,
432 other_senses=pkk_get_list_str(senselist, "sense", False), 447 other_senses=pkk_get_list_str(indent + 1, senselist, "sense", False),
433 indent=pkk_geti(indent))) 448 indent=pkk_geti(indent)))
434 449
435 450
436 ### 451 ###
437 ### Main program starts 452 ### Main program starts