Mercurial > hg > lxmldump
comparison lxmldump.py @ 43:8ed576574712
More improvements to output flexibility.
author | Matti Hamalainen <ccr@tnsp.org> |
---|---|
date | Wed, 26 May 2021 13:12:02 +0300 |
parents | 508de0f6836b |
children | d7b4b2fb0214 |
comparison
legend: equal | deleted | inserted | replaced
42:508de0f6836b | 43:8ed576574712 |
---|---|
46 PKK_MODE_NORMAL: u"<PTR:{href}>{text}</PTR>", | 46 PKK_MODE_NORMAL: u"<PTR:{href}>{text}</PTR>", |
47 PKK_MODE_ANKI: u"<a href='https://kaino.kotus.fi/cgi-bin/kks/karjala.cgi?a={href}'>{text}</a>", | 47 PKK_MODE_ANKI: u"<a href='https://kaino.kotus.fi/cgi-bin/kks/karjala.cgi?a={href}'>{text}</a>", |
48 }, | 48 }, |
49 | 49 |
50 "word_fmt": { | 50 "word_fmt": { |
51 PKK_MODE_NORMAL: "\"{word}\"{attr}\n", | 51 PKK_MODE_NORMAL: "\"{word}\"{search}{attr}\n", |
52 PKK_MODE_ANKI: "{word}{attr}\n", | 52 PKK_MODE_ANKI: "{word}{attr}\n", |
53 }, | 53 }, |
54 "word_attr_list": { | 54 "word_attr_list": { |
55 PKK_MODE_NORMAL: " ({alist}) ", | 55 PKK_MODE_NORMAL: " ({alist}) ", |
56 }, | 56 }, |
57 "word_attr_list_empty": { | 57 "word_attr_list_empty": { |
58 PKK_MODE_NORMAL: " ", | 58 PKK_MODE_NORMAL: " ", |
59 }, | |
60 "word_attr_list_item": { | |
61 PKK_MODE_NORMAL: "{text}", | |
59 }, | 62 }, |
60 "word_attr_list_sep": { | 63 "word_attr_list_sep": { |
61 PKK_MODE_NORMAL: " ; ", | 64 PKK_MODE_NORMAL: " ; ", |
62 PKK_MODE_ANKI: " : ", | 65 PKK_MODE_ANKI: " : ", |
63 }, | 66 }, |
64 | 67 |
68 "search_list": { | |
69 PKK_MODE_NORMAL: ", {alist}", | |
70 }, | |
71 "search_list_empty": { | |
72 PKK_MODE_NORMAL: "", | |
73 }, | |
74 "search_list_item": { | |
75 PKK_MODE_NORMAL: "\"{text}\"", | |
76 }, | |
77 "search_list_sep": { | |
78 PKK_MODE_NORMAL: ", ", | |
79 }, | |
80 | |
65 "sense_index": { | 81 "sense_index": { |
66 PKK_MODE_NORMAL: "{indent}sense #{index}\n", | 82 PKK_MODE_NORMAL: "{indent}sense #{index}\n", |
67 PKK_MODE_ANKI: "[{index}]:\n", | 83 PKK_MODE_ANKI: "[{index}]:\n", |
68 }, | 84 }, |
69 | 85 |
70 "search_fmt": { | |
71 PKK_MODE_NORMAL: "{indent}srch \"{text}\"\n", | |
72 PKK_MODE_ANKI: "{text}\n", | |
73 }, | |
74 | |
75 "definition_fmt": { | 86 "definition_fmt": { |
76 PKK_MODE_NORMAL: "{indent}defn \"{text}\"\n", | 87 PKK_MODE_NORMAL: "{indent}defn \"{text}\"\n", |
77 PKK_MODE_ANKI: "? {text}\n", | 88 PKK_MODE_ANKI: "? {text}\n", |
78 }, | 89 }, |
79 | 90 |
80 "example_fmt": { | 91 "example_fmt": { |
81 PKK_MODE_NORMAL: "{indent}exmp \"{text}\"{geostr}\n", | 92 PKK_MODE_NORMAL: "{indent}exmp \"{text}\"{geostr}\n", |
82 PKK_MODE_ANKI: "- {text}{geostr}\n", | 93 PKK_MODE_ANKI: "- {text}{geostr}\n", |
83 }, | 94 }, |
84 "example_geo_list": { | 95 "example_geo_list": { |
85 PKK_MODE_NORMAL: " ({glist})", | 96 PKK_MODE_NORMAL: " ({alist})", |
86 }, | 97 }, |
87 "example_geo_list_empty": { | 98 "example_geo_list_empty": { |
88 PKK_MODE_NORMAL: "", | 99 PKK_MODE_NORMAL: "", |
100 }, | |
101 "example_geo_list_item": { | |
102 PKK_MODE_NORMAL: "{text} [{tclass}]", | |
89 }, | 103 }, |
90 "example_geo_list_sep": { | 104 "example_geo_list_sep": { |
91 PKK_MODE_NORMAL: ", ", | 105 PKK_MODE_NORMAL: ", ", |
92 }, | 106 }, |
93 | 107 |
298 pkk_print(dfmt.format( | 312 pkk_print(dfmt.format( |
299 text=pkk_node_to_text(qnode), | 313 text=pkk_node_to_text(qnode), |
300 indent=pkk_geti(indent))) | 314 indent=pkk_geti(indent))) |
301 | 315 |
302 | 316 |
303 ## Output a main "Headword" or "Sense" node under it | 317 def pkk_get_list_str(dlist, dprefix, dfilter): |
318 if len(dlist) > 0: | |
319 if dfilter: | |
320 tfmt = pkk_get_fmt(dprefix + "_list_item") | |
321 tlist = [tfmt.format(text=i) for i in dlist] | |
322 else: | |
323 tlist = dlist | |
324 | |
325 return pkk_get_fmt(dprefix + "_list").format( | |
326 alist=pkk_get_fmt(dprefix + "_list_sep").join(tlist)) | |
327 else: | |
328 return pkk_get_fmt(dprefix + "_list_empty") | |
329 | |
330 | |
331 ## Output a main "Headword" or "Sense" node | |
304 def pkk_output_sense(indent, dnode): | 332 def pkk_output_sense(indent, dnode): |
305 # Search form and definition | 333 # Definition for this sense |
306 pkk_output_subs(indent, dnode, "./SearchForm", "search_fmt") | |
307 pkk_output_subs(indent, dnode, "./Definition", "definition_fmt") | 334 pkk_output_subs(indent, dnode, "./Definition", "definition_fmt") |
308 | 335 |
309 # Examples | 336 # Examples for this sense |
310 for wnode in dnode.findall("./ExampleBlock/ExampleCtn"): | 337 for wnode in dnode.findall("./ExampleBlock/ExampleCtn"): |
311 geolist = [] | 338 geolist = [] |
312 for qnode in wnode.findall("./FreeTopic[@type='levikki']/GeographicalUsage"): | 339 for qnode in wnode.findall("./FreeTopic[@type='levikki']/GeographicalUsage"): |
313 geolist.append("{} [{}]".format(pkk_node_to_text(qnode), qnode.attrib["class"])) | 340 geolist.append(pkk_get_fmt("example_geo_list_item").format( |
314 | 341 text=pkk_node_to_text(qnode), |
315 if len(geolist) > 0: | 342 tclass=qnode.attrib["class"])) |
316 geostr = pkk_get_fmt("example_geo_list").format(glist=pkk_get_fmt("example_geo_list_sep").join(geolist)) | |
317 else: | |
318 geostr = pkk_get_fmt("example_geo_list_empty") | |
319 | 343 |
320 pkk_print(pkk_get_fmt("example_fmt").format( | 344 pkk_print(pkk_get_fmt("example_fmt").format( |
321 text=pkk_node_to_text(wnode.find("./Example")), | 345 text=pkk_node_to_text(wnode.find("./Example")), |
322 geostr=geostr, | 346 geostr=pkk_get_list_str(geolist, "example_geo", False), |
323 indent=pkk_geti(indent + 1))) | 347 indent=pkk_geti(indent + 1))) |
324 | 348 |
325 | 349 |
326 ## Output one "DictionaryEntry" node | 350 ## Output one "DictionaryEntry" node |
327 def pkk_output_node(indent, dnode): | 351 def pkk_output_node(indent, dnode): |
328 | 352 |
329 for wnode in dnode.findall("./HeadwordCtn"): | 353 for wnode in dnode.findall("./HeadwordCtn"): |
354 # Get head word | |
355 headword = pkk_node_to_text(wnode.find("./Headword")) | |
356 | |
357 # Collect search forms | |
358 srchlist = [] | |
359 for qnode in wnode.findall("./SearchForm"): | |
360 srchlist.append(pkk_node_to_text(qnode)) | |
361 | |
362 # Remove dupe if headword is also in srchlist | |
363 if headword in srchlist: | |
364 srchlist.remove(headword) | |
365 | |
366 # Remove other duplicates and sort | |
367 srchlist = list(set(srchlist)) | |
368 srchlist.sort(reverse=False, key=lambda attr: (attr, len(attr))) | |
369 | |
330 # Create list with grammatical attributes (noun, verb, etc.) | 370 # Create list with grammatical attributes (noun, verb, etc.) |
331 tmpl = [] | 371 attrlist = [] |
332 for pnode in wnode.findall("./PartOfSpeechCtn/PartOfSpeech"): | 372 for pnode in wnode.findall("./PartOfSpeechCtn/PartOfSpeech"): |
333 tmpl.append(pnode.attrib["freeValue"]) | 373 attrlist.append(pnode.attrib["freeValue"]) |
334 | 374 |
335 for pnode in wnode.findall("./GrammaticalNote"): | 375 for pnode in wnode.findall("./GrammaticalNote"): |
336 tmpl.append(pkk_node_to_text(pnode)) | 376 attrlist.append(pkk_node_to_text(pnode)) |
337 | 377 |
338 # Remove duplicates and sort the list | 378 # Remove duplicates and sort the list |
339 tmpl = list(set(tmpl)) | 379 attrlist = list(set(attrlist)) |
340 tmpl.sort(reverse=False, key=lambda attr: (attr, len(attr))) | 380 attrlist.sort(reverse=False, key=lambda attr: (attr, len(attr))) |
341 | |
342 if len(tmpl) > 0: | |
343 astr = pkk_get_fmt("word_attr_list").format( | |
344 alist=pkk_get_fmt("word_attr_list_sep").join(tmpl)) | |
345 else: | |
346 astr = pkk_get_fmt("word_attr_list_empty") | |
347 | 381 |
348 # Print the headword and attributes if any | 382 # Print the headword and attributes if any |
349 pkk_print(pkk_get_fmt("word_fmt").format( | 383 pkk_print(pkk_get_fmt("word_fmt").format( |
350 word=pkk_node_to_text(wnode.find("./Headword")), | 384 word=headword, |
351 attr=astr, | 385 attr=pkk_get_list_str(attrlist, "word_attr", True), |
386 search=pkk_get_list_str(srchlist, "search", True), | |
352 indent=pkk_geti(indent))) | 387 indent=pkk_geti(indent))) |
353 | 388 |
354 # Print main "sense" | 389 # Print main "sense" |
355 pkk_output_sense(indent + 1, wnode) | 390 pkk_output_sense(indent + 1, wnode) |
356 | 391 |