# HG changeset patch # User Matti Hamalainen # Date 1620685617 -10800 # Node ID 4b4299b62f7fe03d2ba4bd2f1d78a52481f2bca8 # Parent 34a89d61dbe78e9ff438a4ba1f4174b78fc9b950 Moar work. diff -r 34a89d61dbe7 -r 4b4299b62f7f lxmldump.py --- a/lxmldump.py Mon May 10 21:43:00 2021 +0300 +++ b/lxmldump.py Tue May 11 01:26:57 2021 +0300 @@ -23,9 +23,25 @@ pkk_cfg = { "dump": False, "normalize": False, + + "debug": False, } +pkk_str_fmap = { + "Fragment" : ["<", ">"], +} + + +pkk_debug_list = [ + "ahas", + "ahavakkaine", + "ahavakala", + "ahavakoittuo", + "ahvaliha", +] + + ### ### Misc. helper functions, etc ### @@ -40,6 +56,9 @@ else: sys.stdout.write(smsg) +def pkk_printi(indent, smsg): + pkk_print((" " * indent) + smsg) + ## Fatal error handler def pkk_fatal(smsg): @@ -54,44 +73,83 @@ sys.exit(1) +def pkk_get_text(lnode): + stmp = "" + for pnode in lnode.iter(): + if isinstance(pnode.text, str): + if isinstance(pnode.tag, str) and pnode.tag in pkk_str_fmap: + stmp += pkk_str_fmap[pnode.tag][0] + pnode.text + pkk_str_fmap[pnode.tag][1] + else: + stmp += pnode.text + + if isinstance(pnode.tail, str): + stmp += pnode.tail + + return stmp.strip() + + ## -def pkk_dump_recursive(lnode, indent): - if lnode.tag == "Example": - stmp = "".join(lnode.itertext()).strip() - print("{}{} \"{}\"".format(" " * indent, lnode.tag, stmp)) +def pkk_dump_recursive(indent, lnode): + if lnode.tag in ["Example"]: + stmp = pkk_get_text(lnode) + pkk_printi(indent, "{} \"{}\"".format(lnode.tag, stmp)) else: - stmp = "" - if lnode.text != None: - tmp = str(lnode.text).strip() - if tmp != "": - stmp = " \""+ tmp +"\"" + if isinstance(lnode.text, str): + stmp = lnode.text.strip() + if stmp != "": + stmp = " \""+ stmp +"\"" + else: + stmp = "" if len(lnode.attrib) > 0: atmp = " "+str(lnode.attrib) else: atmp = "" - pkk_print("{}{}{}{}\n".format(" " * indent, lnode.tag, atmp, stmp)) + pkk_printi(indent, "{}{}{}\n".format(lnode.tag, atmp, stmp)) for qnode in lnode.findall("./*"): - pkk_dump_recursive(qnode, indent + 1) + pkk_dump_recursive(indent + 1, qnode) ## -def pkk_output_node(dnode): - wlist = [] - dlist = [] - for wnode in dnode.findall("./HeadwordCtn"): - for qnode in wnode.findall("./SearchForm"): - wlist.append(str(qnode.text).strip()) +def pkk_output_one(indent, dnode, dsub): + for qnode in dnode.findall(dsub): + pkk_printi(indent, "{}\n".format(pkk_get_text(qnode))) + +def pkk_output_subs(indent, dnode, dsub, dname): + for qnode in dnode.findall(dsub): + pkk_printi(indent, "{} \"{}\"\n".format(dname, pkk_get_text(qnode))) + + +def pkk_output_sense(indent, dnode): + pkk_output_subs(indent, dnode, "./SearchForm", "srch") + pkk_output_subs(indent, dnode, "./Definition", "defn") + + for wnode in dnode.findall("./ExampleBlock/ExampleCtn"): + sstr = pkk_get_text(wnode.find("./Example")) + ltmp = [] + for qnode in wnode.findall("./FreeTopic[@type='levikki']/GeographicalUsage"): + ltmp.append("{} [{}]".format(pkk_get_text(qnode), qnode.attrib["class"])) - for qnode in wnode.findall("./Definition"): - dlist.append(str(qnode.text).strip()) + if len(ltmp) > 0: + lstr = " ({})".format(", ".join(ltmp)) + else: + lstr = "" + + pkk_printi(indent + 1, "{} \"{}\"{}\n".format("exmp", sstr, lstr)) + + +def pkk_output_node(indent, dnode): - for wnode in dnode.findall("./SenseGrp"): - for qnode in wnode.findall("./Definition"): - dlist.append(str(qnode.text).strip()) + for wnode in dnode.findall("./HeadwordCtn"): + pkk_output_one (indent, wnode, "./Headword") + pkk_output_sense(indent + 1, wnode) - pkk_print("{} : {}\n".format(", ".join(wlist), " ; ".join(dlist))) + index = 1 + for wnode in dnode.findall("./SenseGrp"): + pkk_printi(indent + 2, "sense #{}\n".format(index)) + pkk_output_sense(indent + 2, wnode) + index += 1 ### @@ -124,6 +182,8 @@ pkk_cfg["dump"] = True elif arg == "normalize" or arg == "n": pkk_cfg["normalize"] = True + elif arg == "p": + pkk_cfg["debug"] = True else: pkk_fatal(u"Invalid option argument '{0}'.".format(oarg)) @@ -164,11 +224,16 @@ try: xroot = uxml.getroot() for dnode in xroot.findall("./DictionaryEntry"): + + if pkk_cfg["debug"] and dnode.attrib["identifier"] not in pkk_debug_list: + continue + if pkk_cfg["dump"]: - pkk_dump_recursive(dnode, 0) - print("\n\n") + pkk_dump_recursive(0, dnode) else: - pkk_output_node(dnode) + pkk_output_node(0, dnode) + + print("\n") except (BrokenPipeError, IOError) as e: sys.stderr.close()