# HG changeset patch # User Matti Hamalainen # Date 1621973944 -10800 # Node ID 4c8aafff8c5f954663519588a5361c73aa026123 # Parent 5aafa87dbec25772616a70d56ce29acdcd5eecae Refactor output handling to be (mostly) configurable. diff -r 5aafa87dbec2 -r 4c8aafff8c5f lxmldump.py --- a/lxmldump.py Tue May 25 13:28:24 2021 +0300 +++ b/lxmldump.py Tue May 25 23:19:04 2021 +0300 @@ -1,7 +1,7 @@ #!/usr/bin/python3 -B # coding=utf-8 ### -### lxmldump - Dump ISO/FDIS 1951 XML file data +### lxmldump - Convert and dump ISO/FDIS 1951 XML file data ### Programmed and designed by Matti 'ccr' Hämäläinen ### (C) Copyright 2021 Tecnic Software productions (TNSP) ### @@ -39,10 +39,56 @@ } -# Default Ptr URL format strings -pkk_ptr_url_fmt = { - PKK_MODE_NORMAL: u"{text}", - PKK_MODE_ANKI: u"{text}", +pkk_mode_defaults = { + # Default Ptr URL format strings + "ptr_url_fmt": { + PKK_MODE_NORMAL: u"{text}", + PKK_MODE_ANKI: u"{text}", + }, + + "word_fmt": { + PKK_MODE_NORMAL: "\"{text}\"", + }, + "word_attr_sep": { + PKK_MODE_NORMAL: " ; ", + PKK_MODE_ANKI: ":", + }, + "word_attr_fmt": { + PKK_MODE_NORMAL: " ({alist})", + }, + + "word_eol": { + PKK_MODE_NORMAL: "\n", + }, + + "sense_index": { + PKK_MODE_NORMAL: "sense #{index}\n", + }, + + "search_fmt": { + PKK_MODE_NORMAL: "srch \"{text}\"\n", + }, + + "definition_fmt": { + PKK_MODE_NORMAL: "defn \"{text}\"\n", + }, + + "example_fmt": { + PKK_MODE_NORMAL: "exmp \"{text}\"{geostr}\n", + }, + "example_geo_list": { + PKK_MODE_NORMAL: " ({glist})", + }, + "example_geo_empty": { + PKK_MODE_NORMAL: "", + }, + "example_geo_sep": { + PKK_MODE_NORMAL: ", ", + }, + + "word_end": { + PKK_MODE_NORMAL: "\n", + }, } @@ -69,6 +115,8 @@ ] +pkk_settings = {} + ### ### Misc. helper functions, etc @@ -107,6 +155,56 @@ sys.exit(1) +## Value handling +class pkk_set_value(argparse.Action): + + rexpr = re.compile(r'\s*(\w+)\s*=\s*(.*)\s*') + + def __call__(self, parser, namespace, values, option_string=None): + rmatch = re.match(self.rexpr, values) + if rmatch: + rid = rmatch.group(1).lower().replace("-", "_") + rval = rmatch.group(2) + if rid in pkk_mode_defaults: + pkk_settings[rid] = rval + else: + pkk_fatal(f"Invalid option '{option_string} {values}': No such ID '{rid}'.") + else: + pkk_fatal(f"Invalid option '{option_string} {values}': Expected id=value.") + + +## Get mode if it exists +def pkk_test_value(mid): + if mid in pkk_mode_defaults: + if pkk_cfg.mode in pkk_mode_defaults[mid]: + mmode = pkk_cfg.mode + else: + mmode = PKK_MODE_NORMAL + + if mmode in pkk_mode_defaults[mid]: + return mmode + else: + return None + else: + return None + + +## Get default value per mode +def pkk_get_value(mid): + if mid in pkk_settings and pkk_settings[mid] != None: + return pkk_settings[mid] + + mmode = pkk_test_value(mid) + if mmode == None: + pkk_fatal(f"Internal error: No mode for ID '{mid}'.") + + return pkk_mode_defaults[mid][mmode] + + +def pkk_get_fmt(mid): + return pkk_get_value(mid).replace("\\n", "\n") + + ## Annotate given string with prefix and suffix based on tag def pkk_str_annotate(mtag, mstr): if pkk_cfg.annotate and mtag in pkk_element_annotation_map: @@ -127,16 +225,7 @@ ## Format a "Ptr" node as text def pkk_ptr_to_text(pnode): - # If custom format set, use it - if pkk_cfg.ptr_url_fmt != None: - pfmt = pkk_cfg.ptr_url_fmt - elif pkk_cfg.mode in pkk_ptr_url_fmt: - # Else try mode-specific - pfmt = pkk_ptr_url_fmt[pkk_cfg.mode] - else: - # Last resort is normal mode format - pfmt = pkk_ptr_url_fmt[PKK_MODE_NORMAL] - + pfmt = pkk_get_fmt("ptr_url_fmt") return pfmt.format( text=("".join(pnode.itertext())).strip(), href=pnode.attrib["{http://www.w3.org/TR/xlink}href"]) @@ -182,34 +271,31 @@ ## Output item(s) under given node with given format string -def pkk_output_subs_fmt(indent, dnode, dsub, dname, dfmt): +def pkk_output_subs(indent, dnode, dsub, dfmtname): + dfmt = pkk_get_fmt(dfmtname) for qnode in dnode.findall(dsub): - pkk_printi(indent, dfmt.format(nname=dname, ntext=pkk_node_to_text(qnode))) - -## Output item(s) under given node with a prefixed name string -def pkk_output_subs_prefix(indent, dnode, dsub, dname): - pkk_output_subs_fmt(indent, dnode, dsub, dname, "{nname} \"{ntext}\"\n") + pkk_printi(indent, dfmt.format(text=pkk_node_to_text(qnode))) ## Output a main "Headword" or "Sense" node under it def pkk_output_sense(indent, dnode): # Search form and definition - pkk_output_subs_prefix(indent, dnode, "./SearchForm", "srch") - pkk_output_subs_prefix(indent, dnode, "./Definition", "defn") + pkk_output_subs(indent, dnode, "./SearchForm", "search_fmt") + pkk_output_subs(indent, dnode, "./Definition", "definition_fmt") # Examples for wnode in dnode.findall("./ExampleBlock/ExampleCtn"): - sstr = pkk_node_to_text(wnode.find("./Example")) - lstr = "" - - ltmp = [] + geolist = [] for qnode in wnode.findall("./FreeTopic[@type='levikki']/GeographicalUsage"): - ltmp.append("{} [{}]".format(pkk_node_to_text(qnode), qnode.attrib["class"])) + geolist.append("{} [{}]".format(pkk_node_to_text(qnode), qnode.attrib["class"])) - if len(ltmp) > 0: - lstr = " ({})".format(", ".join(ltmp)) + if len(geolist) > 0: + geostr = pkk_get_fmt("example_geo_list").format(glist=pkk_get_fmt("example_geo_sep").join(geolist)) + else: + geostr = pkk_get_fmt("example_geo_empty") - pkk_printi(indent + 1, "{} \"{}\"{}\n".format("exmp", sstr, lstr)) + pkk_printi(indent + 1, pkk_get_fmt("example_fmt").format( + text=pkk_node_to_text(wnode.find("./Example")), geostr=geostr)) ## Output one "DictionaryEntry" node @@ -229,12 +315,13 @@ tmpl.sort(reverse=False, key=lambda attr: (attr, len(attr))) # Print the headword and attributes if any - pkk_output_subs_fmt(indent, wnode, "./Headword", "", "\"{ntext}\"") + pkk_output_subs(indent, wnode, "./Headword", "word_fmt") if len(tmpl) > 0: - pkk_print(" ({nlist})".format(nlist=pkk_cfg.word_attr_sep.join(tmpl))) + pkk_print(pkk_get_fmt("word_attr_fmt").format( + alist=pkk_get_fmt("word_attr_sep").join(tmpl))) - pkk_print("\n") + pkk_print(pkk_get_fmt("word_eol")) # Print main "sense" pkk_output_sense(indent + 1, wnode) @@ -242,10 +329,12 @@ # Print any other "senses" index = 1 for wnode in dnode.findall("./SenseGrp"): - pkk_printi(indent + 1, f"sense #{index}\n") + pkk_printi(indent + 1, pkk_get_fmt("sense_index").format(index=index)) pkk_output_sense(indent + 2, wnode) index += 1 + pkk_print(pkk_get_fmt("word_end")) + ### ### Main program starts @@ -253,7 +342,7 @@ signal.signal(signal.SIGINT, pkk_signal_handler) optparser = argparse.ArgumentParser( - description="lxmldump - Dump ISO/FDIS 1951 XML file data", + description="lxmldump - Convert and dump ISO/FDIS 1951 XML file data", usage="%(prog)s [options] ", add_help=False ) @@ -284,18 +373,10 @@ help=argparse.SUPPRESS) # help="output Anki compatible") -optparser.add_argument("--ptr-url-fmt", - dest="ptr_url_fmt", - type=str, - default=None, - metavar="str", - help='Ptr URL format string (see below)') - -optparser.add_argument("--attr-sep", - dest="word_attr_sep", - type=str, default=" ; ", - metavar="str", - help='word attribute separator (default: \"%(default)s\")') +optparser.add_argument("-s", "--set", + action=pkk_set_value, + metavar="id=val", + help='set value (see below)') optparser.add_argument("-n", "--normalize", dest="normalize", @@ -311,7 +392,7 @@ dest="indent", type=int, choices=range(0, 32), default=4, metavar="n", - help='indent output by characters (default: %(default)s)') + help='set indentation level (default: %(default)s)') optparser.add_argument("-p", "--debug", dest="debug", @@ -319,15 +400,18 @@ help=argparse.SUPPRESS) +### Parse arguments +pkk_cfg = optparser.parse_args() + + ### Show help if needed -pkk_cfg = optparser.parse_args() if len(pkk_cfg.filenames) == 0 or pkk_cfg.show_help: optparser.print_help() - print(u"\nDefault Ptr format strings per mode:") - for pmode in pkk_modes_list: - if pmode in pkk_ptr_url_fmt: - print(u" {:6s} : \"{}\"".format(pkk_modes_list[pmode], pkk_ptr_url_fmt[pmode])) +# print(u"\nDefault Ptr format strings per mode:") +# for pmode in pkk_modes_list: +# if pmode in pkk_ptr_url_fmt: +# print(u" {:6s} : \"{}\"".format(pkk_modes_list[pmode], pkk_ptr_url_fmt[pmode])) print(u"") sys.exit(0) @@ -353,18 +437,18 @@ try: pkk_output_node(0, dnode) except Exception as e: + print("") pkk_dump_recursive(0, dnode) print(str(e)) sys.exit(0) elif pkk_cfg.mode == PKK_MODE_DUMP: pkk_dump_recursive(0, dnode) + print("") elif pkk_cfg.mode == PKK_MODE_XML: - pkk_print(str(xmlET.tostring(dnode, encoding="utf8")) + "\n") + pkk_print(str(xmlET.tostring(dnode, encoding="utf8")) + "\n\n") else: pkk_fatal("Invalid operation mode?") - print("\n") - except (BrokenPipeError, IOError) as e: sys.stderr.close() sys.exit(1)