view lxmldump.py @ 53:833606b39e35

Use "" quotation for searchwords also.
author Matti Hamalainen <ccr@tnsp.org>
date Thu, 27 May 2021 14:06:46 +0300
parents 95671cdda422
children 884770576e74
line wrap: on
line source

#!/usr/bin/python3 -B
# coding=utf-8
###
### lxmldump - Convert and dump ISO/FDIS 1951 XML file data
### Programmed and designed by Matti 'ccr' Hämäläinen <ccr@tnsp.org>
### (C) Copyright 2021 Tecnic Software productions (TNSP)
###
### Released / distributed under 3-clause BSD license
### (see file "COPYING" for more information)
###
### Python 3.8+ required! (argparse "extend" action)
###
import sys
import signal
import re
from pathlib import Path
import xml.etree.ElementTree as xmlET
import unicodedata
import argparse
import textwrap

# The argparse "extend" action used for the filename arguments only exists
# since Python 3.8, so older interpreters cannot run this script.  An explicit
# check is used because assert statements are stripped under "python -O".
if sys.version_info < (3, 8):
    sys.exit("ERROR: Python 3.8 or newer is required.")


###
### Default settings
###
# Operation modes
PKK_MODE_NORMAL     = 0
PKK_MODE_DUMP       = 1
PKK_MODE_XML        = 2
PKK_MODE_ANKI       = 3


# Mode ID -> mode name as accepted by the -m/--mode option
pkk_modes_list = {
    PKK_MODE_NORMAL: "normal",
    PKK_MODE_DUMP: "dump",
    PKK_MODE_XML: "xml",
    PKK_MODE_ANKI: "anki",
}


# Default format strings, keyed first by setting ID and then by output mode.
# When the active mode has no entry of its own for an ID, the PKK_MODE_NORMAL
# entry is used instead (see pkk_test_value()).
pkk_mode_defaults = {
    # Default Ptr URL format strings
    "ptr_fmt": {
        PKK_MODE_NORMAL: "<PTR:{href}>{text}</PTR>",
        PKK_MODE_ANKI: "<a href='https://kaino.kotus.fi/cgi-bin/kks/karjala.cgi?a={href}'>{text}</a>",
    },

    "word_fmt": {
        PKK_MODE_NORMAL: "\"{word}\"{search}{attr}\n{hyphenation}{main_sense}{other_senses}\n",
        PKK_MODE_ANKI: "\"{word}\"{search}{attr}{hyphenation};{main_sense};{other_senses}\n",
    },
    "word_attr_list": {
        # NOTE: both entries used to be keyed PKK_MODE_NORMAL, which made the
        # first one dead; the variant without trailing space belongs to the
        # Anki mode (compare "word_attr_list_empty" below).
        PKK_MODE_NORMAL: " ({alist}) ",
        PKK_MODE_ANKI: " ({alist})",
    },
    "word_attr_list_empty": {
        PKK_MODE_NORMAL: " ",
        PKK_MODE_ANKI: "",
    },
    "word_attr_list_item": {
        PKK_MODE_NORMAL: "{text}",
    },
    "word_attr_list_sep": {
        PKK_MODE_NORMAL: " ; ",
        PKK_MODE_ANKI: " : ",
    },

    "search_list": {
        PKK_MODE_NORMAL: ", {alist}",
    },
    "search_list_empty": {
        PKK_MODE_NORMAL: "",
    },
    "search_list_item": {
        PKK_MODE_NORMAL: "\"{text}\"",
    },
    "search_list_sep": {
        PKK_MODE_NORMAL: ", ",
    },

    "hyphenation": {
        PKK_MODE_NORMAL: "{indent}hyph \"{text}\"\n",
        PKK_MODE_ANKI: " [hyph: {text}]",
    },
    "no_hyphenation": {
        PKK_MODE_NORMAL: "",
    },

    "sense_index": {
        PKK_MODE_NORMAL: "{indent}sense #{index}\n",
        PKK_MODE_ANKI: "#{index}: ",
    },

    "definition_fmt": {
        PKK_MODE_NORMAL: "{indent}defn \"{text}\"\n",
        PKK_MODE_ANKI: " * \"{text}\"",
    },

    "example_fmt": {
        PKK_MODE_NORMAL: "{indent}exmp \"{text}\"{geostr}\n",
        PKK_MODE_ANKI: " ⚫ \"{text}\"{geostr}",
    },
    "example_geo_list": {
        PKK_MODE_NORMAL: " ({alist})",
    },
    "example_geo_list_empty": {
        PKK_MODE_NORMAL: "",
    },
    "example_geo_list_item": {
        PKK_MODE_NORMAL: "{text} [{tclass}]",
    },
    "example_geo_list_sep": {
        PKK_MODE_NORMAL: ", ",
    },
}


# Element annotation mappings: element tag -> output mode -> [prefix, suffix]
pkk_element_annotation_map = {
    "Fragment": {
        mode: ["<", ">"] for mode in (PKK_MODE_NORMAL, PKK_MODE_ANKI)
    },
}


# Entry identifiers in kks1/ that are useful for debugging, option -p
pkk_debug_list = [
    "ahas", "ahavakkaine", "ahavakala", "ahavakoittuo", "ahvaliha",
    "aloilleh", "hanjahtoakseh", "akkalisto", "alto-", "allot-",
]


# Format strings overridden on the command line (filled in by pkk_set_value,
# consulted by pkk_get_value before the per-mode defaults)
pkk_settings = dict()


###
### Misc. helper functions, etc
###

## Print string to stdout using normalized Unicode if enabled
def pkk_print(smsg):
    """Write smsg to stdout, NFC-normalized when pkk_cfg.normalize is set.

    An OSError raised while writing (for example a broken pipe) is not
    propagated; stderr is closed instead.
    """
    try:
        text = unicodedata.normalize("NFC", smsg) if pkk_cfg.normalize else smsg
        sys.stdout.write(text)
    except OSError:
        # IOError is an alias of OSError and BrokenPipeError a subclass of it
        sys.stderr.close()


## Get indentation string
def pkk_geti(indent):
    """Return the whitespace for `indent` levels of pkk_cfg.indent spaces each."""
    one_level = " " * pkk_cfg.indent
    return one_level * indent


## Print string with indentation
def pkk_printi(indent, smsg):
    """Print smsg via pkk_print, prefixed with `indent` levels of indentation."""
    prefix = pkk_geti(indent)
    pkk_print("".join((prefix, smsg)))


## Fatal error handler
def pkk_fatal(smsg):
    """Print smsg prefixed with "ERROR: " and terminate with exit status 1."""
    message = "ERROR: " + smsg
    print(message)
    raise SystemExit(1)


## Handler for SIGINT signals
def pkk_signal_handler(signal, frame):
    """Signal handler: announce the interruption and exit with status 1.

    Both arguments are supplied by the signal machinery and are not used.
    """
    farewell = "\nQuitting due to SIGINT / Ctrl+C!"
    print(farewell)
    raise SystemExit(1)


class pkk_set_mode(argparse.Action):
    """argparse action that translates a mode name into its mode ID.

    The name is matched case-insensitively, ignoring surrounding whitespace,
    against the values of pkk_modes_list; an unknown name is a fatal error.
    """

    def __call__(self, parser, namespace, value, option_string=None):
        lvalue = value.strip().lower()
        matching = [mode for mode, name in pkk_modes_list.items() if name == lvalue]
        if matching:
            # Names are distinct, so the first hit is the only one
            setattr(namespace, self.dest, matching[0])
            return

        pkk_fatal(f"Invalid output mode '{lvalue}'.")


## Value handling
class pkk_set_value(argparse.Action):
    """argparse action for "ID=STR" arguments that override a format string.

    The ID is lower-cased and may be written with either "-" or "_" (for
    example "word-fmt" or "word_fmt"). It must name an entry of
    pkk_mode_defaults; the value is then stored in pkk_settings. Malformed
    arguments and unknown IDs are fatal errors.
    """

    # ID, "=", value.  The ID part has to accept "-" as well, otherwise the
    # "-" -> "_" normalization below could never apply.
    rexpr = re.compile(r'\s*([\w-]+)\s*=\s*(.*)\s*')

    def __call__(self, parser, namespace, values, option_string=None):
        rmatch = self.rexpr.match(values)
        if rmatch:
            rid = rmatch.group(1).lower().replace("-", "_")
            rval = rmatch.group(2)
            if rid in pkk_mode_defaults:
                pkk_settings[rid] = rval
            else:
                pkk_fatal(f"Invalid option '{option_string} {values}': No such ID '{rid}'.")
        else:
            pkk_fatal(f"Invalid option '{option_string} {values}': Expected id=value.")


## Get mode if it exists
def pkk_test_value(mid):
    """Return the mode whose default applies to setting ID `mid`.

    That is the current mode (pkk_cfg.mode) when the ID has an entry for it,
    otherwise PKK_MODE_NORMAL. None is returned when the ID is unknown or has
    no usable entry.
    """
    if mid not in pkk_mode_defaults:
        return None

    per_mode = pkk_mode_defaults[mid]
    for candidate in (pkk_cfg.mode, PKK_MODE_NORMAL):
        if candidate in per_mode:
            return candidate

    return None


## Get default value per mode
def pkk_get_value(mid):
    """Return the value of setting ID `mid`.

    A value stored in pkk_settings takes precedence; otherwise the default
    for the mode chosen by pkk_test_value() is returned. An ID without any
    usable default is a fatal internal error.
    """
    override = pkk_settings.get(mid)
    if override is not None:
        return override

    mmode = pkk_test_value(mid)
    if mmode is None:
        pkk_fatal(f"Internal error: No mode for ID '{mid}'.")

    defaults = pkk_mode_defaults[mid]
    return defaults[mmode]


def pkk_get_fmt(mid):
    """Return setting `mid` with each backslash + "n" sequence turned into a newline."""
    raw = pkk_get_value(mid)
    return raw.replace("\\n", "\n")


## Annotate given string with prefix and suffix based on tag
def pkk_str_annotate(mtag, mstr):
    """Wrap mstr in the prefix/suffix configured for element tag `mtag`.

    mstr is returned unchanged unless annotation is enabled
    (pkk_cfg.annotate) and the tag has an entry in
    pkk_element_annotation_map. The PKK_MODE_NORMAL pair is used when the
    current mode has none of its own.
    """
    if not (pkk_cfg.annotate and mtag in pkk_element_annotation_map):
        return mstr

    per_mode = pkk_element_annotation_map[mtag]
    if pkk_cfg.mode in per_mode:
        marks = per_mode[pkk_cfg.mode]
    else:
        marks = per_mode[PKK_MODE_NORMAL]

    return marks[0] + mstr + marks[1]


## Clean string by removing tabs and newlines
def pkk_str_clean(mstr):
    """Return mstr with all newline, carriage return and tab characters removed."""
    unwanted = re.compile(r'[\n\r\t]')
    return unwanted.sub('', mstr)


## Format a "Ptr" node as text
def pkk_ptr_to_text(pnode):
    """Format a "Ptr" node with the "ptr_fmt" format string.

    {text} is the stripped concatenation of all text inside the node and
    {href} its xlink href attribute (a KeyError is raised when it is missing).
    """
    pfmt = pkk_get_fmt("ptr_fmt")
    label = "".join(pnode.itertext()).strip()
    target = pnode.attrib["{http://www.w3.org/TR/xlink}href"]
    return pfmt.format(text=label, href=target)


## Get text inside a given node
def pkk_node_to_text(lnode):
    """Return the text content of lnode and its descendants as one string.

    The pieces are emitted in document order: a node's own text, then each
    child followed by the text trailing that child. "Ptr" nodes are
    rendered through pkk_ptr_to_text() and not descended into (that function
    already includes their inner text). Every other node's own text is
    cleaned with pkk_str_clean() and passed through pkk_str_annotate().
    Text trailing lnode itself belongs to its parent and is not included.
    The result is stripped of leading and trailing whitespace.
    """
    parts = []

    def collect(node):
        if node.tag == "Ptr":
            parts.append(pkk_ptr_to_text(node))
            return

        if isinstance(node.text, str):
            parts.append(pkk_str_annotate(node.tag, pkk_str_clean(node.text)))

        for child in node:
            collect(child)
            # The tail is the text between this child's end tag and the next
            # sibling (or the parent's end tag); it must follow the child's
            # own content, also when the child is a "Ptr".
            if isinstance(child.tail, str):
                parts.append(pkk_str_clean(child.tail))

    collect(lnode)
    return "".join(parts).strip()


## Simple recursive dump starting at given node
def pkk_dump_recursive(indent, lnode):
    """Print lnode and its subtree, one indented line per node.

    An "Example" node is printed as its tag plus its flattened text and is not
    descended into. Any other node is printed as tag, attributes (if any)
    and own text (if non-empty), followed by its children one level deeper.
    """
    if lnode.tag == "Example":
        flattened = pkk_node_to_text(lnode)
        pkk_printi(indent, f"{lnode.tag} \"{flattened}\"\n")
        return

    # Own text, quoted, unless it is missing or blank after cleaning
    textstr = ""
    if isinstance(lnode.text, str):
        cleaned = pkk_str_clean(lnode.text).strip()
        if cleaned:
            textstr = " \"" + cleaned + "\""

    attrstr = " " + str(lnode.attrib) if lnode.attrib else ""

    pkk_printi(indent, f"{lnode.tag}{attrstr}{textstr}\n")

    deeper = indent + 1
    for child in lnode.findall("./*"):
        pkk_dump_recursive(deeper, child)


## Output item(s) under given node with given format string
def pkk_get_subs(indent, dnode, dsub, dfmtname):
    """Format every node matching path `dsub` under dnode and concatenate the results.

    Each match is rendered with the format string named `dfmtname`, which
    receives the node's text as {text} and the indentation string for level
    `indent` as {indent}.
    """
    item_fmt = pkk_get_fmt(dfmtname)
    return "".join(
        item_fmt.format(text=pkk_node_to_text(match), indent=pkk_geti(indent))
        for match in dnode.findall(dsub))


def pkk_get_list_str(dlist, dprefix, dfilter):
    """Render the strings in dlist using the "<dprefix>_list*" format strings.

    An empty list yields "<dprefix>_list_empty". Otherwise the items (each
    first formatted with "<dprefix>_list_item" when dfilter is true) are
    joined with "<dprefix>_list_sep" and inserted into "<dprefix>_list" as
    {alist}.
    """
    if len(dlist) == 0:
        return pkk_get_fmt(dprefix + "_list_empty")

    items = dlist
    if dfilter:
        item_fmt = pkk_get_fmt(dprefix + "_list_item")
        items = [item_fmt.format(text=entry) for entry in dlist]

    list_fmt = pkk_get_fmt(dprefix + "_list")
    separator = pkk_get_fmt(dprefix + "_list_sep")
    return list_fmt.format(alist=separator.join(items))


## Output a main "Headword" or "Sense" node
def pkk_get_sense(indent, dnode):
    """Return the formatted definitions and examples found directly under dnode.

    Definitions come first ("definition_fmt" at level `indent`), followed by
    one "example_fmt" item per "ExampleBlock/ExampleCtn" at level indent + 1,
    each with its list of geographical usage notes.
    """
    pieces = [pkk_get_subs(indent, dnode, "./Definition", "definition_fmt")]

    for example_ctn in dnode.findall("./ExampleBlock/ExampleCtn"):
        # Geographical usage notes attached to this example
        geo_nodes = example_ctn.findall("./FreeTopic[@type='levikki']/GeographicalUsage")
        geolist = [
            pkk_get_fmt("example_geo_list_item").format(
                text=pkk_node_to_text(geo),
                tclass=geo.attrib["class"])
            for geo in geo_nodes
        ]

        example_fmt = pkk_get_fmt("example_fmt")
        pieces.append(example_fmt.format(
            text=pkk_node_to_text(example_ctn.find("./Example")),
            geostr=pkk_get_list_str(geolist, "example_geo", False),
            indent=pkk_geti(indent + 1)))

    return "".join(pieces)


## Output one "DictionaryEntry" node
def pkk_output_node(indent, dnode):
    """Print one "DictionaryEntry" node using the "word_fmt" format string.

    One block of output is produced for each "HeadwordCtn" child of dnode. It
    consists of the headword, its search forms and grammatical attributes
    (both de-duplicated and sorted), the hyphenation note, the senses found
    directly under the HeadwordCtn and the numbered "SenseGrp" senses of the
    entry.
    """
    for wnode in dnode.findall("./HeadwordCtn"):
        # Get head word
        headword = pkk_node_to_text(wnode.find("./Headword"))

        # Collect the distinct search forms.  The headword is dropped only
        # after de-duplication, so it cannot survive when the same form is
        # listed more than once.
        srchset = {pkk_node_to_text(qnode) for qnode in wnode.findall("./SearchForm")}
        srchset.discard(headword)
        srchlist = sorted(srchset)

        # Get hyphenation note, if any
        hnode = wnode.find("./Hyphenation")
        if hnode is not None:
            hyphenation = pkk_get_fmt("hyphenation").format(
                text=pkk_node_to_text(hnode),
                indent=pkk_geti(indent + 1))
        else:
            hyphenation = pkk_get_fmt("no_hyphenation").format(
                indent=pkk_geti(indent + 1))

        # Distinct grammatical attributes (noun, verb, etc.), sorted
        attrset = {
            pnode.attrib["freeValue"]
            for pnode in wnode.findall("./PartOfSpeechCtn/PartOfSpeech")
        }
        attrset.update(
            pkk_node_to_text(pnode)
            for pnode in wnode.findall("./GrammaticalNote"))
        attrlist = sorted(attrset)

        # Get main "sense"
        msense = pkk_get_sense(indent + 1, wnode)

        # Collect any other "senses", numbered from 1
        sense_parts = []
        for index, znode in enumerate(dnode.findall("./SenseGrp"), start=1):
            sense_parts.append(pkk_get_fmt("sense_index").format(
                index=index,
                indent=pkk_geti(indent + 1)))
            sense_parts.append(pkk_get_sense(indent + 2, znode))
        osenses = "".join(sense_parts)

        # Print the headword and attributes if any
        pkk_print(pkk_get_fmt("word_fmt").format(
            word=headword,
            attr=pkk_get_list_str(attrlist, "word_attr", True),
            search=pkk_get_list_str(srchlist, "search", True),
            hyphenation=hyphenation,
            main_sense=msense,
            other_senses=osenses,
            indent=pkk_geti(indent)))


###
### Main program starts
###
signal.signal(signal.SIGINT, pkk_signal_handler)

# The built-in help is disabled; -h is handled by this script so that the
# mode and format string listings can be printed after the usual help text
optparser = argparse.ArgumentParser(
    add_help=False,
    usage="%(prog)s [options] <input xml file(s)>",
    description="lxmldump - Convert and dump ISO/FDIS 1951 XML file data",
)

optparser.add_argument(
    "filenames",
    metavar="filename",
    nargs="*",
    action="extend",
    type=str,
    help="XML filename(s)",
)

optparser.add_argument(
    "-h", "--help",
    action="store_true",
    dest="show_help",
    help="show this help message",
)

optparser.add_argument(
    "-m", "--mode",
    action=pkk_set_mode,
    dest="mode",
    default=PKK_MODE_NORMAL,
    help="set output mode (see below)",
)

optparser.add_argument(
    "-s", "--set",
    action=pkk_set_value,
    metavar="ID=STR",
    help="set format string (see below)",
)

optparser.add_argument(
    "-n", "--normalize",
    action="store_true",
    dest="normalize",
    help="output NFC normalized Unicode",
)

optparser.add_argument(
    "-a", "--annotate",
    action="store_true",
    dest="annotate",
    help="annotate strings",
)

optparser.add_argument(
    "-i", "--indent",
    dest="indent",
    metavar="N",
    type=int,
    choices=range(0, 32),
    default=4,
    help="set indentation level (default: %(default)s)",
)

# Hidden option: restrict output to the entries in pkk_debug_list
optparser.add_argument(
    "-p", "--debug",
    action="store_true",
    dest="debug",
    help=argparse.SUPPRESS,
)


### Parse the command line; the result doubles as the global configuration
pkk_cfg = optparser.parse_args()


### Show help if requested, or if no input files were given
if pkk_cfg.show_help or not pkk_cfg.filenames:
    optparser.print_help()

    mode_names = ", ".join(pkk_modes_list.values())
    print("\nAvailable output modes:")
    print("  " + mode_names)

    # Format strings are listed for the normal mode unless Anki mode was chosen
    if pkk_cfg.mode != PKK_MODE_NORMAL and pkk_cfg.mode != PKK_MODE_ANKI:
        pkk_cfg.mode = PKK_MODE_NORMAL

    current_mode_name = pkk_modes_list[pkk_cfg.mode]
    print(u"\nAvailable format strings and values (mode '{}'):".format(
        current_mode_name))

    for mid in pkk_mode_defaults:
        # Show backslashes and newlines in their escaped form
        stmp = pkk_get_value(mid)
        stmp = stmp.replace("\\", "\\\\")
        stmp = stmp.replace("\n", "\\n")
        print(u"  {:22s} : '{}'".format(mid, stmp))

    raise SystemExit(0)


### Handle each input file
for filename in pkk_cfg.filenames:
    # Parse XML file into element tree
    try:
        uxml = xmlET.parse(filename)
    except Exception as e:
        pkk_fatal(u"SVG/XML parsing failed: {0}".format(str(e)))

    # Dump output
    try:
        xroot = uxml.getroot()
        for dnode in xroot.findall("./DictionaryEntry"):

            # In debug mode (-p) only the entries listed in pkk_debug_list
            # are shown; an entry without an "identifier" attribute is
            # skipped instead of aborting with a KeyError.
            if pkk_cfg.debug and dnode.attrib.get("identifier") not in pkk_debug_list:
                continue

            if pkk_cfg.mode in [PKK_MODE_NORMAL, PKK_MODE_ANKI]:
                try:
                    pkk_output_node(0, dnode)
                except Exception as e:
                    # Show the offending entry and the error, then stop.
                    # This is a failure, so report it through a non-zero
                    # exit status.
                    print("")
                    pkk_dump_recursive(0, dnode)
                    print(str(e))
                    sys.exit(1)
            elif pkk_cfg.mode == PKK_MODE_DUMP:
                pkk_dump_recursive(0, dnode)
                print("")
            elif pkk_cfg.mode == PKK_MODE_XML:
                pkk_print(xmlET.tostring(dnode, encoding="utf8").decode("utf8") + "\n\n")
            else:
                pkk_fatal("Invalid operation mode?")

    except (BrokenPipeError, IOError) as e:
        # Output pipe went away: close stderr and quit
        sys.stderr.close()
        sys.exit(1)

sys.exit(0)