view lxmldump.py @ 13:3bd772fd6a50

Cleanups.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 11 May 2021 12:49:15 +0300
parents d50e71642be7
children 7498bda8b4a2
line wrap: on
line source

#!/usr/bin/python3 -B
# coding=utf-8
###
### lxmldump - Dump ISO/FDIS 1951 XML file data
### Programmed and designed by Matti 'ccr' Hämäläinen <ccr@tnsp.org>
### (C) Copyright 2021 Tecnic Software productions (TNSP)
###
### Python 3.7+ required!
###
import sys
import signal
import re
from pathlib import Path
import xml.etree.ElementTree as xmlET
import unicodedata

assert sys.version_info >= (3, 7)


###
### Default settings
###
# Runtime settings. These are the defaults; the command-line parser further
# down overwrites individual entries.
pkk_cfg = {
    # Output verbosity; validated after argument parsing to lie in 0..3 and
    # queried through pkk_verbosity().
    "verbosity": 3,

    # When True, pkk_get_text() wraps the text of tags listed in
    # pkk_str_fmap with that tag's prefix/suffix pair.
    "annotate": False,
    # Operation mode: 0 = formatted output (pkk_output_node),
    # 1 = recursive dump (pkk_dump_recursive), 2 = XML output.
    "mode": 0,
    # When True, pkk_print() NFC-normalizes everything it writes.
    "normalize": False,
    # NOTE(review): not read anywhere in the visible code; the XML switch
    # sets "mode" to 2 instead -- confirm whether this key is still needed.
    "xml": False,

    # When True, only entries whose identifier is in pkk_debug_list are
    # processed.
    "debug": False,
}


# Annotation markers keyed by element tag: [prefix, suffix]. pkk_get_text()
# puts them around an element's own text when pkk_cfg["annotate"] is set.
pkk_str_fmap = {
    "Fragment" : ["<", ">"],
}


# "identifier" attribute values of the DictionaryEntry elements that are
# still processed when pkk_cfg["debug"] is enabled; all others are skipped.
pkk_debug_list = [
    "ahas",
    "ahavakkaine",
    "ahavakala",
    "ahavakoittuo",
    "ahvaliha",
    "aloilleh",
]


###
### Misc. helper functions, etc
###
def pkk_cleanup():
    """Run end-of-program cleanup.

    There is currently nothing to release, so this only reports success.

    Returns:
        Always 0.
    """
    status = 0
    return status


## Print string to stdout using normalized Unicode if enabled
def pkk_print(smsg):
    """Write ``smsg`` to stdout without appending a newline.

    The text is NFC-normalized first when pkk_cfg["normalize"] is enabled.
    A broken pipe / I/O error while writing is swallowed; stderr is closed
    in that case and the function returns normally.
    """
    if pkk_cfg["normalize"]:
        text = unicodedata.normalize("NFC", smsg)
    else:
        text = smsg

    try:
        sys.stdout.write(text)
    except (BrokenPipeError, IOError):
        sys.stderr.close()


## Print string with indentation
def pkk_printi(indent, smsg):
    """Print ``smsg`` through pkk_print(), prefixed by ``indent`` levels
    of four spaces each."""
    prefix = "    " * indent
    pkk_print(prefix + smsg)


## Check value against current verbosity level
def pkk_verbosity(lvl):
    """Return True when the configured verbosity is at least ``lvl``."""
    current = pkk_cfg["verbosity"]
    return current >= lvl


## Fatal error handler
def pkk_fatal(smsg):
    """Print ``smsg`` prefixed with "ERROR: " and terminate the program.

    Raises:
        SystemExit: always, with exit status 1.
    """
    message = u"ERROR: "+ smsg
    print(message)
    raise SystemExit(1)


## Handler for SIGINT signals
def pkk_signal_handler(signal, frame):
    """SIGINT handler: run cleanup, announce the interruption and exit.

    Both arguments are supplied by the signal machinery and are not used.
    Note that the ``signal`` parameter hides the ``signal`` module inside
    this function.

    Raises:
        SystemExit: always, with exit status 1.
    """
    pkk_cleanup()
    print(u"\nQuitting due to SIGINT / Ctrl+C!")
    raise SystemExit(1)


## Clean string by removing tabs and newlines
def pkk_str_clean(mstr):
    """Return ``mstr`` with every newline, carriage return and tab removed.

    Other whitespace (e.g. plain spaces) is left untouched.
    """
    # Mapping a code point to None makes str.translate() delete it.
    drop_table = {ord(ch): None for ch in "\n\r\t"}
    return mstr.translate(drop_table)


## Format "Ptr" node as text
def pkk_ptr_to_text(pnode):
    """Render a "Ptr" element as ``PTR: <target>label</>``.

    ``target`` is the element's xlink ``href`` attribute and ``label`` is
    all text found inside the element, stripped of surrounding whitespace.

    Raises:
        KeyError: if the element has no xlink ``href`` attribute.
    """
    target = pnode.attrib["{http://www.w3.org/TR/xlink}href"]
    label = "".join(pnode.itertext())
    return "PTR: <{}>{}</>".format(target, label.strip())


## Get text inside a given node
def pkk_get_text(lnode):
    """Return the text contained in ``lnode`` as one stripped string.

    The subtree is walked in true document order: an element's own text
    comes first, then each child's content immediately followed by that
    child's tail. Tabs, newlines and carriage returns are removed from
    every fragment. "Ptr" elements are rendered with pkk_ptr_to_text().
    When pkk_cfg["annotate"] is set, the own text of tags listed in
    pkk_str_fmap is wrapped in that tag's prefix/suffix pair.

    Compared to a flat walk over ``lnode.iter()`` this
      * keeps tail text in its correct position for nested elements,
      * keeps the text that follows a "Ptr" element,
      * does not repeat the text of elements nested inside a "Ptr", and
      * ignores the tail of ``lnode`` itself, which lies outside the node.
    """
    parts = []
    _pkk_collect_text(lnode, parts)
    return "".join(parts).strip()


def _pkk_collect_text(pnode, parts):
    """Append the text fragments of ``pnode``'s subtree to ``parts``."""
    if pnode.tag == "Ptr":
        # pkk_ptr_to_text() already gathers all text below the Ptr, so
        # descending into it would emit that text a second time.
        parts.append(pkk_ptr_to_text(pnode))
        return

    if isinstance(pnode.text, str):
        ptext = pkk_str_clean(pnode.text).strip()
        if pkk_cfg["annotate"] and isinstance(pnode.tag, str) and pnode.tag in pkk_str_fmap:
            ptext = pkk_str_fmap[pnode.tag][0] + ptext + pkk_str_fmap[pnode.tag][1]
        parts.append(ptext)

    for child in pnode:
        _pkk_collect_text(child, parts)
        # A child's tail is part of *this* element's content and follows
        # everything inside the child.
        if isinstance(child.tail, str):
            parts.append(pkk_str_clean(child.tail))


## Simple recursive dump starting at given node
def pkk_dump_recursive(indent, lnode):
    """Print ``lnode`` and, recursively, its child elements.

    Each element produces one line at the given indentation level: the
    tag, the attribute dict (if non-empty) and the element's own cleaned
    text in double quotes (if non-empty). "Example" elements are printed
    as a single line holding their whole text content and are not
    descended into.
    """
    if lnode.tag == "Example":
        pkk_printi(indent, "{} \"{}\"\n".format(lnode.tag, pkk_get_text(lnode)))
        return

    # Own text, quoted, or nothing when absent/blank.
    text_part = ""
    if isinstance(lnode.text, str):
        cleaned = pkk_str_clean(lnode.text).strip()
        if cleaned != "":
            text_part = " \""+ cleaned +"\""

    # Attributes are shown using the dict's own string representation.
    attr_part = " "+ str(lnode.attrib) if len(lnode.attrib) > 0 else ""

    pkk_printi(indent, "{}{}{}\n".format(lnode.tag, attr_part, text_part))

    deeper = indent + 1
    for child in lnode.findall("./*"):
        pkk_dump_recursive(deeper, child)


## Output item under given node
def pkk_output_subs_fmt(indent, dnode, dsub, dname, dfmt):
    """Print every element of ``dnode`` matching the path ``dsub``.

    Each match is rendered as ``dfmt.format(dname, text)``, where ``text``
    is the match's text content, and printed at the given indentation.
    """
    matches = dnode.findall(dsub)
    for text in map(pkk_get_text, matches):
        pkk_printi(indent, dfmt.format(dname, text))


def pkk_output_subs_prefix(indent, dnode, dsub, dname):
    """Print every ``dsub`` match under ``dnode`` as ``dname "text"``,
    one per line."""
    line_fmt = "{0} \"{1}\"\n"
    pkk_output_subs_fmt(indent, dnode, dsub, dname, line_fmt)


def pkk_output_sense(indent, dnode):
    """Print the search forms, definitions and examples found under ``dnode``.

    Search forms ("srch") and definitions ("defn") are printed at
    ``indent``; each example ("exmp") one level deeper. At verbosity 1 or
    above, an example is followed by its geographical usage entries in
    parentheses, each shown as ``text [class]``.
    """
    for dsub, dname in (("./SearchForm", "srch"), ("./Definition", "defn")):
        pkk_output_subs_prefix(indent, dnode, dsub, dname)

    for ctn in dnode.findall("./ExampleBlock/ExampleCtn"):
        # NOTE(review): assumes every ExampleCtn has an Example child;
        # find() returns None otherwise -- confirm against the data.
        example = pkk_get_text(ctn.find("./Example"))

        usages = []
        if pkk_verbosity(1):
            usages = [
                "{} [{}]".format(pkk_get_text(geo), geo.attrib["class"])
                for geo in ctn.findall("./FreeTopic[@type='levikki']/GeographicalUsage")
            ]

        suffix = " ({})".format(", ".join(usages)) if len(usages) > 0 else ""

        pkk_printi(indent + 1, "{} \"{}\"{}\n".format("exmp", example, suffix))


def pkk_output_node(indent, dnode):
    """Print one dictionary entry in the formatted (mode 0) layout.

    For every "HeadwordCtn" child of ``dnode``: its headword(s) as
    ``"text":``, the headword container's own sense data, and then all
    "SenseGrp" children of ``dnode`` numbered from 1.
    """
    for headword_ctn in dnode.findall("./HeadwordCtn"):
        pkk_output_subs_fmt(indent, headword_ctn, "./Headword", "", "\"{1}\":\n")
        pkk_output_sense(indent + 1, headword_ctn)

        # NOTE(review): the sense groups are taken from dnode and listed
        # once per HeadwordCtn (and not at all when there is none) --
        # confirm this nesting is intended.
        for number, sense_grp in enumerate(dnode.findall("./SenseGrp"), 1):
            pkk_printi(indent + 1, "sense #{}\n".format(number))
            pkk_output_sense(indent + 2, sense_grp)


###
### Main program starts
###
# Route SIGINT (Ctrl+C) to pkk_signal_handler, which runs the cleanup,
# prints a notice and exits with status 1.
signal.signal(signal.SIGINT, pkk_signal_handler)


### Check if we have arguments
# sys.argv is scanned left to right. Anything starting with "-" is an
# option; all leading dashes are stripped before matching, so "-d" and
# "--dump" (or "--d") are equivalent. Everything else is collected as an
# input filename.
pkk_show_help = False
pkk_filenames = []
argc = 1
while argc < len(sys.argv):
    arg = sys.argv[argc]

    # Candidate option parameter: the following argument, if there is one.
    # It is consumed only when the option sets needs_param.
    needs_param = False
    param = sys.argv[argc + 1] if argc + 1 < len(sys.argv) else None

    # Check for option type arg
    if arg.startswith("-"):
        oarg = arg
        arg = arg.lstrip("-")

        if arg in ("help", "h"):
            pkk_show_help = True
        elif arg in ("dump", "d"):
            pkk_cfg["mode"] = 1
        elif arg in ("xml", "x"):
            pkk_cfg["mode"] = 2
        elif arg in ("normalize", "n"):
            pkk_cfg["normalize"] = True
        elif arg in ("annotate", "a"):
            pkk_cfg["annotate"] = True
        elif arg == "p":
            pkk_cfg["debug"] = True
        elif arg in ("verbosity", "v"):
            # Stored as the raw string; converted and range-checked later.
            needs_param = True
            pkk_cfg["verbosity"] = param
        else:
            pkk_fatal(u"Invalid option argument '{0}'.".format(oarg))

        if needs_param and param is None:
            pkk_fatal(u"Option '{0}' requires an argument.".format(oarg))
    else:
        # Non-option argument
        pkk_filenames.append(arg)

    # Skip the parameter as well when the option consumed one.
    argc += 2 if needs_param else 1


### Show help if requested
# Help is also shown when no input file was given. The option list covers
# every user-facing switch accepted by the argument parser; the parser
# additionally accepts "-p" (debug entry filter), which is not listed.
if pkk_show_help or not pkk_filenames:
    print(u"lxmldump - Dump ISO/FDIS 1951 XML file data")
    print(u"Usage: {0} <options> <input xml file(s)>".
        format(Path(sys.argv[0]).name))
    print(u"")
    print(u"  -h,  --help              Show this help")
    print(u"  -d,  --dump              Dump mode")
    print(u"  -x,  --xml               Output entries as XML")
    print(u"  -n,  --normalize         Output NFC normalized Unicode")
    print(u"  -a,  --annotate          Annotate strings")
    print(u"  -v,  --verbosity <n>     Set verbosity level (0 - 3)")
    print(u"")
    sys.exit(0)


### Validate settings
# The verbosity may still be the raw string taken from the command line,
# so convert it to an integer before range-checking it.
try:
    pkk_cfg["verbosity"] = int(pkk_cfg["verbosity"])
except (TypeError, ValueError):
    # int() raises ValueError for non-numeric text and TypeError for
    # values that are neither strings nor numbers.
    pkk_fatal(u"Verbosity level is not a valid integer.")
if not 0 <= pkk_cfg["verbosity"] <= 3:
    pkk_fatal(u"Invalid verbosity level value {0}.".format(pkk_cfg["verbosity"]))


### Handle each input file
for filename in pkk_filenames:
    # Parse XML file into element tree. This is the top-level boundary for
    # anything the parser may raise (unreadable file, malformed XML, ...),
    # so every failure is reported as a fatal error.
    try:
        uxml = xmlET.parse(filename)
    except Exception as e:
        pkk_fatal(u"XML parsing failed: {0}".format(str(e)))

    # Dump output
    try:
        xroot = uxml.getroot()
        for dnode in xroot.findall("./DictionaryEntry"):

            # In debug mode only the entries named in pkk_debug_list are
            # shown; entries without an identifier are skipped too instead
            # of aborting with a KeyError.
            if pkk_cfg["debug"] and dnode.attrib.get("identifier") not in pkk_debug_list:
                continue

            if pkk_cfg["mode"] == 0:
                pkk_output_node(0, dnode)
            elif pkk_cfg["mode"] == 1:
                pkk_dump_recursive(0, dnode)
            elif pkk_cfg["mode"] == 2:
                # encoding="unicode" makes tostring() return a str; any
                # other encoding yields bytes, whose str() is the
                # b'...' repr rather than the XML text.
                pkk_print(xmlET.tostring(dnode, encoding="unicode") + "\n")
            else:
                pkk_fatal("Invalid operation mode?")

            # Blank lines between entries.
            print("\n")

    except (BrokenPipeError, IOError):
        # Output went away (e.g. piped into a pager that quit): close
        # stderr and stop quietly.
        sys.stderr.close()
        sys.exit(1)

# All input files were processed: run the cleanup hook and exit with
# status 0.
pkk_cleanup()
sys.exit(0)