view lxmldump.py @ 6:34a89d61dbe7

Merge and cleanup.
author Matti Hamalainen <ccr@tnsp.org>
date Mon, 10 May 2021 21:43:00 +0300
parents 274b2091137c 7ce08dea935b
children 4b4299b62f7f
line wrap: on
line source

#!/usr/bin/python3 -B
# coding=utf-8
###
### lxmldump - Dump ISO/FDIS 1951 XML file data
### Programmed and designed by Matti 'ccr' Hämäläinen <ccr@tnsp.org>
### (C) Copyright 2021 Tecnic Software productions (TNSP)
###
### Python 3.7+ required!
###
import sys
import signal
import re
from pathlib import Path
import xml.etree.ElementTree as xmlET
import unicodedata

assert sys.version_info >= (3, 7)


###
### Default settings
###
pkk_cfg = {
    # True: dump the full element tree of every entry (-d / --dump);
    # False: print one "headwords : definitions" line per entry.
    "dump": False,
    # True: pkk_print() NFC-normalizes text before writing it (-n / --normalize).
    "normalize": False,
}


###
### Misc. helper functions, etc
###
def pkk_cleanup():
    """Cleanup hook run before the program exits.

    There is currently nothing to release, so this only reports success.

    Returns:
        Always 0.
    """
    return 0


## Wrapper for print()
def pkk_print(smsg):
    """Write smsg to stdout, NFC-normalized when pkk_cfg["normalize"] is set.

    Despite the name, no newline is appended; callers supply their own.
    """
    text = unicodedata.normalize("NFC", smsg) if pkk_cfg["normalize"] else smsg
    sys.stdout.write(text)


## Fatal error handler
def pkk_fatal(smsg):
    """Report a fatal error and terminate the program.

    The message is written to stderr so that it never ends up inside the
    dump data on stdout, which is typically piped or redirected to a file.

    Args:
        smsg: Human-readable description of the error.

    Raises:
        SystemExit: Always, with exit status 1.
    """
    print("ERROR: " + smsg, file=sys.stderr)
    sys.exit(1)


## Handler for SIGINT signals
def pkk_signal_handler(signal, frame):
    """SIGINT handler: run cleanup, notify the user and exit with status 1.

    Neither argument is used. Note that the `signal` parameter shadows the
    `signal` module inside this function, which is harmless here because
    the module is not referenced in the body.
    """
    pkk_cleanup()
    # Leading newline moves past the "^C" the terminal usually echoes.
    sys.stdout.write("\nQuitting due to SIGINT / Ctrl+C!\n")
    sys.exit(1)


## Recursively dump an element and its children as an indented tree
def pkk_dump_recursive(lnode, indent):
    """Write lnode and its descendants to stdout, one element per line.

    Each line is indented four spaces per nesting level and shows the tag,
    the attribute dict (if any) and the element's own stripped text in
    double quotes (if non-empty). "Example" elements are special-cased:
    all text beneath them is joined into a single quoted string and their
    children are not dumped separately.

    All output goes through pkk_print() so that the "normalize" setting
    applies uniformly, including to Example lines.

    Args:
        lnode: Element to dump.
        indent: Nesting depth of lnode (0 for the top-level entry).
    """
    prefix = "    " * indent

    if lnode.tag == "Example":
        # itertext() also yields the text and tails of nested elements,
        # so inline markup inside the example is flattened into one string.
        stmp = "".join(lnode.itertext()).strip()
        pkk_print("{}{} \"{}\"\n".format(prefix, lnode.tag, stmp))
        return

    stmp = ""
    if lnode.text is not None:
        tmp = str(lnode.text).strip()
        if tmp != "":
            stmp = " \"" + tmp + "\""

    atmp = " " + str(lnode.attrib) if len(lnode.attrib) > 0 else ""

    pkk_print("{}{}{}{}\n".format(prefix, lnode.tag, atmp, stmp))
    for qnode in lnode.findall("./*"):
        pkk_dump_recursive(qnode, indent + 1)


## Collect the text of child elements matching an ElementTree path
def _pkk_collect_text(pnode, path, dest):
    """Append the stripped, non-empty text of each match of path under pnode to dest."""
    for qnode in pnode.findall(path):
        # Element.text is None when the element has no leading text;
        # str(None) would otherwise put a literal "None" into the output.
        text = (qnode.text or "").strip()
        if text:
            dest.append(text)


## Output the headwords and definitions of one dictionary entry
def pkk_output_node(dnode):
    """Write a one-line "headwords : definitions" summary of dnode.

    Headwords are the SearchForm texts found under each HeadwordCtn child,
    joined with ", ". Definitions are the Definition texts found under each
    HeadwordCtn and then under each SenseGrp child, joined with " ; ".
    Elements whose text is missing or blank are skipped.

    Args:
        dnode: The entry element to summarize.
    """
    wlist = []
    dlist = []
    for wnode in dnode.findall("./HeadwordCtn"):
        _pkk_collect_text(wnode, "./SearchForm", wlist)
        _pkk_collect_text(wnode, "./Definition", dlist)

    for wnode in dnode.findall("./SenseGrp"):
        _pkk_collect_text(wnode, "./Definition", dlist)

    pkk_print("{} : {}\n".format(", ".join(wlist), " ; ".join(dlist)))


###
### Main program starts
###
# Route SIGINT (Ctrl+C) to pkk_signal_handler, which runs cleanup and exits.
signal.signal(signal.SIGINT, pkk_signal_handler)


### Check if we have arguments
pkk_show_help = False
pkk_filenames = []
argc = 1
while argc < len(sys.argv):
    arg = sys.argv[argc]

    # None of the current options takes a value, so needs_param stays False.
    # The look-ahead is kept so that an option taking a value can set it.
    needs_param = False
    param = sys.argv[argc + 1] if argc + 1 < len(sys.argv) else None

    if arg.startswith("-"):
        # Keep the original spelling for error messages, then drop every
        # leading dash so "-d", "--d" and "--dump" are all matched below.
        oarg = arg
        arg = arg.lstrip("-")

        if arg in ("help", "h"):
            pkk_show_help = True
        elif arg in ("dump", "d"):
            pkk_cfg["dump"] = True
        elif arg in ("normalize", "n"):
            pkk_cfg["normalize"] = True
        else:
            pkk_fatal(u"Invalid option argument '{0}'.".format(oarg))

        if needs_param and param is None:
            pkk_fatal(u"Option '{0}' requires an argument.".format(oarg))
    else:
        # Anything not starting with a dash is an input file name
        pkk_filenames.append(arg)

    # Skip the consumed value as well when the option took one
    argc += 2 if needs_param else 1


### Show help if requested
# Help is shown both on request and when no input files were given.
if pkk_show_help or not pkk_filenames:
    print("lxmldump - Dump ISO/FDIS 1951 XML file data")
    print("Usage: {0} <options> <input xml file(s)>".format(
        Path(sys.argv[0]).name))
    print("")
    # The argument parser accepts "-h" as well as "--help", so list both.
    print("  -h,  --help              Show this help")
    print("  -d,  --dump              Dump mode")
    print("  -n,  --normalize         Output NFC normalized Unicode")
    print("")
    sys.exit(0)


### Handle each input file
for filename in pkk_filenames:
    # Parse XML file into element tree. Any failure (unreadable file,
    # malformed XML, ...) is fatal; the message names the offending file
    # because several inputs may have been given.
    try:
        uxml = xmlET.parse(filename)
    except Exception as e:
        pkk_fatal("XML parsing failed for '{0}': {1}".format(filename, e))

    # Dump output
    try:
        xroot = uxml.getroot()
        for dnode in xroot.findall("./DictionaryEntry"):
            if pkk_cfg["dump"]:
                pkk_dump_recursive(dnode, 0)
                # Blank lines separate consecutive entries in dump mode
                print("\n\n")
            else:
                pkk_output_node(dnode)

    except OSError:
        # Covers BrokenPipeError (a subclass of OSError; IOError is an alias
        # of it), e.g. when output is piped to a program that exits early.
        # NOTE(review): stderr is closed presumably to silence the
        # interpreter's complaint when the final flush of stdout fails
        # at shutdown - confirm.
        sys.stderr.close()
        sys.exit(1)

pkk_cleanup()
sys.exit(0)