comparison lxmldump.py @ 13:3bd772fd6a50

Cleanups.
author Matti Hamalainen <ccr@tnsp.org>
date Tue, 11 May 2021 12:49:15 +0300
parents d50e71642be7
children 7498bda8b4a2
comparison
equal deleted inserted replaced
12:d50e71642be7 13:3bd772fd6a50
52 ### 52 ###
53 def pkk_cleanup(): 53 def pkk_cleanup():
54 return 0 54 return 0
55 55
56 56
57 ## Wrapper for print() 57 ## Print string to stdout using normalized Unicode if enabled
58 def pkk_print(smsg): 58 def pkk_print(smsg):
59 try: 59 try:
60 if pkk_cfg["normalize"]: 60 if pkk_cfg["normalize"]:
61 sys.stdout.write(unicodedata.normalize("NFC", smsg)) 61 sys.stdout.write(unicodedata.normalize("NFC", smsg))
62 else: 62 else:
64 64
65 except (BrokenPipeError, IOError) as e: 65 except (BrokenPipeError, IOError) as e:
66 sys.stderr.close() 66 sys.stderr.close()
67 67
68 68
69 ## Print string with indentation
69 def pkk_printi(indent, smsg): 70 def pkk_printi(indent, smsg):
70 pkk_print((" " * indent) + smsg) 71 pkk_print((" " * indent) + smsg)
71 72
72 73
74 ## Check value against current verbosity level
73 def pkk_verbosity(lvl): 75 def pkk_verbosity(lvl):
74 return pkk_cfg["verbosity"] >= lvl 76 return pkk_cfg["verbosity"] >= lvl
75 77
76 78
77 ## Fatal error handler 79 ## Fatal error handler
85 pkk_cleanup() 87 pkk_cleanup()
86 print(u"\nQuitting due to SIGINT / Ctrl+C!") 88 print(u"\nQuitting due to SIGINT / Ctrl+C!")
87 sys.exit(1) 89 sys.exit(1)
88 90
89 91
90 ## 92 ## Clean string by removing tabs and newlines
91 def pkk_str_clean(mstr): 93 def pkk_str_clean(mstr):
92 return re.sub(r'[\n\r\t]', '', mstr) 94 return re.sub(r'[\n\r\t]', '', mstr)
93 95
94 96
97 ## Format "Ptr" node as text
95 def pkk_ptr_to_text(pnode): 98 def pkk_ptr_to_text(pnode):
96 return "PTR: <{}>{}</>".format( 99 return "PTR: <{}>{}</>".format(
97 pnode.attrib["{http://www.w3.org/TR/xlink}href"], 100 pnode.attrib["{http://www.w3.org/TR/xlink}href"],
98 ("".join(pnode.itertext())).strip()) 101 ("".join(pnode.itertext())).strip())
99 102
100 103
104 ## Get text inside a given node
101 def pkk_get_text(lnode): 105 def pkk_get_text(lnode):
102 stmp = "" 106 stmp = ""
103 for pnode in lnode.iter(): 107 for pnode in lnode.iter():
104 if pnode.tag == "Ptr": 108 if pnode.tag == "Ptr":
105 stmp += pkk_ptr_to_text(pnode) 109 stmp += pkk_ptr_to_text(pnode)
115 stmp += pkk_str_clean(pnode.tail) 119 stmp += pkk_str_clean(pnode.tail)
116 120
117 return stmp.strip() 121 return stmp.strip()
118 122
119 123
120 ## 124 ## Simple recursive dump starting at given node
121 def pkk_dump_recursive(indent, lnode): 125 def pkk_dump_recursive(indent, lnode):
122 if lnode.tag in ["Example"]: 126 if lnode.tag in ["Example"]:
123 stmp = pkk_get_text(lnode) 127 stmp = pkk_get_text(lnode)
124 pkk_printi(indent, "{} \"{}\"\n".format(lnode.tag, stmp)) 128 pkk_printi(indent, "{} \"{}\"\n".format(lnode.tag, stmp))
125 else: 129 else:
138 pkk_printi(indent, "{}{}{}\n".format(lnode.tag, atmp, stmp)) 142 pkk_printi(indent, "{}{}{}\n".format(lnode.tag, atmp, stmp))
139 for qnode in lnode.findall("./*"): 143 for qnode in lnode.findall("./*"):
140 pkk_dump_recursive(indent + 1, qnode) 144 pkk_dump_recursive(indent + 1, qnode)
141 145
142 146
143 ## 147 ## Output item under given node
144 def pkk_output_one(indent, dnode, dsub, dfmt): 148 def pkk_output_subs_fmt(indent, dnode, dsub, dname, dfmt):
145 for qnode in dnode.findall(dsub): 149 for qnode in dnode.findall(dsub):
146 pkk_printi(indent, dfmt.format(pkk_get_text(qnode))) 150 pkk_printi(indent, dfmt.format(dname, pkk_get_text(qnode)))
147 151
148 152
149 def pkk_output_subs(indent, dnode, dsub, dname): 153 def pkk_output_subs_prefix(indent, dnode, dsub, dname):
150 for qnode in dnode.findall(dsub): 154 pkk_output_subs_fmt(indent, dnode, dsub, dname, "{0} \"{1}\"\n")
151 pkk_printi(indent, "{} \"{}\"\n".format(dname, pkk_get_text(qnode)))
152 155
153 156
154 def pkk_output_sense(indent, dnode): 157 def pkk_output_sense(indent, dnode):
155 pkk_output_subs(indent, dnode, "./SearchForm", "srch") 158 pkk_output_subs_prefix(indent, dnode, "./SearchForm", "srch")
156 pkk_output_subs(indent, dnode, "./Definition", "defn") 159 pkk_output_subs_prefix(indent, dnode, "./Definition", "defn")
157 160
158 for wnode in dnode.findall("./ExampleBlock/ExampleCtn"): 161 for wnode in dnode.findall("./ExampleBlock/ExampleCtn"):
159 sstr = pkk_get_text(wnode.find("./Example")) 162 sstr = pkk_get_text(wnode.find("./Example"))
160 lstr = "" 163 lstr = ""
161 164
171 174
172 175
173 def pkk_output_node(indent, dnode): 176 def pkk_output_node(indent, dnode):
174 177
175 for wnode in dnode.findall("./HeadwordCtn"): 178 for wnode in dnode.findall("./HeadwordCtn"):
176 pkk_output_one (indent, wnode, "./Headword", "\"{}\":\n") 179 pkk_output_subs_fmt(indent, wnode, "./Headword", "", "\"{1}\":\n")
177 pkk_output_sense(indent + 1, wnode) 180 pkk_output_sense(indent + 1, wnode)
178 181
179 index = 1 182 index = 1
180 for wnode in dnode.findall("./SenseGrp"): 183 for wnode in dnode.findall("./SenseGrp"):
181 pkk_printi(indent + 1, "sense #{}\n".format(index)) 184 pkk_printi(indent + 1, "sense #{}\n".format(index))