Version 7.6.3.2-android, tag libreoffice-7.6.3.2-android
[LibreOffice.git] / bin / flat-odf-cleanup.py
blob59b0a2af16b1f240ebc8dee42eff2c8863ad8a3f
1 #!/usr/bin/python3
2 # -*- tab-width: 4; indent-tabs-mode: nil; py-indent-offset: 4 -*-
4 # This file is part of the LibreOffice project.
6 # This Source Code Form is subject to the terms of the Mozilla Public
7 # License, v. 2.0. If a copy of the MPL was not distributed with this
8 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
11 import sys
12 # sadly need lxml because the python one doesn't preserve namespace prefixes
13 # and type-detection looks for the string "office:document"
14 from lxml import etree as ET
15 #import xml.etree.ElementTree as ET
def get_used_p_styles(root):
    """Return the set of paragraph style names referenced below *root*.

    Sources that are scanned:

    * ``text:style-name``, ``text:cond-style-name`` and ``text:class-names``
      on paragraph-like elements (``text:p``, ``text:h`` and the index
      entry templates listed below),
    * attributes on arbitrary elements that point at a paragraph style
      (``draw:text-style-name``, ``table:paragraph-style-name``,
      ``style:register-truth-ref-style-name``, ``form:text-style-name``,
      ``text:default-style-name``),
    * the ``style:apply-style-name`` of every ``style:map`` inside a
      paragraph style that is used as a conditional style.

    The result contains ``None`` when a paragraph-like element carries no
    ``text:style-name`` (or a map has no ``style:apply-style-name``).
    """
    text_ns = "{urn:oasis:names:tc:opendocument:xmlns:text:1.0}"
    style_ns = "{urn:oasis:names:tc:opendocument:xmlns:style:1.0}"

    paragraph_like = (
        "p",
        "h",
        "alphabetical-index-entry-template",
        "bibliography-entry-template",
        "illustration-index-entry-template",
        "index-source-style",
        "object-index-entry-template",
        "table-index-entry-template",
        "table-of-content-entry-template",
        "user-index-entry-template",
    )

    found = set()
    conditional = set()

    # document content
    for local_name in paragraph_like:
        for node in root.findall(".//" + text_ns + local_name):
            found.add(node.get(text_ns + "style-name"))
            cond_name = node.get(text_ns + "cond-style-name")
            if cond_name:
                conditional.add(cond_name)
            class_names = node.get(text_ns + "class-names")
            if class_names:
                found.update(class_names.split(" "))

    # attributes that reference a paragraph style from any element
    # (shapes, table templates, page layouts, form controls)
    direct_references = (
        "{urn:oasis:names:tc:opendocument:xmlns:drawing:1.0}text-style-name",
        "{urn:oasis:names:tc:opendocument:xmlns:table:1.0}paragraph-style-name",
        style_ns + "register-truth-ref-style-name",
        "{urn:oasis:names:tc:opendocument:xmlns:form:1.0}text-style-name",
    )
    for attribute in direct_references:
        for node in root.findall(".//*[@" + attribute + "]"):
            found.add(node.get(attribute))

    # conditional styles: whatever their maps apply is used as well
    for cond_name in conditional:
        query = (
            ".//" + style_ns + "style"
            + "[@" + style_ns + "family='paragraph']"
            + "[@" + style_ns + "name='" + cond_name + "']"
            + "/" + style_ns + "map"
        )
        for mapping in root.findall(query):
            found.add(mapping.get(style_ns + "apply-style-name"))

    # other styles (e.g. notes configuration)
    default_attribute = text_ns + "default-style-name"
    for node in root.findall(".//*[@" + default_attribute + "]"):
        found.add(node.get(default_attribute))

    return found
def add_parent_styles(usedstyles, styles):
    """Extend *usedstyles* in place with everything used styles depend on.

    For each element in *styles* whose ``style:name`` is in *usedstyles*,
    its ``style:parent-style-name`` and ``style:next-style-name`` (when
    present and non-empty) are added to the set.  Passes over *styles* are
    repeated until one of them adds nothing new.  Returns ``None``.
    """
    style_ns = "{urn:oasis:names:tc:opendocument:xmlns:style:1.0}"
    name_attribute = style_ns + "name"
    # next-style-name only occurs on paragraph styles and master-pages
    link_attributes = (
        style_ns + "parent-style-name",
        style_ns + "next-style-name",
    )

    while True:
        count_before = len(usedstyles)
        for candidate in styles:
            if candidate.get(name_attribute) not in usedstyles:
                continue
            for link in link_attributes:
                target = candidate.get(link)
                if target:
                    usedstyles.add(target)
        if len(usedstyles) == count_before:
            # a full pass added nothing: the closure is complete
            break
def remove_unused_styles(root, usedstyles, styles, name):
    """Remove every element of *styles* whose name is not in *usedstyles*.

    root       -- document root; ``office:automatic-styles`` and
                  ``office:styles`` are looked up below it
    usedstyles -- set of style names that must be kept
    styles     -- candidate style elements (children of one of the two
                  containers above)
    name       -- human readable kind of style, only used in the log output

    Each candidate's ``style:name`` is printed; removed ones are reported
    with a "removing unused ..." line.  Raises ValueError if a style to be
    removed is a child of neither container.
    """
    name_attribute = "{urn:oasis:names:tc:opendocument:xmlns:style:1.0}name"
    office_ns = "{urn:oasis:names:tc:opendocument:xmlns:office:1.0}"

    # The candidates do not tell us which container they live in, so both
    # are tried.  Either container may be missing from a document; find()
    # returns None then, which must not be dereferenced.
    containers = [
        container
        for container in (
            root.find(".//" + office_ns + "automatic-styles"),
            root.find(".//" + office_ns + "styles"),
        )
        if container is not None
    ]

    for style in styles:
        stylename = style.get(name_attribute)
        print(stylename)
        if stylename in usedstyles:
            continue
        # str(): a candidate without style:name must not break the logging
        print("removing unused " + name + " " + str(stylename))
        for container in containers:
            try:
                container.remove(style)
            except ValueError:
                # not a child of this container, try the next one
                continue
            break
        else:
            raise ValueError(
                "style " + str(stylename) + " is neither in"
                " office:automatic-styles nor in office:styles")
def collect_all_attribute(usedstyles, attribute, tree=None):
    """Add the value of *attribute* from every element carrying it.

    usedstyles -- set that receives the attribute values (modified in place)
    attribute  -- attribute name in ``{namespace-uri}local-name`` notation
    tree       -- element to search below; when omitted, the module-level
                  ``root`` set up by the script's ``__main__`` section is
                  used, which is what existing two-argument callers rely on

    Returns ``None``.
    """
    if tree is None:
        # Legacy behaviour: search the global document root.  This raises
        # NameError if the module is used without running the script body.
        tree = root
    usedstyles.update(
        element.get(attribute)
        for element in tree.findall(".//*[@" + attribute + "]")
    )
def remove_unused(root):
    """Strip unused styles and other ballast from a flat ODF tree, in place.

    *root* is the ``office:document`` root element.  The steps, in order:

     1) master pages not reachable from used paragraph/table styles or
        from ``text:``/``draw:master-page-name`` attributes
     2) paragraph styles
     3) list styles
     4) text styles
     5) ruby styles
     6) section styles
     7) presentation styles
     8) graphic styles
     9) drawing-page styles
    13) font-face declarations
    14) rsid attributes on ``style:text-properties``

    followed by removal of ``office:settings`` and ``office:scripts``.
    Progress is reported with print().  Returns ``None``.
    """
    text_ns = "{urn:oasis:names:tc:opendocument:xmlns:text:1.0}"
    style_ns = "{urn:oasis:names:tc:opendocument:xmlns:style:1.0}"
    table_ns = "{urn:oasis:names:tc:opendocument:xmlns:table:1.0}"
    draw_ns = "{urn:oasis:names:tc:opendocument:xmlns:drawing:1.0}"
    presentation_ns = "{urn:oasis:names:tc:opendocument:xmlns:presentation:1.0}"
    office_ns = "{urn:oasis:names:tc:opendocument:xmlns:office:1.0}"
    ooo_ns = "{http://openoffice.org/2009/office}"
    style_name = style_ns + "name"

    def styles_of_family(family):
        # all style:style elements of the given style:family
        return root.findall(
            ".//" + style_ns + "style[@" + style_ns + "family='" + family + "']")

    def collect(usedstyles, attribute):
        # Add the value of `attribute` from every element that carries it.
        # Done locally because collect_all_attribute() reads the module-level
        # global `root`, not this function's argument.
        for element in root.findall(".//*[@" + attribute + "]"):
            usedstyles.add(element.get(attribute))

    def collect_class_names(usedstyles, path, attribute):
        # class-names attributes hold a space separated list of style names
        for element in root.findall(path):
            usedstyles.update(element.get(attribute).split(" "))

    # 1) find all elements that may reference master pages - this gets rid
    #    of some paragraphs
    usedpstyles = get_used_p_styles(root)
    print(usedpstyles)
    usedtstyles = set()
    tables = root.findall(".//" + table_ns + "table")
    print(tables)
    for table in tables:
        usedtstyles.add(table.get(table_ns + "style-name"))
    pstyles = styles_of_family("paragraph")
    tstyles = styles_of_family("table")
    usedmasterpages = {"Standard"}  # assume this is the default on page 1
    # only automatic styles may have page breaks in LO, so no need to chase
    # parents or nexts
    for pstyle in pstyles:
        print(pstyle.get(style_name))
        if pstyle.get(style_name) in usedpstyles:
            usedmasterpages.add(pstyle.get(style_ns + "master-page-name"))
    for tstyle in tstyles:
        if tstyle.get(style_name) in usedtstyles:
            usedmasterpages.add(tstyle.get(style_ns + "master-page-name"))
    collect(usedmasterpages, text_ns + "master-page-name")
    collect(usedmasterpages, draw_ns + "master-page-name")
    print(usedmasterpages)
    # iterate parent/next until no more masterpage is added
    masterpages = root.findall(".//" + style_ns + "master-page")
    add_parent_styles(usedmasterpages, masterpages)
    # remove unused masterpages
    for mp in masterpages:
        if mp.get(style_name) not in usedmasterpages:
            print("removing unused master page " + mp.get(style_name))
            root.find(".//" + office_ns + "master-styles").remove(mp)

    # 2) remove unused paragraph styles
    # (recomputed: the removed master pages no longer contribute references)
    usedpstyles = get_used_p_styles(root)
    add_parent_styles(usedpstyles, pstyles)
    remove_unused_styles(root, usedpstyles, pstyles, "paragraph style")

    # 3) unused list styles - keep referenced from still used paragraph styles
    usedliststyles = set()
    collect(usedliststyles, style_ns + "list-style-name")
    list_references = (
        ("list", "style-name"),
        ("list-item", "style-override"),
        ("numbered-paragraph", "style-name"),
    )
    for tag, attribute in list_references:
        path = ".//" + text_ns + tag + "[@" + text_ns + attribute + "]"
        for element in root.findall(path):
            usedliststyles.add(element.get(text_ns + attribute))
    # ignore ones that are children of style:graphic-properties, those must
    # be handled as the containing style
    # there is no inheritance for these
    liststyles = root.findall("./*/" + text_ns + "list-style")
    remove_unused_styles(root, usedliststyles, liststyles, "list style")

    # 4) unused text styles
    usedtextstyles = set()
    usedsectionstyles = set()
    usedrubystyles = set()

    sections = {
        text_ns + "alphabetical-index",
        text_ns + "bibliography",
        text_ns + "illustration-index",
        text_ns + "index-title",
        text_ns + "object-index",
        text_ns + "section",
        text_ns + "table-of-content",
        text_ns + "table-index",
        text_ns + "user-index",
    }
    texts = {
        text_ns + "a",
        text_ns + "index-entry-bibliography",
        text_ns + "index-entry-chapter",
        text_ns + "index-entry-link-end",
        text_ns + "index-entry-link-start",
        text_ns + "index-entry-page-number",
        text_ns + "index-entry-span",
        text_ns + "index-entry-tab-stop",
        text_ns + "index-entry-text",
        text_ns + "index-title-template",
        text_ns + "linenumbering-configuration",
        text_ns + "list-level-style-number",
        text_ns + "list-level-style-bullet",
        text_ns + "outline-level-style",
        text_ns + "ruby-text",
        text_ns + "span",
    }
    # text:style-name means a different style family depending on the element
    for element in root.findall(".//*[@" + text_ns + "style-name]"):
        style = element.get(text_ns + "style-name")
        if element.tag == text_ns + "ruby":
            usedrubystyles.add(style)
        elif element.tag in sections:
            usedsectionstyles.add(style)
        elif element.tag in texts:
            usedtextstyles.add(style)

    text_style_attributes = (
        style_ns + "style-name",
        style_ns + "leader-text-style",
        style_ns + "text-line-through-text-style",
        text_ns + "visited-style-name",
        text_ns + "main-entry-style-name",
        text_ns + "citation-style-name",
        text_ns + "citation-body-style-name",
    )
    for attribute in text_style_attributes:
        collect(usedtextstyles, attribute)
    collect_class_names(
        usedtextstyles,
        ".//" + text_ns + "span[@" + text_ns + "class-names]",
        text_ns + "class-names")
    textstyles = styles_of_family("text")
    add_parent_styles(usedtextstyles, textstyles)
    remove_unused_styles(root, usedtextstyles, textstyles, "text style")

    # 5) unused ruby styles - can't have parents?
    rubystyles = styles_of_family("ruby")
    remove_unused_styles(root, usedrubystyles, rubystyles, "ruby style")

    # 6) unused section styles - can't have parents?
    sectionstyles = styles_of_family("section")
    remove_unused_styles(root, usedsectionstyles, sectionstyles, "section style")

    # 7) presentation styles
    usedpresentationstyles = set()
    collect(usedpresentationstyles, presentation_ns + "style-name")
    collect_class_names(
        usedpresentationstyles,
        ".//*[@" + presentation_ns + "class-names]",
        presentation_ns + "class-names")
    presentationstyles = styles_of_family("presentation")
    add_parent_styles(usedpresentationstyles, presentationstyles)
    remove_unused_styles(root, usedpresentationstyles, presentationstyles, "presentation style")

    # 8) graphic styles
    # draw:style-name on these elements names a drawing-page style instead
    pages = {
        draw_ns + "page",
        presentation_ns + "notes",
        style_ns + "handout-master",
        style_ns + "master-page",
    }
    usedgraphicstyles = set()
    useddrawingpagestyles = set()
    for element in root.findall(".//*[@" + draw_ns + "style-name]"):
        style = element.get(draw_ns + "style-name")
        if element.tag in pages:
            useddrawingpagestyles.add(style)
        else:
            usedgraphicstyles.add(style)
    collect_class_names(
        usedgraphicstyles,
        ".//*[@" + draw_ns + "class-names]",
        draw_ns + "class-names")
    graphicstyles = styles_of_family("graphic")
    add_parent_styles(usedgraphicstyles, graphicstyles)
    remove_unused_styles(root, usedgraphicstyles, graphicstyles, "graphic style")

    # 9) drawing-page styles
    drawingpagestyles = styles_of_family("drawing-page")
    add_parent_styles(useddrawingpagestyles, drawingpagestyles)
    remove_unused_styles(root, useddrawingpagestyles, drawingpagestyles, "drawing-page style")

    # TODO 3 other styles

    # 13) unused font-face-decls
    usedfonts = set()
    collect(usedfonts, style_ns + "font-name")
    collect(usedfonts, style_ns + "font-name-asian")
    collect(usedfonts, style_ns + "font-name-complex")
    for font in root.findall(".//" + style_ns + "font-face"):
        if font.get(style_name) not in usedfonts:
            print("removing unused font-face " + font.get(style_name))
            root.find(".//" + office_ns + "font-face-decls").remove(font)

    # 14) remove rsid attributes
    for style in root.findall(".//" + style_ns + "style"):
        tp = style.find(".//" + style_ns + "text-properties")
        if tp is None:
            continue
        for rsid in ("rsid", "paragraph-rsid"):
            if ooo_ns + rsid in tp.attrib:
                print("removing " + rsid + " from " + style.get(style_name))
                del tp.attrib[ooo_ns + rsid]

    # remove office:settings; scripts are almost never needed either
    for tag in ("settings", "scripts"):
        element = root.find(".//" + office_ns + tag)
        if element is not None:
            root.remove(element)

    # TODO: replace embedded image with some tiny one
    # TODO: perhaps replace text with xxx (optionally)?
if __name__ == "__main__":
    # Usage: flat-odf-cleanup.py INFILE OUTFILE
    # INFILE is a flat ODF document; the cleaned-up tree is written to OUTFILE.
    if len(sys.argv) < 3:
        sys.exit("usage: " + sys.argv[0] + " INFILE OUTFILE")
    infile = sys.argv[1]
    outfile = sys.argv[2]

    dom = ET.parse(infile)
    # Keep this module-level name: collect_all_attribute() looks up `root`
    # as a global.
    root = dom.getroot()

    remove_unused(root)

    # write output
    dom.write(outfile, encoding='utf-8', xml_declaration=True)
306 TODO
307 chart:style-name
308 -> chart
309 db:style-name
310 -> table-column, table
311 db:default-row-style-name
312 -> table-row
313 db:default-cell-style-name
314 -> cell
315 style:data-style-name
316 -> data style
317 presentation:presentation-page-layout-name
318 -> presentation-page-layout
319 style:page-layout-name
320 -> "page layout style" ?
321 style:percentage-data-style-name
322 -> data style
323 table:default-cell-style-name
324 -> cell
326 draw:stroke-dash-names
327 -> draw:stroke-dash
329 draw:fill-gradient-name
330 -> gradient
331 draw:fill-hatch-name
332 -> hatch
333 draw:fill-image-name
334 -> bitmap
335 draw:opacity-name
336 -> gradient
337 draw:stroke-dash
338 -> draw:stroke-dash
339 draw:marker-start
340 draw:marker-end
343 # vim: set shiftwidth=4 softtabstop=4 expandtab: