#! /usr/bin/env nix-shell
#! nix-shell -I nixpkgs=channel:nixos-unstable -i python3 -p python3 -p python3.pkgs.lxml
"""
Pandoc will strip any markup within code elements,
so let’s escape that markup so that it can be handled manually.
"""
import re
import sys

import lxml.etree as ET
def replace_element_by_text(el: ET.Element, text: str) -> None:
    """
    Replace the element ``el`` inside its parent with the plain string ``text``.

    The element's own tail text is preserved by appending it to ``text``.
    The combined string is attached to the tail of the preceding sibling or,
    when ``el`` is the first child, to the text of the parent. ``el`` is then
    removed from the parent. An element without a parent is left untouched.

    Source: https://stackoverflow.com/a/10520552/160386
    SPDX-License-Identifier: CC-BY-SA-3.0
    """
    # Removing an lxml element drops its tail as well, so fold the tail into
    # the replacement string before detaching the element.
    text = text + (el.tail or "")

    parent = el.getparent()
    if parent is None:
        # Nothing to splice the text into.
        return

    previous = el.getprevious()
    if previous is not None:
        # Text following a sibling lives in that sibling's tail.
        previous.tail = (previous.tail or "") + text
    else:
        # No preceding sibling: the text belongs to the parent itself.
        parent.text = (parent.text or "") + text

    parent.remove(el)
# Namespace URI of DocBook 5 elements; used to restrict the XPath query.
DOCBOOK_NS = "http://docbook.org/ns/docbook"

# List of elements that pandoc’s DocBook reader strips markup from.
# https://github.com/jgm/pandoc/blob/master/src/Text/Pandoc/Readers/DocBook.hs
# NOTE(review): the `code_elements` list this comment introduces is used by
# the main script but is not defined in the visible source — confirm it is present.

# Matches one namespace declaration attribute (`xmlns="…"` or `xmlns:prefix="…"`)
# together with the whitespace preceding it.
XMLNS_REGEX = re.compile(r'\s+xmlns(?::[^=]+)?="[^"]*"')
# Matches the first tag of a string, allowing leading whitespace.
ROOT_ELEMENT_REGEX = re.compile(r'^\s*<[^>]+>')
def remove_xmlns(match: re.Match) -> str:
    """
    Removes xmlns attributes.

    Expects a match containing an opening tag.

    Returns the matched text with every substring matching ``XMLNS_REGEX``
    deleted, which makes this function usable as the replacement callback
    of ``re.sub``.
    """
    return XMLNS_REGEX.sub('', match.group(0))
if __name__ == '__main__':
    # Validate explicitly: an `assert` would be skipped under `python -O`.
    if len(sys.argv) < 3:
        sys.exit("usage: escape-code-markup.py <input> <output>")

    tree = ET.parse(sys.argv[1])
    name_predicate = " or ".join([f"local-name()='{el}'" for el in code_elements])

    # Select every direct child element of a DocBook code element.
    for markup in tree.xpath(f"//*[({name_predicate}) and namespace-uri()='{DOCBOOK_NS}']/*"):
        # Serialise without the tail: replace_element_by_text appends the
        # element's tail itself, so including it here would duplicate it.
        text = ET.tostring(markup, encoding=str, with_tail=False)

        # tostring adds xmlns attributes to the element we want to stringify
        # as if it was supposed to be usable standalone.
        # We are just converting it to CDATA so we do not care.
        # Let’s strip the namespace declarations to keep the code clean.
        #
        # Note that this removes even namespaces that were potentially
        # in the original file. Though, that should be very rare –
        # most of the time, we will stringify empty DocBook elements
        # like <xref> or <co> or, at worst, <link> with xlink:href attribute.
        #
        # Also note that the regex expects the root element to be first
        # thing in the string. But that should be fine, the tostring method
        # does not produce XML declaration or doctype by default.
        text = ROOT_ELEMENT_REGEX.sub(remove_xmlns, text)

        replace_element_by_text(markup, text)

    tree.write(sys.argv[2])