bump date and make CI happy
[CppCoreGuidelines.git] / scripts / python / md-split.py
blob2403c6f6db962da394f32e662e8efe5d242392e4
1 #! /usr/bin/env python
3 # A script that splits a Markdown file into plain text (for spell checking) and c++ files.
6 from __future__ import absolute_import, print_function, unicode_literals
8 import os
9 import shutil
10 import io
11 import argparse
13 import re, cgi
# Matches either an HTML comment (<!-- ... -->) or any single <...> tag.
TAG_REGEX = re.compile(r'(<!--.*?-->|<[^>]*>)')
# Captures (group 1) the quoted value of the first-to-last name="..." attribute
# found on a line; get_marker() uses it to pick up section anchor ids.
NAMED_A_TAG_REGEX = re.compile(r'.*name ?= ?"([^"]*)"')
def main():
    """
    Split a Markdown file into spell-checkable plain text and C++ snippet files.

    This script ended up ugly, so in case somebody wants to reimplement it,
    here is the spec that grew over time.

    What it should do is take a markdown file, and split it into more files.
    A targetfile should have the same number of lines as the original, with
    source code snippets and markdown non-words removed, for spell-checking.

    Each code snippet should go into a separate file in codedir.

    Each code snippet should get additional C++ code around it to help compile
    the line in context, with some heuristic guessing of what is needed around.
    The wrapping code should have a token in each line allowing other tools to
    filter out these lines.

    The name for each file chosen consists of the section id in the markdown
    document and a counter for the snippet inside the section.

    Snippets without code (only comments) or containing lines starting with ???
    should not yield files, but the counter for naming snippets should still
    increment.
    """
    arg_parser = argparse.ArgumentParser(description='Split md file into plain text and code blocks')
    arg_parser.add_argument('sourcefile', help='which file to read')
    arg_parser.add_argument('targetfile', help='where to put plain text')
    arg_parser.add_argument('codedir', help='where to put codeblocks')
    args = arg_parser.parse_args()

    # make sure the snippet directory is there
    if not os.path.exists(args.codedir):
        os.makedirs(args.codedir)

    # start from a clean plain-text output
    if os.path.exists(args.targetfile):
        os.remove(args.targetfile)

    snippet_counter = 0   # index of the next snippet within the current section
    current_section = ''  # id of the most recent named anchor
    linenum = 0
    with io.open(args.sourcefile, 'r') as md_in, io.open(args.targetfile, 'w') as text_out:
        for line in md_in:
            linenum += 1
            indent_depth = is_code(line)
            if indent_depth:
                # process_code consumes the whole block and hands back the
                # first line it read that is not part of it
                line, linenum = process_code(
                    md_in, text_out, line, linenum,
                    args.sourcefile, args.codedir,
                    current_section, snippet_counter,
                    indent_depth)
                snippet_counter += 1

            # Here the line either never was code, or it is what process_code
            # returned after dealing with n code lines. A returned line that
            # still counts as code at this depth produces no text output.
            if indent_depth >= 4 and is_code(line, indent_depth):
                continue

            # remember the section id used to name the following snippets
            section_id = get_marker(line)
            if section_id is not None:
                snippet_counter = 0
                current_section = section_id
            text_out.write(stripped(line))

    assert line_length(args.sourcefile) == line_length(args.targetfile)
def process_code(read_filehandle, text_filehandle, line, linenum, sourcefile, codedir, name, index, indent_depth):
    """
    Consume one code block that starts at `line` and emit its outputs.

    For every consumed input line exactly one line is written to
    text_filehandle: the text following '//' when the line has a comment,
    otherwise an empty line, so that line numbers stay aligned with the source.
    When the block contains actual code and no placeholder line, the collected
    lines are written to '<codedir>/<name><index>.cpp' via write_with_harness.

    Args:
        read_filehandle: iterator over the remaining input lines; it is
            advanced with next(), so the caller's iteration continues after
            the lines consumed here.
        text_filehandle: writable handle for the plain-text output.
        line: first line of the block (an opening '```' fence or indented code).
        linenum: line number of `line` in the source.
        sourcefile: source file name, recorded in the generated code file.
        codedir: directory receiving the generated code file.
        name: section id used as the file name prefix.
        index: snippet counter within the section.
        indent_depth: indentation of the block, used for dedenting.

    Returns:
        (line, linenum): the first line read that is not part of the block
        ('' when the input ended) and the updated line counter.
    """
    fenced = (line.strip() == '```')
    if fenced:
        # Placeholder for the opening fence. It is written before reading on,
        # so that an input ending right on the fence still keeps the line count.
        text_filehandle.write('\n')
        try:
            # next() raises StopIteration at end of input, which is what the
            # handlers here rely on; readline() would silently return ''.
            line = next(read_filehandle)
        except StopIteration:
            return ('', linenum)
        linenum += 1
    start_linenum = linenum
    has_actual_code = False
    has_question_marks = False
    reached_eof = False
    linebuffer = []
    while ((fenced and line.strip() != '```') or (not fenced and is_inside_code(line, indent_depth))):
        # copy comments to plain text for spell check
        comment_idx = line.find('//')
        no_comment_line = line
        if comment_idx >= 0:
            no_comment_line = line[:comment_idx].strip()
            text_filehandle.write(line[comment_idx + 2:])
        else:
            # write empty line so line numbers stay stable
            text_filehandle.write('\n')

        if (not has_actual_code
                and not line.strip().startswith('//')
                and not line.strip().startswith('???')
                and not line.strip() == ''):
            has_actual_code = True

        if not line.strip() == '```':
            # NOTE(review): no_comment_line is only stripped when the line has
            # a '//' comment, so a bare '...' line never matches here --
            # confirm whether that is intended before changing it.
            if '???' == no_comment_line or '...' == no_comment_line:
                has_question_marks = True
            linebuffer.append(dedent(line, indent_depth) if not fenced else line)
        try:
            line = next(read_filehandle)
            linenum += 1
        except StopIteration:
            # input ended inside the block; '' tells the caller nothing is left
            line = ''
            reached_eof = True
            break
    codefile = os.path.join(codedir, '%s%s.cpp' % (name, index))
    if fenced and not reached_eof:
        # placeholder for the closing fence (absent if the fence was never closed)
        text_filehandle.write('\n')

    if has_actual_code and not has_question_marks:
        linebuffer = clean_trailing_newlines(linebuffer)
        write_with_harness(codefile, sourcefile, start_linenum, linebuffer)
    return (line, linenum)
def clean_trailing_newlines(linebuffer):
    """
    Return a copy of linebuffer without its trailing empty lines.

    Only elements that are exactly '\\n' are dropped, and only at the end of
    the list; empty lines between code lines are kept. The list passed in is
    left untouched.
    """
    end = len(linebuffer)
    # walk back over the run of bare newlines at the tail
    while end > 0 and linebuffer[end - 1] == '\n':
        end -= 1
    return linebuffer[:end]
def write_with_harness(codefile, sourcefile, start_linenum, linebuffer):
    '''Write the snippet to codefile, preceded by lines that make it likely to compile.

    The preamble pulls in commonly used headers; each of its lines carries the
    "by md-split" token except the last one, which records the source file and
    the line number where the snippet starts.
    '''
    # This is work in progress, the main issue remains handling class
    # declarations in in-function code differently
    preamble = '''\
#include<stdio.h> // by md-split
#include<stdlib.h> // by md-split
#include<tuple> // by md-split
#include<utility> // by md-split
#include<limits> // by md-split
#include<functional> // by md-split
#include<string> // by md-split
#include<map> // by md-split
#include<iostream> // by md-split
#include<vector> // by md-split
#include<algorithm> // by md-split
#include<memory> // by md-split
using namespace std; // by md-split
// %s : %s
''' % (sourcefile, start_linenum)
    with io.open(codefile, 'w') as out:
        out.write(preamble)
        # TODO: if not toplevel code, wrap inside class
        out.writelines(linebuffer)
def is_code(line, indent_depth = 4):
    '''Return the number of leading spaces of line, or 0 if it is not code.

    A line counts as code in the markup when it starts with at least
    indent_depth spaces.
    '''
    required_prefix = ' ' * indent_depth
    if not line.startswith(required_prefix):
        return 0
    without_indent = line.lstrip(' ')
    return len(line) - len(without_indent)
def is_inside_code(line, indent_depth):
    '''Tell whether line still belongs to an indented code block.

    Blank (whitespace-only or empty) lines count as part of the block, as do
    lines that is_code() recognises at the given indent depth.
    '''
    if not line.strip():
        return True
    return is_code(line, indent_depth) > 0
def stripped(line):
    '''Return line with html tags removed and markdown punctuation blanked.

    Tags and html comments matched by TAG_REGEX are deleted; each of the
    characters ( ) [ ] # * is replaced by a single space. The trailing newline
    is not affected, so the number of lines is preserved.
    '''
    # Remove well-formed html tags, fixing mistakes by legitimate users
    sline = TAG_REGEX.sub('', line)
    # continue from the tag-free text, otherwise the removal above is lost
    sline = re.sub(r'[()\[\]#*]', ' ', sline)
    return sline
def dedent(line, indent_depth):
    '''Strip one level of block indentation from line.

    Removes indent_depth leading spaces when present, otherwise a single
    leading tab; a line with neither is returned unchanged.
    '''
    space_prefix = ' ' * indent_depth
    if not line.startswith(space_prefix):
        return line[1:] if line.startswith('\t') else line
    return line[indent_depth:]
def get_marker(line):
    '''Return the value of the name="..." attribute on line, or None.

    Only lines that contain at least one html tag or comment (per TAG_REGEX)
    are considered; the value is whatever NAMED_A_TAG_REGEX captures.
    '''
    if not TAG_REGEX.search(line):
        return None
    namematch = NAMED_A_TAG_REGEX.match(line)
    # group 0 would be the full match; group 1 is the attribute value
    return namematch.group(1) if namematch else None
def line_length(filename):
    '''Return the number of lines in the file called filename.

    A final line without a trailing newline is counted as well.
    '''
    # the with-block closes the handle instead of leaving it to the garbage collector
    with open(filename) as filehandle:
        return sum(1 for _ in filehandle)
# Run the splitter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()