scripts/python/md-split.py

   1 #! /usr/bin/env python
   2
   3 # A script that splits a Markdown file into plain text (for spell checking) and c++ files.
   4
   5
   6 from __future__ import absolute_import, print_function, unicode_literals
   7
   8 import os
   9 import shutil
  10 import io
  11 import argparse
  12
  13 import re
# Matches either an HTML comment (<!-- ... -->, shortest match) or any
# <...> tag, i.e. a '<' followed by anything up to the next '>'.
TAG_REGEX = re.compile(r'(<!--.*?-->|<[^>]*>)')
# Captures the quoted value of a name="..." attribute (optional single space
# around '='). Because of the greedy leading '.*', using match() on a line
# captures the last such attribute on that line.
NAMED_A_TAG_REGEX = re.compile(r'.*name ?= ?"([^"]*)"')
  16
  17 def main():
  18     """
  19     This script ended up ugly, so in case somebody wants to reimplement, here is the spec that grew by time.
  20
  21     What it should do it take a markdown file, and split it into more files. A targetfile should have the same
  22     number of lines as the original, with source code snippets and markdown non-words removed, for spell-checking.
  23
  24     Each code snipped should go into a separate file in codedir.
  25
  26     Each code snipped should get additional C++ code around it to help compile the line in context, with
  27     some heuristic guessing of what is needed around. The wrapping code should have a token in each line allowing
  28     other tools to filter out these lines
  29
  30     The name for each file chosen consists os the section id in the markdown document, a counter for the snippet inside the section.
  31
  32     Snippets without code (only comments) or containing lines starting with ??? should not yeld files,
  33     but the counter for naming snippets should still increment.
  34     """
  35     parser = argparse.ArgumentParser(description='Split md file into plain text and code blocks')
  36     parser.add_argument('sourcefile',
  37                         help='which file to read')
  38     parser.add_argument('targetfile',
  39                         help='where to put plain text')
  40     parser.add_argument('codedir',
  41                         help='where to put codeblocks')
  42     args = parser.parse_args()
  43
  44     # ensure folder exists
  45     if not os.path.exists(args.codedir):
  46         os.makedirs(args.codedir)
  47
  48
  49     if os.path.exists(args.targetfile):
  50         os.remove(args.targetfile)
  51
  52     code_block_index = 0
  53     last_header = ''
  54     linenum = 0
  55     with io.open(args.sourcefile, 'r') as read_filehandle:
  56         with io.open(args.targetfile, 'w') as text_filehandle:
  57             for line in read_filehandle:
  58                 linenum += 1
  59                 indent_depth = is_code(line)
  60                 if indent_depth:
  61                     (line, linenum) = process_code(read_filehandle,
  62                                                     text_filehandle,
  63                                                     line, linenum,
  64                                                     args.sourcefile, args.codedir,
  65                                                     last_header, code_block_index,
  66                                                     indent_depth)
  67                     code_block_index += 1
  68                 # reach here either line was not code, or was code
  69                 # and we dealt with n code lines
  70                 if indent_depth < 4 or not is_code(line, indent_depth):
  71                     # store header id for codeblock
  72                     section_id = get_marker(line)
  73                     if section_id is not None:
  74                         code_block_index = 0
  75                         last_header = section_id
  76                     sline = stripped(line)
  77                     text_filehandle.write(sline)
  78
  79     assert line_length(args.sourcefile) == line_length(args.targetfile)
  80
  81
  82 def process_code(read_filehandle, text_filehandle, line, linenum, sourcefile, codedir, name, index, indent_depth):
  83     fenced = (line.strip() == '```')
  84     if fenced:
  85         try:
  86             line = read_filehandle.readLine()
  87             linenum += 1
  88             text_filehandle.write('\n')
  89         except StopIteration:
  90             return ('', linenum)
  91     start_linenum = linenum
  92     has_actual_code = False
  93     has_question_marks = False
  94     linebuffer = []
  95     while ((fenced and line.strip() != '```') or (not fenced and is_inside_code(line, indent_depth))):
  96         # copy comments to plain text for spell check
  97         comment_idx = line.find('//')
  98         no_comment_line = line
  99         if comment_idx >= 0:
 100             no_comment_line = line[:comment_idx].strip()
 101             text_filehandle.write(line[comment_idx + 2:])
 102         else:
 103             # write empty line so line numbers stay stable
 104             text_filehandle.write('\n')
 105
 106         if (not has_actual_code
 107             and not line.strip().startswith('//')
 108             and not line.strip().startswith('???')
 109             and not line.strip() == ''):
 110             has_actual_code = True
 111
 112         if (not line.strip() == '```'):
 113             if ('???' == no_comment_line or '...' == no_comment_line):
 114                 has_question_marks = True
 115             linebuffer.append(dedent(line, indent_depth) if not fenced else line)
 116         try:
 117             line = read_filehandle.readline()
 118             linenum += 1
 119         except StopIteration:
 120             line = ''
 121             break
 122     codefile = os.path.join(codedir, '%s%s.cpp' % (name, index))
 123     if fenced:
 124         text_filehandle.write('\n')
 125
 126     if (has_actual_code and not has_question_marks):
 127         linebuffer = clean_trailing_newlines(linebuffer)
 128         write_with_harness(codefile, sourcefile, start_linenum, linebuffer)
 129     return (line, linenum)
 130
 131
 132 def clean_trailing_newlines(linebuffer):
 133     result = []
 134     code_started = False
 135     linebuffer.reverse()
 136     for line in linebuffer:
 137         if not code_started and line == '\n':
 138             continue
 139         code_started = True
 140         result.append(line)
 141     result.reverse()
 142     return result
 143
 144
 145 def write_with_harness(codefile, sourcefile, start_linenum, linebuffer):
 146     '''write output with additional lines to make code likely compilable'''
 147     # add commonly used headers, so that lines can likely compile.
 148     # This is work in progress, the main issue remains handling class
 149     # declarations in in-function code differently
 150     with io.open(codefile, 'w') as code_filehandle:
 151         code_filehandle.write('''\
 152 #include<stdio.h>      // by md-split
 153 #include<stdlib.h>     // by md-split
 154 #include<tuple>        // by md-split
 155 #include<utility>      // by md-split
 156 #include<limits>       // by md-split
 157 #include<functional>   // by md-split
 158 #include<string>       // by md-split
 159 #include<map>          // by md-split
 160 #include<iostream>     // by md-split
 161 #include<vector>       // by md-split
 162 #include<algorithm>    // by md-split
 163 #include<memory>       // by md-split
 164 using namespace std;   // by md-split
 165 // %s : %s
 166 ''' % (sourcefile, start_linenum))
 167         # TODO: if not toplevel code, wrap inside class
 168         for codeline in linebuffer:
 169             code_filehandle.write(codeline)
 170
 171
 172 def is_code(line, indent_depth = 4):
 173     '''returns the indent depth, 0 means not code in markup'''
 174     if line.startswith(' ' * indent_depth):
 175         return len(line) - len(line.lstrip(' '))
 176     return 0
 177
 178 def is_inside_code(line, indent_depth):
 179     return is_code(line, indent_depth) > 0 or line.strip() == ''
 180
 181 def stripped(line):
 182     # Remove well-formed html tags, fixing mistakes by legitimate users
 183     sline = TAG_REGEX.sub('', line)
 184     sline = re.sub('[()\[\]#*]', ' ', line)
 185     return sline
 186
 187 def dedent(line, indent_depth):
 188     if line.startswith(' ' * indent_depth):
 189         return line[indent_depth:]
 190     if line.startswith('\t'):
 191         return line[1:]
 192     return line
 193
 194 def get_marker(line):
 195     matchlist = TAG_REGEX.findall(line)
 196     if matchlist:
 197         namematch = NAMED_A_TAG_REGEX.match(line)
 198         if namematch:
 199             return namematch.group(1) # group 0 is full match
 200
 201     return None
 202
 203 def line_length(filename):
 204     return sum(1 for line in open(filename))
 205
# Run the splitter only when this file is executed as a script.
if __name__ == '__main__':
    main()