annot fix: end pre before para
[c-standard.git] / translit.sh
blob d880571238bd0ca60a877469ad66bee3365d677d
#!/bin/sh
#
# ASCII transliteration filter.
#
# Input:  the output of `pdftotext -layout standard.pdf`, read from standard
#         input (sed is given no file operand).
# Locale: assumes a UTF-8 locale, so that sed sees each character listed
#         below as one character.
#
# Pipeline stages started in this block:
#   1. sed - rewrites known non-ASCII characters as ASCII stand-ins and
#            repairs a few pdftotext layout artifacts.
#   2. tr  - runs byte-wise (LC_ALL=C); every byte outside the range
#            newline..'~' becomes '?'.  A character that stage 1 does not
#            know therefore shows up as one '?' per UTF-8 byte, and tabs
#            become '?' as well.
#   3. awk - post-processing program whose text begins right after the
#            last line of this block.

sed '
# Turn form feeds into a visible marker; the awk stage matches on it.
# NOTE: \f as a form-feed escape is a sed extension (GNU sed accepts it);
# confirm support before running this with a different sed.
s/\f/(newpage)/g
# utf8 fixes
# Ligatures U+FB01 and U+FB02 expand to their two plain letters.
s/ﬁ/fi/g
s/ﬂ/fl/g
s/§/!S/g
s/©/(C)/g
s/—/--/g
s/−/-/g
s/∗/*/g
s/ˆ/^/g
# Angle brackets: U+2329 and U+232A plus their canonical equivalents
# U+3008 and U+3009, so the script works on either spelling.
s/〈/</g
s/〉/>/g
s/〈/</g
s/〉/>/g
# Pieces of tall square brackets (top, bottom, extension).
s/⎡/[^/g
s/⎤/^]/g
s/⎣/[_/g
s/⎦/_]/g
s/⎢/[ /g
s/⎥/ ]/g
# Pieces of a tall left curly bracket all collapse to one brace.
s/⎧/{/g
s/⎨/{/g
s/⎩/{/g
# Math symbols and Greek letters.
s/±/(+-)/g
s/≤/<=/g
s/≥/>=/g
s/≠/!=/g
s/Σ/(Sum)/g
s/√/(sqrt)/g
s/π/pi/g
s/∞/(inf)/g
s/ƒ/fl./g
s/∫/(integral)/g
s/Γ/(Gamma)/g
s/×/x/g
s/•/o/g
s/⎯/-/g
# Arrows.
s/↑/(uparrow)/g
s/↓/(downarrow)/g
s/↔/<->/g
s/→/->/g
# Curly single quotes become the ASCII apostrophe.
s/‘/'\''/g
s/’/'\''/g
# pdftotext layout fixes
# Rejoin underscores that were printed with a space between them.
s/_ _/__/g
# floats are sometimes broken: "1. 5" is rejoined as "1.5"
s/\([0-9]\)\. \([0-9]\)/\1.\2/g
' | LC_ALL=C tr -c '\n-~' '?' | awk '
53 BEGIN {
54 getline
55 last=$0
56 side=0
58 /^$/ {
59 nl=nl "\n"
60 next
62 # TODO: shift page numbers
63 #function inc(x) {
64 # if (x ~ /[0-9]/)
65 # return x+1
66 # if (sub(/viii$/,"ix",x) ||
67 # sub(/iii$/,"iv",x) ||
68 # sub(/iv$/,"v",x) ||
69 # sub(/ix$/,"x",x))
70 # return x
71 # return x "i"
73 /^\(newpage\)/ {
74 n=split(last,a)
75 if(side)
76 p=a[1]
77 else
78 p=a[n]
79 side=!side
80 # if (p !~ /[0-9]/ && $0 ~ /INTERNATIONAL STANDARD/)
81 # p=0
82 # print "\n[page " inc(p) "]"
83 print "\n[page " p "]"
84 getline
85 getline
86 last=$0
87 next
90 print last
91 last=nl $0
92 nl=""