manually fix n1570.html, fix x^y in cpow for n1256.html
[c-standard.git] / translit.sh
blobdce2b08d30e21a7a42df683c22f1a2d53b424d45
1 #!/bin/sh
3 # assumes a UTF-8 locale
4 # transliterates non-ASCII characters in the output of pdftotext -layout standard.pdf (anything left over becomes ?)
6 sed '
7 s/\f/(newpage)/g
8 # utf8 fixes
9 s/ﬁ/fi/g
10 s/ﬂ/fl/g
11 s/ﬀ/ff/g
12 s/ﬃ/ffi/g
13 s/§/!S/g
14 s/©/(C)/g
15 s/—/--/g
16 s/−/-/g
17 s/–/-/g
18 s/∗/*/g
19 s/ˆ/^/g
20 s/〈/</g
21 s/〉/>/g
22 s/⎡/[^/g
23 s/⎤/^]/g
24 s/⎣/[_/g
25 s/⎦/_]/g
26 s/⎢/[ /g
27 s/⎥/ ]/g
28 s/⎧/{/g
29 s/⎨/{/g
30 s/⎩/{/g
31 s/±/(+-)/g
32 s/≤/<=/g
33 s/≥/>=/g
34 s/≠/!=/g
35 s/Σ/(Sum)/g
36 s/√/(sqrt)/g
37 s/π/pi/g
38 s/∞/(inf)/g
39 s/ƒ/fl./g
40 s/∫/(integral)/g
41 s/Γ/(Gamma)/g
42 s/×/x/g
43 s/•/o/g
44 s/⎯/-/g
45 s/↑/(uparrow)/g
46 s/↓/(downarrow)/g
47 s/↔/<->/g
48 s/→/->/g
49 s/‘/'\''/g
50 s/’/'\''/g
51 s/“/"/g
52 s/”/"/g
53 s/∼/~/g
54 # pdftotext layout fixes
55 s/_ _/__/g
56 # pdftotext sometimes puts a space after the decimal point (1. 5): rejoin it (1.5)
57 s/\([0-9]\)\. \([0-9]\)/\1.\2/g
58 ' | LC_ALL=C tr -c '\n-~' '?' | awk '
59 BEGIN {
60 getline
61 last=$0
62 side=0
64 /^$/ {
65 nl=nl "\n"
66 next
68 # TODO: shift page numbers
69 #function inc(x) {
70 # if (x ~ /[0-9]/)
71 # return x+1
72 # if (sub(/viii$/,"ix",x) ||
73 # sub(/iii$/,"iv",x) ||
74 # sub(/iv$/,"v",x) ||
75 # sub(/ix$/,"x",x))
76 # return x
77 # return x "i"
79 /^\(newpage\)/ {
80 n=split(last,a)
81 if(side)
82 p=a[1]
83 else
84 p=a[n]
85 side=!side
86 # if (p !~ /[0-9]/ && $0 ~ /INTERNATIONAL STANDARD/)
87 # p=0
88 # print "\n[page " inc(p) "]"
89 print "\n[page " p "]"
90 getline
91 getline
92 last=$0
93 next
96 print last
97 last=nl $0
98 nl=""