pdftotext
[c-standard.git] / translit.sh
blob0bf85a05a3c524d549f0c71c11702cdb5c4510fd
1 #!/bin/sh
3 # Assumes a UTF-8 locale.
4 # Transliterates the output of `pdftotext -layout standard.pdf` (read on stdin) to ASCII; any byte still outside the newline..'~' range becomes '?'.
6 sed '
7 s/\f/(newpage)/g
8 # utf8 fixes
9 s/ﬁ/fi/g
10 s/ﬂ/fl/g
11 s/§/!S/g
12 s/©/(C)/g
13 s/—/--/g
14 s/−/-/g
15 s/∗/*/g
16 s/ˆ/^/g
17 s/〈/</g
18 s/〉/>/g
19 s/⎡/[^/g
20 s/⎤/^]/g
21 s/⎣/[_/g
22 s/⎦/_]/g
23 s/⎢/[ /g
24 s/⎥/ ]/g
25 s/⎧/{/g
26 s/⎨/{/g
27 s/⎩/{/g
28 s/±/(+-)/g
29 s/≤/<=/g
30 s/≥/>=/g
31 s/≠/!=/g
32 s/Σ/(Sum)/g
33 s/√/sqrt:/g
34 s/π/pi/g
35 s/∞/(inf)/g
36 s/ƒ/fl./g
37 s/∫/(integral)/g
38 s/Γ/(Gamma)/g
39 s/×/x/g
40 s/•/o/g
41 s/⎯/-/g
42 s/↑/(uparrow)/g
43 s/↓/(downarrow)/g
44 s/↔/(<->)/g
45 s/→/(->)/g
46 s/‘/'\''/g
47 s/’/'\''/g
48 # pdftotext layout fixes
49 s/_ _/__/g
50 s/\([0-9]\). \([0-9]\)/\1.\2/g
51 ' | LC_ALL=C tr -c '\n-~' '?' | awk '
52 BEGIN {
53 getline
54 last=$0
55 side=0
57 /^\(newpage\)/ {
58 n=split(last,a)
59 if(side)
60 p=a[1]
61 else
62 p=a[n]
63 side=!side
64 print "[page " p "]"
65 getline
66 getline
67 last=$0
68 next
71 print last
72 last=$0