2 # -*- mode: sh; coding: utf-8 -*-
3 # $Date: 2008/03/01 06:26:19 $
5 # Do the matching for Latin alphabet (English), but in the UTF-8 encoding (for wide characters).
6 LC_ALL
=POSIX.UTF8
sed -e "# I use the doublequote because I use ' in the scripts.
7 # NB: take care when writing this map that it should basically be injective
8 # (except for unimportant variants like s^' and s'^): if it's not, it means
9 # there is a mess in your transcription system, and you should fix your transcripts.
10 # FIXME: Terrible bug: now, '\Fut' gives '\wˣut'. This is not elegantly
11 # avoidable in this sed script, so I'll rewrite the program in another language.
12 #############################################
13 # Very special combinations of symbols
14 # (these transformations must come in the script before the other
15 # transformations, because the others are more general
16 # and would match these special cases as well.)
17 # Sometimes I wrote the phonetic variants of sʹ, zʹ:
18 # (FIXME: when should I use sʹ and when ś in the recoded script?)
19 # (s^' can also represent Russian s^', as in 'es^'o'.)
24 # possibly I could write the phonetic variant of t':
25 s|c^'|ć|g # soft, of course
27 # FIXME: How did I write an analogous variant of d'? E.g., the word for \`Теньгушево' after a voiced sound? Or is it hard?
28 ################################################
31 s|c^|č|g # affricate, pronounced hard
34 # At last, the simplest combinations:
35 #s|Z|ʒ|g # affricate -- Commented out, because this sound does not seem to occur in Mordvin or Komi speech, and I only wrote it by mistake. Leaving Z unconverted lets the capital-letter check below catch such errors.
36 # (FIXME: What about affricates with the palatal sibilants: ź, ś? Z^'; and c^' is already present.)
37 ###############################################
39 s|@|ə|g # schwa (unclear vowel that can be pronounced by them with any quality)
40 # FIXME: (Komi) try to put the Cyrillic-o-umlaut in the Cyrillic orthography; example:
41 # казьт воспоминание казьт@д
43 ###############################################
45 s|R|rˣ|g # the voiceless r
46 s|L|lˣ|g # the voiceless l
47 #s|F|wˣ|g # the voiceless w (sounds also like x, h)
48 # -- disabled 'F' because of the '\Fut' bug. Actually,
49 # 'W' for this sound looks more systematic in my system, so perhaps
50 # I should have used it, shouldn't I?
51 s|W|wˣ(?)|g # FIXME: sometimes I wrote 'W', is it the same sound?
52 s|J|jˣ|g # the voiceless j (sounds also like x, h; soft? Perhaps, I didn't distinguish 2 sounds--soft and hard--when using this sign.)
53 ##############################################
55 s|N|ŋ|g # velar or nasalization
56 h # Save before the destructive checks (copy pattern space to hold space).
57 ##########################################
58 # Well-formedness of the input (is checked destructively):
59 # - no ^'s should remain;
60 # - other capitals should not have been used (no S^, no C^ and no plain capitals);
62 s|\$Date.*\$|| # ignore the 'preamble'
63 s|\\url{.*}|| # ignore filenames, which have been marked up like this: \url{Komi_rus_slovar1.pdf}
66 q 56;} # (mostly GNU sed extensions)
67 /\(^\|[^\\]\)[[:upper:]]/ { # [^\\] -- a workaround for grammatical terms,
68 # to escape these restrictions on Latin characters: write them like \Pres, \Gen etc.
70 q 57;} # (mostly GNU sed extensions)
71 g # Restore after the destructive checks (copy/append hold space to pattern space).
74 # FIXME: A "keep-going" mode: detect all the well-formedness violations, and accumulate all
75 # the error codes in the exitCode (say, as bits).
79 56) echo $
"-- error: non well-formed input (a left-over '^')!" >/dev
/stderr
81 57) echo $
"-- error: non well-formed input (a left-over capital Latin char)!" >/dev
/stderr
86 # (Mordvin) Check that I have marked softness? E.g., :