conv-specchars

   1 #!/bin/bash
   2 # -*- mode: sh; coding: utf-8 -*-
   3 # $Date: 2008/03/01 06:26:19 $
   4
   5 # Do the matching for Latin alphabet (English), but in the UTF-8 encoding (for wide characters).
   6 LC_ALL=POSIX.UTF8 sed -e "# I use the doublequote because I use ' in the script; hence a backslash or dollar meant for sed needs an extra shell-level backslash.
   7 # NB: take care when writing this map that it should basically be injective
   8 # (except for unimportant variants like s^' and s'^): if it's not, it means
   9 # there is a mess in your transcription system, and you should fix your transcripts.
  10 # FIXME: Terrible bug: now, '\Fut' gives '\wˣut'. This is not elegantly
  11 # avoidable in this sed script, so I'll rewrite the program in another language. (The same applies to R, L, W, J, N right after a backslash.)
  12 #############################################
  13 # Very special combinations of symbols
  14 # (these transformations must come in the script before the other
  15 # transformations because the others are more general
  16 # and would match these special cases as well.)
  17 # Sometimes I wrote the phonetic variants of sʹ, zʹ:
  18 # (FIXME: when should I use sʹ and ś in the recoded script?)
  19 # (s^' can also represent Russian s^', as in 'es^'o'.)
  20 s|s'^|ś|g
  21 s|s^'|ś|g
  22 s|z'^|ź|g
  23 s|z^'|ź|g
  24 # possibly I could write the phonetic variant of t':
  25 s|c^'|ć|g # soft, of course
  26 s|c'^|ć|g
  27 # FIXME: How did I write an analogous variant of d'? E.g., the word for \`Теньгушево' after a voiced? Or is it hard?
  28 ################################################
  29 # Fricatives:
  30 s|s^|š|g # fricative
  31 s|c^|č|g # affricate, pronounced hard
  32 s|z^|ž|g # fricative
  33 s|Z^|ǯ|g # affricate
  34 # At last, the simplest combinations:
  35 #s|Z|ʒ|g # affricate -- Commented out: this sound does not seem to occur in Mordvin or Komi speech, so wherever I wrote it, it was by mistake. Leaving a plain 'Z' unconverted lets the capital-letter check below catch those errors.
  36 # (FIXME: What about affricates with the palatal sibilants: ź, ś? Z^'; and c^' is already present.)
  37 ###############################################
  38 # Vowels:
  39 s|@|ə|g # schwa (unclear vowel that can be pronounced by them with any quality)
  40 # FIXME: (Komi) try to put the Cyrillic-o-umlaut in the Cyrillic orthography; example:
  41 #  казьт воспоминание казьт@д
  42 #
  43 ###############################################
  44 # The voiceless:
  45 s|R|rˣ|g # the voiceless r
  46 s|L|lˣ|g # the voiceless l
  47 #s|F|wˣ|g # the voiceless w (sounds also like x, h)
  48 # -- disabled 'F' because of the '\Fut' bug. Actually,
  49 # 'W' for this sound looks more systematic in my system, so perhaps
  50 # I should have used it, shouldn't I?
  51 s|W|wˣ(?)|g # FIXME: sometimes I wrote 'W', is it the same sound?
  52 s|J|jˣ|g # the voiceless j (sounds also like x, h; soft? Perhaps, I didn't distinguish 2 sounds--soft and hard--when using this sign.)
  53 ##############################################
  54 # N:
  55 s|N|ŋ|g # velar or nasalization
  56 h # Save before the destructive checks (copy pattern space to hold space).
  57 ##########################################
  58 # Well-formedness of the input (is checked destructively):
  59 # - no ^'s should remain;
  60 # - other capitals should not have been used (no S^, no C^ and no plain capitals);
  61 # - more?
  62 s|\\\$Date.*\\\$|| # ignore the 'preamble': drop the Date keyword up to the last dollar sign on the line (both dollars are matched literally)
  63 s|\\\\url{.*}|| # ignore filenames (together with the leading backslash), which have been marked up like this: \url{Komi_rus_slovar1.pdf}
  64 /\^/ {
  65 w /dev/stderr
  66 q 56;} # exit status 56: a left-over ^ (mostly GNU sed extensions)
  67 /\(^\|[^\\]\)[[:upper:]]/ { # [^\\] -- a workaround for grammatical terms,
  68 # to escape these restrictions on Latin characters: write them like \Pres, \Gen etc.
  69 w /dev/stderr
  70 q 57;} # exit status 57: a left-over capital (mostly GNU sed extensions)
  71 g # Restore after the destructive checks (copy hold space to pattern space).
  72 "
  73
  74 # FIXME: Add a "keep-going" mode: detect all the well-formedness violations, and accumulate
  75 # all their error codes in the exitCode (say, as bits).
  76
  77 exitCode="$?"
  78 case "$exitCode" in
  79 56) echo $"-- error: non well-formed input (a left-over '^')!" >/dev/stderr
  80     ;;
  81 57) echo $"-- error: non well-formed input (a left-over capital Latin char)!" >/dev/stderr
  82     ;;
  83 esac
  84 exit "$exitCode"
  85
  86 # (Mordvin) Check that I have marked softness? E.g.:
  87 # |ti|t'i|
  88