scripts/syllabify.py

   1 # -*- coding: utf-8 -*-
   2 import re, sys, argparse
   3
   4 class Sign():
   5     """
   6     Represent a syllable constituent (a single alphabetical character),
   7     with other text (mute characters, punctuation, spaces, etc.) attached
   8     to it.
   9
  10     sign.get_char() gives the alphabetical syllable consituent.
  11     sign.get_text() gives the whole text attached to the sign
  12     """
  13     def __init__(self, c):
  14         self._sign = c
  15         self._text = ""
  16         self._word_end = False
  17         self._word_start = False
  18         self._forced_syllable_end = False
  19         self._forced_syllable_start = False
  20
  21     def add_text(self, str):
  22         self._text = "".join((self._text, str))
  23
  24     def set_forced_syllable_end(self):
  25         self._forced_syllable_end = True
  26
  27     def forced_syllable_end(self):
  28         return self._forced_syllable_end
  29
  30     def set_forced_syllable_start(self):
  31         self._forced_syllable_start = True
  32
  33     def forced_syllable_start(self):
  34         return self._forced_syllable_start
  35
  36     def word_end(self):
  37         return self._word_end
  38
  39     def set_word_end(self):
  40         self._word_end = True
  41
  42     def word_start(self):
  43         return self._word_start
  44
  45     def set_word_start(self):
  46         self._word_start = True
  47
  48     def get_char(self):
  49         return self._sign
  50
  51     def get_text(self):
  52         return self._text
  53
  54 class SignTokenizer():
  55     """
  56     Provides a method for build a list of signs from a decorated verse string.
  57     Usage:
  58       sign_tokenizer = SignTokenizer()
  59       signs = sign_tokenizer.tokenize("Un ver avec des décorations")
  60     signs being a list of Sign objects
  61
  62     The decorations can be:
  63      - "°" for grouping 'empty' words to 'full' words.
  64        Example:
  65          En°vain j'ay respecté la°celebre memoire
  66          Des°Heros des°siecles passez ;
  67        Can be overriden with word_separator_markers constructor keyword
  68
  69      - "*" for marking a mute letter (e.g. a 'h').
  70        Example:
  71          Et c'est l'*Hyver qui les°rassemble.
  72        Can be overriden with mute_character_marker constructor keyword
  73
  74      - "=" for forcing syllable ends, e.g. for marking a diaeresis.
  75        Example:
  76          Trop *heureux Phrygi=ens, venez icy l'attendre.
  77        Can be overriden with forced_syllable_end_marker constructor keyword
  78
  79      - other unused markers: < > { }
  80        Can be overriden with ignored_markers constructor keyword
  81     """
  82     def __init__(self,
  83                  language = "fr",
  84                  word_separators = " -",
  85                  word_separator_markers = "°",
  86                  simple_punctuations = ".,",
  87                  double_punctuations = ":;?!",
  88                  apostrophes = "'’",
  89                  forced_syllable_end_marker = "=",
  90                  mute_character_marker = "*",
  91                  ignored_markers = "<>{}",
  92                  ignored_characters = "[]()|/~_"
  93                  ):
  94         self.word_separators = word_separators
  95         self.word_separator_markers = word_separator_markers
  96         self.all_word_separators = "".join((word_separators,
  97                                             word_separator_markers))
  98         self.simple_punctuations = simple_punctuations
  99         self.double_punctuations = double_punctuations
 100         self.space_before_double_punctuations = (language == "fr")
 101         self.apostrophes = apostrophes
 102         self.forced_syllable_end_marker = forced_syllable_end_marker
 103         self.mute_character_marker = mute_character_marker
 104         self.ignored_markers = ignored_markers
 105         self.ignored_characters = ignored_characters
 106         self.punctuation_re = re.compile(
 107             " *([{}{}])".format(self.simple_punctuations,
 108                               self.double_punctuations))
 109         self.et_re = re.compile("([Ee]t)({})".format(
 110                 "|".join(self.all_word_separators)))
 111
 112     def _reset(self):
 113         self._prefix = ""
 114         self._current_sign = None
 115         self._signs = []
 116
 117     def _add_sign(self, c):
 118         self._current_sign = Sign(c.lower())
 119         self._signs.append(self._current_sign)
 120         if self._prefix != "":
 121             self._current_sign.add_text(self._prefix)
 122             self._prefix = ""
 123
 124     def _add_prefix(self, prefix):
 125         self._prefix = "".join((self._prefix, prefix))
 126
 127     def _add_text(self, text):
 128         self._current_sign.add_text(text)
 129
 130     def _set_forced_syllable_end(self):
 131         self._current_sign.set_forced_syllable_end()
 132
 133     def _set_word_end(self):
 134         self._current_sign.set_word_end()
 135
 136     def tokenize(self, verse_text):
 137         self._reset()
 138         sign_count = len(verse_text)
 139         i = 0
 140         mute_next = False
 141         word_start = True
 142         while (i < sign_count):
 143             c = verse_text[i]
 144             punctuation_match = self.punctuation_re.match(verse_text[i:])
 145             ## Markers: they are not real text
 146             # forced syllable end marker
 147             if c == self.forced_syllable_end_marker:
 148                 self._set_forced_syllable_end()
 149                 i += 1
 150             # mute character marker
 151             elif c == self.mute_character_marker:
 152                 i += 1
 153                 mute_next = True
 154             # ignored markers
 155             elif c in self.ignored_markers:
 156                 i += 1
 157             ## Actual text
 158             # apostroph
 159             elif c in self.apostrophes:
 160                 self._add_text("’")
 161                 i += 1
 162             # punctuation
 163             elif punctuation_match:
 164                 punct = punctuation_match.group(1)
 165                 if self.space_before_double_punctuations and punct in self.double_punctuations:
 166                     self._add_text("\u00A0")
 167                 self._add_text(punct)
 168                 i += len(punctuation_match.group(0))
 169                 self._set_word_end()
 170                 word_start = True
 171             # word separator
 172             elif c in self.all_word_separators:
 173                 self._set_word_end()
 174                 word_start = True
 175                 if c in self.word_separator_markers:
 176                     self._add_text(" ")
 177                 else:
 178                     self._add_text(c)
 179                 i += 1
 180             # ignored characters
 181             elif c in self.ignored_characters:
 182                 self._add_text(c)
 183                 i += 1
 184             # consonant or vowel
 185             else:
 186                 if mute_next:
 187                     self._add_prefix(c)
 188                     mute_next = False
 189                     i += 1
 190                 else:
 191                     m = word_start and self.et_re.match(verse_text[i:])
 192                     if m:
 193                         # special case: et -> &
 194                         self._add_sign("&")
 195                         self._add_text(m.group(1))
 196                         self._add_text(" ")
 197                         self._set_word_end()
 198                         word_start = True
 199                         i += len(m.group(0))
 200                     else:
 201                         # consonant or vowel
 202                         self._add_sign(c)
 203                         self._add_text(c)
 204                         word_start = False
 205                         i += 1
 206         # the last character is at word end and syllable end
 207         self._set_word_end()
 208         self._set_forced_syllable_end()
 209         # set word_start and forced_syllable_start for characters
 210         # following a word end or forced_syllable_end
 211         at_word_start = True
 212         at_syllable_start = True
 213         for sign in self._signs:
 214             if at_word_start:
 215                 sign.set_word_start()
 216             if at_syllable_start:
 217                 sign.set_forced_syllable_start()
 218             at_word_start = sign.word_end()
 219             at_syllable_start = sign.forced_syllable_end()
 220         return self._signs
 221
 222     def get_chars(self):
 223         return "".join([c.get_char() for c in self._signs])
 224
 225     def get_full_verse(self):
 226         return "".join([c.get_text() for c in self._signs])
 227
 228
 229 class Syllable():
 230     """
 231     Represents a syllable, consisting in a list of signs.
 232     """
 233     def __init__(self):
 234         self._signs = []
 235
 236     def add_sign(self, sign):
 237         self._signs.append(sign)
 238
 239     def add_signs(self, signs):
 240         self._signs.extend(signs)
 241
 242     def get_signs(self):
 243         return self._signs
 244
 245     def set_signs(self, signs):
 246         self._signs = signs
 247
 248     def get_text(self):
 249         return "".join([sign.get_text() for sign in self._signs])
 250
 251     def get_chars(self):
 252         return "".join([sign.get_char() for sign in self._signs])
 253
 254     def is_empty(self):
 255         return not self._signs
 256
 257     def at_word_start(self):
 258         return self._signs[0].word_start()
 259
 260     def at_word_end(self):
 261         return self._signs[-1].word_end()
 262
 263     def is_feminine(self):
 264         """
 265         A syllable is feminine iff:
 266         - it is placed at word end
 267         - it contains exactly one vowel, which is 'e' or 'ë', at the end
 268         (with possibly a final s)
 269         -
 270         """
 271         if self.at_word_end():
 272             chars = "".join([sign.get_char() for sign in self._signs])
 273             # special cases:
 274             # exact words: ces, mes, ses, tes, les, des, es
 275             # have no feminine e
 276             if (self.at_word_start()
 277                 and re.match("^[cmstld]?es$", chars)):
 278                 return False
 279             vowels = ""
 280             for char in chars:
 281                 if char in "aàâäeëéèêœiìïîoôòuùûüy&":
 282                     vowels = "".join((vowels, char))
 283             return not not (
 284                 # only one vowel: e or ë, and word ends with -e or -es
 285                 ((vowels == "e" or vowels == "ë")
 286                  and (vowels == chars[-1] or (vowels + "s") == chars[-2:]))
 287                 # two vowels: "que?" or "gues?"
 288                 or ((vowels == "ue" or vowels == "uë")
 289                     and re.search("[qg]u[eë]s?", chars)))
 290         return False
 291
 292
 293 class SyllableTokenizer():
 294     """
 295     Provides a method for build a list of syllables from a list of signs.
 296     Usage:
 297       sign_tokenizer = SignTokenizer()
 298       syllable_tokenizer = SyllableTokenizer()
 299       signs = sign_tokenizer.tokenize("Un ver avec des décorations")
 300       syllables = syllable_tokenizer.tokenize(signs)
 301     syllables being a list of Syllable objects
 302     """
 303     def __init__(self,
 304                  e_vowels = "eë",
 305                  other_vowels = "aàâäéèêœiìïîoôòuùûüy&",
 306                  consonants_sonority_levels = { 'liquid' : "lrh",
 307                                                 'nasal' : "mn",
 308                                                 'constrictive' : "çfjsvxz",
 309                                                 'occlusive' : "bcdgkpqt" }
 310                  ):
 311         self.e_vowels = e_vowels
 312         self.other_vowels = other_vowels
 313         self.vowels = "".join((e_vowels, other_vowels))
 314         self.consonants_sonority_levels = consonants_sonority_levels
 315         self.consonants = "".join(consonants_sonority_levels.values())
 316         self._reset()
 317         self.re = {
 318             # [something][vowel (no feminine e)]<space>[vowel]
 319             'hiatus' : ".[{}][{}]".format(self.other_vowels, self.vowels),
 320             # <word start>s[cçpt][vowel]
 321             '^sca' : "s[cçpt][{}]".format(self.vowels),
 322             # <word start>s[cp][lr][vowel]
 323             '^scla' : "s[cp][lr][{}]".format(self.vowels),
 324             # <word start>ps[vowel]
 325             '^psa' : "ps[{}]".format(self.vowels),
 326             # gn[vowel]
 327             'gna' : "gn[{}]".format(self.vowels),
 328             # [occlusive bcdgkpqt or constrictive çfjvxz][liquid lrh][vowel]
 329             'bla' : "[{}{}][{}][{}]".format(
 330                 self.consonants_sonority_levels['occlusive'],
 331                 self.consonants_sonority_levels['constrictive'].replace("s", ""),
 332                 self.consonants_sonority_levels['liquid'],
 333                 self.vowels),
 334             # [tpc]h[rl][vowel]
 335             'thra' : "[tpc]h[rl][{}]".format(self.vowels),
 336             # [consonant][vowel]
 337             'ba' : "[{}][{}]".format(self.consonants, self.vowels),
 338             }
 339         self.compiled_re = {}
 340         for (key, string) in self.re.items():
 341             self.compiled_re[key] = re.compile(string)
 342         self._match_data = None
 343
 344     def _match(self, re_key, text):
 345         self._match_data = self.compiled_re[re_key].match(text)
 346         return self._match_data
 347
 348     def _get_match_data(self):
 349         return self._match_data
 350
 351     def _reset(self):
 352         self._syllables = []
 353         self._current_syllable = None
 354         self._first_syllable = Syllable()
 355
 356     def _start_new_syllable(self):
 357         if (self._first_syllable and not self._first_syllable.is_empty()):
 358             self._syllables.append(self._first_syllable)
 359         if not (self._current_syllable
 360                 and self._current_syllable.is_empty()):
 361             self._current_syllable = Syllable()
 362             self._syllables.append(self._current_syllable)
 363         self._first_syllable = None
 364
 365     def _add_sign(self, text):
 366         if self._first_syllable:
 367             self._first_syllable.add_sign(text)
 368         else:
 369             self._current_syllable.add_sign(text)
 370
 371     def get_syllables(self):
 372         return self._syllables
 373
 374     def tokenize(self, signs):
 375         self._reset()
 376         verse_text = "".join([sign.get_char() for sign in signs])
 377         sign_count = len(signs)
 378         i = 0
 379         while (i < sign_count):
 380             word_start = signs[i].word_start()
 381             # forced syllable ends
 382             if (i > 0 and signs[i].forced_syllable_start()):
 383                 self._start_new_syllable()
 384
 385             # Hiatus
 386             # ^[vowel]<space>
 387             if (i == 0
 388                   and verse_text[i] in self.vowels
 389                   and signs[i].word_end()):
 390                 self._add_sign(signs[i])
 391                 i += 1
 392                 self._start_new_syllable()
 393             # [something][vowel (no feminine e)]<space>[vowel]
 394             elif (self._match('hiatus', verse_text[i:])
 395                   and signs[i+1].word_end()):
 396                 self._add_sign(signs[i])
 397                 self._add_sign(signs[i+1])
 398                 self._start_new_syllable()
 399                 self._add_sign(signs[i+2])
 400                 i += 3
 401             elif (
 402                 # <word start>s[cçpt][vowel]
 403                 (word_start and self._match('^sca', verse_text[i:])
 404                  and not signs[i].word_end())
 405                 # <word start>s[cp][lr][vowel]
 406                 or (word_start and self._match('^scla', verse_text[i:])
 407                     and not signs[i].word_end()
 408                     and not signs[i+1].word_end())
 409                 # <word start>ps[vowel]
 410                 or (word_start and self._match('^psa', verse_text[i:]))
 411                 # gn[vowel]
 412                 or (self._match('gna', verse_text[i:])
 413                     and not signs[i].word_end())
 414                 # [bcdgkpqtçfjvxz][lrh][vowel]
 415                 or (self._match('bla', verse_text[i:])
 416                     and not signs[i].word_end())
 417                 # [tpc]h[rl][vowel]
 418                 or (self._match('thra', verse_text[i:])
 419                     and not signs[i+1].word_end())
 420                 # [consonant][vowel]
 421                 or self._match('ba', verse_text[i:])
 422                 ):
 423                 match = self._get_match_data().group(0)
 424                 self._start_new_syllable()
 425                 for x in match:
 426                     self._add_sign(signs[i])
 427                     i += 1
 428             else:
 429                 self._add_sign(signs[i])
 430                 i += 1
 431         return self.get_syllables()
 432
 433
 434 class SyllableTokenizerWithWordSeparation(SyllableTokenizer):
 435     """
 436     A specialized SyllableTokenizer which preferes syllable
 437     breaking between words when possible.  For instance:
 438
 439       "tant attendu"
 440       gives:  tant / at / ten / du
 441       iso:    tan / t at / ten / du
 442
 443     This is useful when breaking verses for lyrics.
 444
 445     Usage:
 446       sign_tokenizer = SignTokenizer()
 447       syllable_tokenizer = SyllableTokenizerWithWordSeparation()
 448       signs = sign_tokenizer.tokenize("Un ver avec des décorations")
 449       syllables = syllable_tokenizer.tokenize(signs)
 450     syllables being a list of Syllable objects
 451     """
 452     def force_word_separation(self, syllables = None):
 453         syllables = syllables or self._syllables
 454         syllable_count = len(syllables)
 455         prev_syllable = syllables[0]
 456         for this_syllable in syllables[1:]:
 457             signs = this_syllable.get_signs()
 458             if not signs[0].word_start() and signs[1:]:
 459                 tokens_count = len(signs)
 460                 i = 1
 461                 while (not signs[i].word_start()
 462                        or not signs[i].get_char() in self.vowels):
 463                     i += 1
 464                     if i == tokens_count:
 465                         break
 466                 else:
 467                     # we found a vowel at word start at index i
 468                     # signs from indices 0 to i-1 go to the previous syllable
 469                     prev_syllable.add_signs(signs[0:i])
 470                     this_syllable.set_signs(signs[i:])
 471             prev_syllable = this_syllable
 472         return syllables
 473
 474     def tokenize(self, signs):
 475         SyllableTokenizer.tokenize(self, signs)
 476         return self.force_word_separation()
 477
 478 class Verse():
 479     """
 480     A verse
 481
 482     Usage:
 483       verse = Verse("Un ver avec des décorations")
 484       # possible pass sign and syllable tokenizers to split:
 485       verse.split()
 486       verse.get_syllables()
 487       => ["Un ", "ve", "r a", "vec ", "des ", "dé", "co", "ra", "tions"]
 488
 489     """
 490     def __init__(self, text, lineno = None):
 491         self._text = text
 492         self._syllables = []
 493         self._lineno = lineno
 494
 495     def get_syllables(self):
 496         return [syll.get_text() for syll in self._syllables]
 497
 498     def get_text(self):
 499         return "".join([syll.get_text() for syll in self._syllables])
 500
 501     def syllabify(self,
 502               sign_tokenizer = SignTokenizer(),
 503               syllable_tokenizer = SyllableTokenizer()
 504               ):
 505         self._syllables = syllable_tokenizer.tokenize(
 506             sign_tokenizer.tokenize(self._text))
 507
 508     def get_metric(self):
 509         return len(self._syllables) - (1 if self._syllables[-1].is_feminine() else 0)
 510
 511     def hyphenate(self, hyphen = "-", add_space = False):
 512         syllables = []
 513         i = 0
 514         count = len(self._syllables)
 515         for syllable in self._syllables:
 516             if (i > 0) and not syllable.at_word_start():
 517                 syllables.append(hyphen)
 518             text = syllable.get_text()
 519             syllables.append(text)
 520             if add_space:
 521                 verse_end = (i == count - 1)
 522                 # if syllable is word end and do not end with a space,
 523                 # add it (unless at verse end)
 524                 if (not verse_end
 525                     and syllable.at_word_end()
 526                     and text[-1] != " "):
 527                     syllables.append(" ")
 528             i += 1
 529         return "".join(syllables)
 530
 531 class Corpus():
 532     """
 533     A corpus, consisting of verses.
 534
 535     Example:
 536     To generate LilyPond lyrics (where syllables in a word are separated
 537     with " -- ")
 538
 539       corpus = Corpus()
 540       corpus.add_verse(["premier ver", "second ver..."])
 541       corpus.syllabify(syllable_tokenizer = SyllableTokenizerWithWordSeparation())
 542       corpus.get_hyphenated_verses(hyphen = " -- ")
 543       => ["pre -- mier ver", "se -- cond ver..."]
 544     """
 545     def __init__(self, filename = None):
 546         self._verses = []
 547         self._filename = filename
 548
 549     def add_verse(self, verse, lineno = None):
 550         """
 551         Add verse (a string) to the corpus.
 552         """
 553         self._verses.append(Verse(verse, lineno))
 554
 555     def get_verses(self):
 556         return self._verses
 557
 558     def syllabify(self,
 559                   sign_tokenizer = SignTokenizer(),
 560                   syllable_tokenizer = SyllableTokenizer()):
 561         """
 562         Syllabify all the corpus verses.
 563         """
 564         for verse in self._verses:
 565             verse.syllabify(sign_tokenizer, syllable_tokenizer)
 566
 567     def get_hyphenated_verses(self, hyphen = "-", add_space = False):
 568         """
 569         Return the hyphenated verses (list of strings) contained in the
 570         corpus.
 571         Corpus.syllabify() is supposed to have been called before.
 572         """
 573         return [verse.hyphenate(hyphen, add_space)
 574                 for verse in self._verses]
 575
 576 class CorpusReader():
 577
 578     def read(self, filename = "-"):
 579         """
 580         Read a corpus file (or stdin if filename is "-")
 581         and produce a Corpus object.
 582         """
 583         file = open(filename, 'r') if (filename != "-") else sys.stdin
 584         corpus = Corpus(filename)
 585         lineno = 0
 586         for line in file:
 587             line = line.strip()
 588             lineno += 1
 589             # skip empty lines
 590             if line == "":
 591                 pass
 592             # skip comments
 593             elif re.match(r"^//", line):
 594                 # TODO: do something
 595                 pass
 596             # TODO: titling directives
 597             elif re.match(r"^#", line):
 598                 pass
 599             # a verse
 600             else:
 601                 # verse format:
 602                 # verse text TAB+ [properties]
 603                 # where properties can be:
 604                 #   [LB]+  breve/long syllables indicators
 605                 #   [AT]+  schema (?)
 606                 #   R      "refrain"
 607                 #   D      "double"
 608                 #   other  lilypond code
 609                 # for now, we only keep the verse text itself
 610                 text = re.sub(r"([^\t]+)\t.*$", r"\1", line)
 611                 corpus.add_verse(text, lineno)
 612         file.close()
 613         return corpus
 614
 615 def main():
 616     """
 617     Syllabify and print verses.
 618     """
 619     parser = argparse.ArgumentParser(
 620         description='Verse syllabication.',
 621         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
 622     parser.add_argument(
 623         '--verse',
 624         metavar="words",
 625         nargs='+',
 626         help='verse words to syllabify (if no corpus is provided)')
 627     parser.add_argument(
 628         '--corpus',
 629         help="Corpus file to syllabify.  Use - for reading from stdin")
 630     parser.add_argument(
 631         '--hyphen',
 632         default=" -- ",
 633         help="String to be used when hyphenating a verse.")
 634     parser.add_argument(
 635         '--format',
 636         default="{hyphenated_verse}",
 637         help="""Python format string for outputing the verse.
 638 Possible keywords, to be used between curly braces in the format string,
 639 are
 640 *) hyphenated_verse: the verse after applying hyphenation
 641 *) verse: the verse without hyphenation
 642 *) metric: the verse metric (a number).""")
 643     args = vars(parser.parse_args())
 644
 645     if args['corpus']:
 646         # Syllabify a corpus
 647         reader = CorpusReader()
 648         corpus = reader.read(args['corpus'])
 649         corpus.syllabify(
 650             syllable_tokenizer = SyllableTokenizerWithWordSeparation())
 651         for verse in corpus.get_verses():
 652             hyphenated_verse = verse.hyphenate(hyphen = args['hyphen'],
 653                                                add_space = True)
 654             print(args['format'].format(verse = verse.get_text(),
 655                                         hyphenated_verse = hyphenated_verse,
 656                                         metric = verse.get_metric()))
 657     elif args['verse']:
 658         # read verse on command line arguments
 659         verse = Verse(" ".join(args['verse']))
 660         verse.syllabify(
 661             syllable_tokenizer = SyllableTokenizerWithWordSeparation())
 662         hyphenated_verse = verse.hyphenate(hyphen = args['hyphen'], add_space = True)
 663         print(args['format'].format(verse = verse.get_text(),
 664                                     hyphenated_verse = hyphenated_verse,
 665                                     metric = verse.get_metric()))
 666     else:
 667         parser.print_help()
 668
# Run the command line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()