1 # -*- coding: utf-8 -*-
2 import re
, sys
, argparse
6 Represent a syllable constituent (a single alphabetical character),
7 with other text (mute characters, punctuation, spaces, etc.) attached
10 sign.get_char() gives the alphabetical syllable constituent.
11 sign.get_text() gives the whole text attached to the sign
13 def __init__(self
, c
):
16 self
._word
_end
= False
17 self
._word
_start
= False
18 self
._forced
_syllable
_end
= False
19 self
._forced
_syllable
_start
= False
def add_text(self, str):
    """
    Append `str` to the text attached to this sign.

    The attached text is what get_text() is documented to return
    (the character plus mute characters, punctuation, spaces, etc.).
    """
    self._text = self._text + str
def set_forced_syllable_end(self):
    """
    Flag this sign as a forced syllable end.
    """
    self._forced_syllable_end = True
def forced_syllable_end(self):
    """
    Tell whether this sign has been flagged as a forced syllable end.
    """
    return self._forced_syllable_end
def set_forced_syllable_start(self):
    """
    Flag this sign as a forced syllable start.
    """
    self._forced_syllable_start = True
def forced_syllable_start(self):
    """
    Tell whether this sign has been flagged as a forced syllable start.
    """
    return self._forced_syllable_start
39 def set_word_end(self
):
43 return self
._word
_start
def set_word_start(self):
    """
    Flag this sign as being at the start of a word.
    """
    self._word_start = True
54 class SignTokenizer():
56 Provides a method for building a list of signs from a decorated verse string.
58 sign_tokenizer = SignTokenizer()
59 signs = sign_tokenizer.tokenize("Un ver avec des décorations")
60 signs being a list of Sign objects
62 The decorations can be:
63 - "°" for grouping 'empty' words to 'full' words.
65 En°vain j'ay respecté la°celebre memoire
66 Des°Heros des°siecles passez ;
67 Can be overridden with word_separator_markers constructor keyword
69 - "*" for marking a mute letter (e.g. a 'h').
71 Et c'est l'*Hyver qui les°rassemble.
72 Can be overridden with mute_character_marker constructor keyword
74 - "=" for forcing syllable ends, e.g. for marking a diaeresis.
76 Trop *heureux Phrygi=ens, venez icy l'attendre.
77 Can be overridden with forced_syllable_end_marker constructor keyword
79 - other unused markers: < > { }
80 Can be overridden with ignored_markers constructor keyword
84 word_separators
= " -",
85 word_separator_markers
= "°",
86 simple_punctuations
= ".,",
87 double_punctuations
= ":;?!",
89 forced_syllable_end_marker
= "=",
90 mute_character_marker
= "*",
91 ignored_markers
= "<>{}",
92 ignored_characters
= "[]()|/~_"
94 self
.word_separators
= word_separators
95 self
.word_separator_markers
= word_separator_markers
96 self
.all_word_separators
= "".join((word_separators
,
97 word_separator_markers
))
98 self
.simple_punctuations
= simple_punctuations
99 self
.double_punctuations
= double_punctuations
100 self
.space_before_double_punctuations
= (language
== "fr")
101 self
.apostrophes
= apostrophes
102 self
.forced_syllable_end_marker
= forced_syllable_end_marker
103 self
.mute_character_marker
= mute_character_marker
104 self
.ignored_markers
= ignored_markers
105 self
.ignored_characters
= ignored_characters
106 self
.punctuation_re
= re
.compile(
107 " *([{}{}])".format(self
.simple_punctuations
,
108 self
.double_punctuations
))
109 self
.et_re
= re
.compile("([Ee]t)({})".format(
110 "|".join(self
.all_word_separators
)))
114 self
._current
_sign
= None
117 def _add_sign(self
, c
):
118 self
._current
_sign
= Sign(c
.lower())
119 self
._signs
.append(self
._current
_sign
)
120 if self
._prefix
!= "":
121 self
._current
_sign
.add_text(self
._prefix
)
def _add_prefix(self, prefix):
    """
    Append `prefix` to the pending prefix text.

    The pending prefix is the text that _add_sign() attaches to the
    next sign it creates.
    """
    self._prefix = self._prefix + prefix
def _add_text(self, text):
    """
    Attach `text` to the sign currently being built.
    """
    current = self._current_sign
    current.add_text(text)
def _set_forced_syllable_end(self):
    """
    Flag the sign currently being built as a forced syllable end.
    """
    current = self._current_sign
    current.set_forced_syllable_end()
def _set_word_end(self):
    """
    Flag the sign currently being built as a word end.
    """
    current = self._current_sign
    current.set_word_end()
136 def tokenize(self
, verse_text
):
138 sign_count
= len(verse_text
)
142 while (i
< sign_count
):
144 punctuation_match
= self
.punctuation_re
.match(verse_text
[i
:])
145 ## Markers: they are not real text
146 # forced syllable end marker
147 if c
== self
.forced_syllable_end_marker
:
148 self
._set
_forced
_syllable
_end
()
150 # mute character marker
151 elif c
== self
.mute_character_marker
:
155 elif c
in self
.ignored_markers
:
159 elif c
in self
.apostrophes
:
163 elif punctuation_match
:
164 punct
= punctuation_match
.group(1)
165 if self
.space_before_double_punctuations
and punct
in self
.double_punctuations
:
166 self
._add
_text
("\u00A0")
167 self
._add
_text
(punct
)
168 i
+= len(punctuation_match
.group(0))
172 elif c
in self
.all_word_separators
:
175 if c
in self
.word_separator_markers
:
181 elif c
in self
.ignored_characters
:
191 m
= word_start
and self
.et_re
.match(verse_text
[i
:])
193 # special case: et -> &
195 self
._add
_text
(m
.group(1))
206 # the last character is at word end and syllable end
208 self
._set
_forced
_syllable
_end
()
209 # set word_start and forced_syllable_start for characters
210 # following a word end or forced_syllable_end
212 at_syllable_start
= True
213 for sign
in self
._signs
:
215 sign
.set_word_start()
216 if at_syllable_start
:
217 sign
.set_forced_syllable_start()
218 at_word_start
= sign
.word_end()
219 at_syllable_start
= sign
.forced_syllable_end()
223 return "".join([c
.get_char() for c
in self
._signs
])
def get_full_verse(self):
    """
    Return the concatenation of the text attached to every sign,
    in sign order.
    """
    pieces = []
    for sign in self._signs:
        pieces.append(sign.get_text())
    return "".join(pieces)
231 Represents a syllable, consisting in a list of signs.
def add_sign(self, sign):
    """
    Append a single sign at the end of this syllable.
    """
    self._signs.append(sign)
def add_signs(self, signs):
    """
    Append several signs, in order, at the end of this syllable.
    """
    self._signs.extend(signs)
245 def set_signs(self
, signs
):
249 return "".join([sign
.get_text() for sign
in self
._signs
])
252 return "".join([sign
.get_char() for sign
in self
._signs
])
255 return not self
._signs
def at_word_start(self):
    """
    Tell whether the first sign of this syllable is at a word start.
    """
    first_sign = self._signs[0]
    return first_sign.word_start()
def at_word_end(self):
    """
    Tell whether the last sign of this syllable is at a word end.
    """
    last_sign = self._signs[-1]
    return last_sign.word_end()
263 def is_feminine(self
):
265 A syllable is feminine iff:
266 - it is placed at word end
267 - it contains exactly one vowel, which is 'e' or 'ë', at the end
268 (with possibly a final s)
271 if self
.at_word_end():
272 chars
= "".join([sign
.get_char() for sign
in self
._signs
])
274 # exact words: ces, mes, ses, tes, les, des, es
276 if (self
.at_word_start()
277 and re
.match("^[cmstld]?es$", chars
)):
281 if char
in "aàâäeëéèêœiìïîoôòuùûüy&":
282 vowels
= "".join((vowels
, char
))
284 # only one vowel: e or ë, and word ends with -e or -es
285 ((vowels
== "e" or vowels
== "ë")
286 and (vowels
== chars
[-1] or (vowels
+ "s") == chars
[-2:]))
287 # two vowels: "que?" or "gues?"
288 or ((vowels
== "ue" or vowels
== "uë")
289 and re
.search("[qg]u[eë]s?", chars
)))
293 class SyllableTokenizer():
295 Provides a method for building a list of syllables from a list of signs.
297 sign_tokenizer = SignTokenizer()
298 syllable_tokenizer = SyllableTokenizer()
299 signs = sign_tokenizer.tokenize("Un ver avec des décorations")
300 syllables = syllable_tokenizer.tokenize(signs)
301 syllables being a list of Syllable objects
305 other_vowels
= "aàâäéèêœiìïîoôòuùûüy&",
306 consonants_sonority_levels
= { 'liquid' : "lrh",
308 'constrictive' : "çfjsvxz",
309 'occlusive' : "bcdgkpqt" }
311 self
.e_vowels
= e_vowels
312 self
.other_vowels
= other_vowels
313 self
.vowels
= "".join((e_vowels
, other_vowels
))
314 self
.consonants_sonority_levels
= consonants_sonority_levels
315 self
.consonants
= "".join(consonants_sonority_levels
.values())
318 # [something][vowel (no feminine e)]<space>[vowel]
319 'hiatus' : ".[{}][{}]".format(self
.other_vowels
, self
.vowels
),
320 # <word start>s[cçpt][vowel]
321 '^sca' : "s[cçpt][{}]".format(self
.vowels
),
322 # <word start>s[cp][lr][vowel]
323 '^scla' : "s[cp][lr][{}]".format(self
.vowels
),
324 # <word start>ps[vowel]
325 '^psa' : "ps[{}]".format(self
.vowels
),
327 'gna' : "gn[{}]".format(self
.vowels
),
328 # [occlusive bcdgkpqt or constrictive çfjvxz][liquid lrh][vowel]
329 'bla' : "[{}{}][{}][{}]".format(
330 self
.consonants_sonority_levels
['occlusive'],
331 self
.consonants_sonority_levels
['constrictive'].replace("s", ""),
332 self
.consonants_sonority_levels
['liquid'],
335 'thra' : "[tpc]h[rl][{}]".format(self
.vowels
),
337 'ba' : "[{}][{}]".format(self
.consonants
, self
.vowels
),
339 self
.compiled_re
= {}
340 for (key
, string
) in self
.re
.items():
341 self
.compiled_re
[key
] = re
.compile(string
)
342 self
._match
_data
= None
def _match(self, re_key, text):
    """
    Match `text` (from its beginning) against the compiled regular
    expression registered under `re_key`.

    The match object (or None) is stored so that it can be retrieved
    later with _get_match_data(), and is also returned.
    """
    pattern = self.compiled_re[re_key]
    self._match_data = pattern.match(text)
    return self._match_data
def _get_match_data(self):
    """
    Return the result stored by the latest call to _match()
    (a match object, or None).
    """
    return self._match_data
353 self
._current
_syllable
= None
354 self
._first
_syllable
= Syllable()
def _start_new_syllable(self):
    """
    Close the syllable being built and open a new one.

    A non-empty first syllable is appended to the syllable list
    before being discarded. A new current syllable is created and
    appended, unless the current one exists and is still empty.
    """
    first = self._first_syllable
    if first and not first.is_empty():
        self._syllables.append(first)
    current = self._current_syllable
    # open a new syllable unless the current one is still unused
    if not current or not current.is_empty():
        self._current_syllable = Syllable()
        self._syllables.append(self._current_syllable)
    self._first_syllable = None
365 def _add_sign(self
, text
):
366 if self
._first
_syllable
:
367 self
._first
_syllable
.add_sign(text
)
369 self
._current
_syllable
.add_sign(text
)
def get_syllables(self):
    """
    Return the list of syllables held by this tokenizer.
    """
    return self._syllables
374 def tokenize(self
, signs
):
376 verse_text
= "".join([sign
.get_char() for sign
in signs
])
377 sign_count
= len(signs
)
379 while (i
< sign_count
):
380 word_start
= signs
[i
].word_start()
381 # forced syllable ends
382 if (i
> 0 and signs
[i
].forced_syllable_start()):
383 self
._start
_new
_syllable
()
388 and verse_text
[i
] in self
.vowels
389 and signs
[i
].word_end()):
390 self
._add
_sign
(signs
[i
])
392 self
._start
_new
_syllable
()
393 # [something][vowel (no feminine e)]<space>[vowel]
394 elif (self
._match
('hiatus', verse_text
[i
:])
395 and signs
[i
+1].word_end()):
396 self
._add
_sign
(signs
[i
])
397 self
._add
_sign
(signs
[i
+1])
398 self
._start
_new
_syllable
()
399 self
._add
_sign
(signs
[i
+2])
402 # <word start>s[cçpt][vowel]
403 (word_start
and self
._match
('^sca', verse_text
[i
:])
404 and not signs
[i
].word_end())
405 # <word start>s[cp][lr][vowel]
406 or (word_start
and self
._match
('^scla', verse_text
[i
:])
407 and not signs
[i
].word_end()
408 and not signs
[i
+1].word_end())
409 # <word start>ps[vowel]
410 or (word_start
and self
._match
('^psa', verse_text
[i
:]))
412 or (self
._match
('gna', verse_text
[i
:])
413 and not signs
[i
].word_end())
414 # [bcdgkpqtçfjvxz][lrh][vowel]
415 or (self
._match
('bla', verse_text
[i
:])
416 and not signs
[i
].word_end())
418 or (self
._match
('thra', verse_text
[i
:])
419 and not signs
[i
+1].word_end())
421 or self
._match
('ba', verse_text
[i
:])
423 match
= self
._get
_match
_data
().group(0)
424 self
._start
_new
_syllable
()
426 self
._add
_sign
(signs
[i
])
429 self
._add
_sign
(signs
[i
])
431 return self
.get_syllables()
434 class SyllableTokenizerWithWordSeparation(SyllableTokenizer
):
436 A specialized SyllableTokenizer which prefers syllable
437 breaking between words when possible. For instance:
440 gives: tant / at / ten / du
441 iso: tan / t at / ten / du
443 This is useful when breaking verses for lyrics.
446 sign_tokenizer = SignTokenizer()
447 syllable_tokenizer = SyllableTokenizerWithWordSeparation()
448 signs = sign_tokenizer.tokenize("Un ver avec des décorations")
449 syllables = syllable_tokenizer.tokenize(signs)
450 syllables being a list of Syllable objects
452 def force_word_separation(self
, syllables
= None):
453 syllables
= syllables
or self
._syllables
454 syllable_count
= len(syllables
)
455 prev_syllable
= syllables
[0]
456 for this_syllable
in syllables
[1:]:
457 signs
= this_syllable
.get_signs()
458 if not signs
[0].word_start() and signs
[1:]:
459 tokens_count
= len(signs
)
461 while (not signs
[i
].word_start()
462 or not signs
[i
].get_char() in self
.vowels
):
464 if i
== tokens_count
:
467 # we found a vowel at word start at index i
468 # signs from indices 0 to i-1 go to the previous syllable
469 prev_syllable
.add_signs(signs
[0:i
])
470 this_syllable
.set_signs(signs
[i
:])
471 prev_syllable
= this_syllable
def tokenize(self, signs):
    """
    Tokenize `signs` with SyllableTokenizer.tokenize(), then return
    the result of force_word_separation().
    """
    # the base tokenizer's return value is not used here
    SyllableTokenizer.tokenize(self, signs)
    return self.force_word_separation()
483 verse = Verse("Un ver avec des décorations")
484 # possible pass sign and syllable tokenizers to split:
486 verse.get_syllables()
487 => ["Un ", "ve", "r a", "vec ", "des ", "dé", "co", "ra", "tions"]
490 def __init__(self
, text
, lineno
= None):
493 self
._lineno
= lineno
def get_syllables(self):
    """
    Return the text of each syllable of the verse, as a list of
    strings.
    """
    texts = [syllable.get_text() for syllable in self._syllables]
    return texts
499 return "".join([syll
.get_text() for syll
in self
._syllables
])
502 sign_tokenizer
= SignTokenizer(),
503 syllable_tokenizer
= SyllableTokenizer()
505 self
._syllables
= syllable_tokenizer
.tokenize(
506 sign_tokenizer
.tokenize(self
._text
))
def get_metric(self):
    """
    Return the verse metric (a number): the count of syllables,
    minus one when the last syllable is feminine.

    A verse without any syllable has a metric of 0 (instead of
    raising IndexError when looking at the last syllable).
    """
    syllables = self._syllables
    if not syllables:
        return 0
    if syllables[-1].is_feminine():
        return len(syllables) - 1
    return len(syllables)
511 def hyphenate(self
, hyphen
= "-", add_space
= False):
514 count
= len(self
._syllables
)
515 for syllable
in self
._syllables
:
516 if (i
> 0) and not syllable
.at_word_start():
517 syllables
.append(hyphen
)
518 text
= syllable
.get_text()
519 syllables
.append(text
)
521 verse_end
= (i
== count
- 1)
522 # if syllable is word end and do not end with a space,
523 # add it (unless at verse end)
525 and syllable
.at_word_end()
526 and text
[-1] != " "):
527 syllables
.append(" ")
529 return "".join(syllables
)
533 A corpus, consisting of verses.
536 To generate LilyPond lyrics (where syllables in a word are separated
540 corpus.add_verse(["premier ver", "second ver..."])
541 corpus.syllabify(syllable_tokenizer = SyllableTokenizerWithWordSeparation())
542 corpus.get_hyphenated_verses(hyphen = " -- ")
543 => ["pre -- mier ver", "se -- cond ver..."]
545 def __init__(self
, filename
= None):
547 self
._filename
= filename
def add_verse(self, verse, lineno=None):
    """
    Add a verse to the corpus.

    `verse` is the verse text (a string); it is wrapped in a Verse
    object together with `lineno` before being stored.
    """
    new_verse = Verse(verse, lineno)
    self._verses.append(new_verse)
555 def get_verses(self
):
559 sign_tokenizer
= SignTokenizer(),
560 syllable_tokenizer
= SyllableTokenizer()):
562 Syllabify all the corpus verses.
564 for verse
in self
._verses
:
565 verse
.syllabify(sign_tokenizer
, syllable_tokenizer
)
def get_hyphenated_verses(self, hyphen="-", add_space=False):
    """
    Return the hyphenated verses of the corpus, as a list of strings.

    `hyphen` and `add_space` are passed on to each verse's hyphenate().
    Corpus.syllabify() is supposed to have been called before.
    """
    hyphenated = []
    for verse in self._verses:
        hyphenated.append(verse.hyphenate(hyphen, add_space))
    return hyphenated
576 class CorpusReader():
578 def read(self
, filename
= "-"):
580 Read a corpus file (or stdin if filename is "-")
581 and produce a Corpus object.
583 file = open(filename
, 'r') if (filename
!= "-") else sys
.stdin
584 corpus
= Corpus(filename
)
593 elif re
.match(r
"^//", line
):
596 # TODO: titling directives
597 elif re
.match(r
"^#", line
):
602 # verse text TAB+ [properties]
603 # where properties can be:
604 # [LB]+ breve/long syllables indicators
608 # other lilypond code
609 # for now, we only keep the verse text itself
610 text
= re
.sub(r
"([^\t]+)\t.*$", r
"\1", line
)
611 corpus
.add_verse(text
, lineno
)
617 Syllabify and print verses.
619 parser
= argparse
.ArgumentParser(
620 description
='Verse syllabication.',
621 formatter_class
=argparse
.ArgumentDefaultsHelpFormatter
)
626 help='verse words to syllabify (if no corpus is provided)')
629 help="Corpus file to syllabify. Use - for reading from stdin")
633 help="String to be used when hyphenating a verse.")
636 default
="{hyphenated_verse}",
637 help="""Python format string for outputing the verse.
638 Possible keywords, to be used between curly braces in the format string,
640 *) hyphenated_verse: the verse after applying hyphenation
641 *) verse: the verse without hyphenation
642 *) metric: the verse metric (a number).""")
643 args
= vars(parser
.parse_args())
647 reader
= CorpusReader()
648 corpus
= reader
.read(args
['corpus'])
650 syllable_tokenizer
= SyllableTokenizerWithWordSeparation())
651 for verse
in corpus
.get_verses():
652 hyphenated_verse
= verse
.hyphenate(hyphen
= args
['hyphen'],
654 print(args
['format'].format(verse
= verse
.get_text(),
655 hyphenated_verse
= hyphenated_verse
,
656 metric
= verse
.get_metric()))
658 # read verse on command line arguments
659 verse
= Verse(" ".join(args
['verse']))
661 syllable_tokenizer
= SyllableTokenizerWithWordSeparation())
662 hyphenated_verse
= verse
.hyphenate(hyphen
= args
['hyphen'], add_space
= True)
663 print(args
['format'].format(verse
= verse
.get_text(),
664 hyphenated_verse
= hyphenated_verse
,
665 metric
= verse
.get_metric()))
669 if __name__
== '__main__':