1 # -*- coding: utf-8 -*-
3 # Copyright 2006 Zuza Software Foundation
5 # This file is part of translate.
7 # translate is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # translate is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with translate; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """Module to deal with different types and uses of segmentation"""
24 #XXX: This module is now deprecated: Use language specific segmenters in the
25 # lang package (character_iter, word_iter, sentence_iter, etc.).
# Characters treated as punctuation by the helpers in this module: ASCII
# punctuation followed by typographic quotes, primes, guillemets and a few
# other symbols. The value is one unicode string, so it can be used both for
# membership tests and as the argument of strip().
punctuation = u".,;:!?-@#$%^*_()[]{}/\\'\"<>‘’‚‛“”„‟′″‴‵‶‷‹›«»±³¹²°¿©®×£¥"
def character_iter(text):
    """Returns an iterator over the characters in text.

    Characters contained in the module-level ``punctuation`` string are not
    yielded, and a whitespace character that directly follows another
    whitespace character in ``text`` is skipped.
    """
    # NOTE(review): only the two conditions below were present in the
    # reviewed excerpt; the loop header, the ``prev`` bookkeeping and the
    # ``yield`` were reconstructed around them -- confirm against upstream.

    #We don't return more than one consecutive whitespace character
    # Start from a non-whitespace value so that a leading whitespace
    # character is not taken for the second one of a run.
    prev = u""
    for c in text:
        if c.isspace() and prev.isspace():
            continue
        # ``prev`` is updated before the punctuation filter, so it tracks the
        # previous character of ``text``, not the previous yielded character.
        prev = c
        if c not in punctuation:
            yield c
# NOTE(review): the ``def`` line of this function is absent from the reviewed
# excerpt; the name ``characters`` is inferred from ``character_iter`` and the
# parameter name from its use in the body -- confirm against upstream.
def characters(text):
    """Returns a list of the characters in text.

    The list holds exactly what ``character_iter(text)`` yields, in order.
    """
    return list(character_iter(text))
# NOTE(review): the ``def`` line of this function is absent from the reviewed
# excerpt; the name ``word_iter`` is taken from the call that builds the word
# list further down in this module.
def word_iter(text):
    """Returns an iterator over the words in text.

    ``text`` is split on whitespace; every piece has the characters of the
    module-level ``punctuation`` string stripped from both ends and is then
    lower-cased.
    """
    #TODO: Consider replacing punctuation with space before split()
    for w in text.split():
        # A piece that consists only of punctuation comes out as an empty
        # string; it is still yielded.
        yield w.strip(punctuation).lower()
# NOTE(review): the ``def`` line of this function is absent from the reviewed
# excerpt; the name ``words`` is inferred from ``word_iter`` and the parameter
# name from its use in the body -- confirm against upstream.
def words(text):
    """Returns a list of the words in text.

    The list holds exactly what ``word_iter(text)`` yields, in order.
    """
    return list(word_iter(text))
def sentence_iter(text):
    """Returns an iterator over the sentences in text.

    A sentence is any piece of ``text`` between occurrences of ". " (a full
    stop followed by one space). The separator is not part of the yielded
    pieces, so only the last piece can keep its final full stop.
    """
    # NOTE(review): the loop body was not present in the reviewed excerpt and
    # has been restored as a plain ``yield`` of each piece -- confirm against
    # upstream.

    #TODO: This is very naïve. We really should consider all punctuation,
    #and return the punctuation with the sentence.
    #TODO: Search for capital letter start with next sentence to avoid
    #confusion with abbreviations. And remember Afrikaans "'n" :-)
    for s in text.split(". "):
        yield s
# NOTE(review): the ``def`` line of this function is absent from the reviewed
# excerpt; the name ``sentences`` is inferred from ``sentence_iter`` and the
# parameter name from its use in the body -- confirm against upstream.
def sentences(text):
    """Returns a list of the sentences in text.

    The list holds exactly what ``sentence_iter(text)`` yields, in order.
    """
    return list(sentence_iter(text))