for git v1.5.2 (and below): chdir to the directory of the target file before executin...
[translate_toolkit.git] / search / segment.py
blob2d660ff6be1b3c1f9387cb6afdca11a666649850
1 # -*- coding: utf-8 -*-
3 # Copyright 2006 Zuza Software Foundation
4 #
5 # This file is part of translate.
7 # translate is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation; either version 2 of the License, or
10 # (at your option) any later version.
12 # translate is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with translate; if not, write to the Free Software
19 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 """Module to deal with different types and uses of segmentation"""
24 #XXX: This module is now deprecated: Use language specific segmenters in the
25 # lang package (character_iter, word_iter, sentence_iter, etc.).
# Characters treated as punctuation by the segmenters in this module:
# character_iter() drops them and word_iter() strips them from word edges.
# The literal is split by character class purely for readability; the pieces
# concatenate to a single unicode string.
punctuation = (
    u".,;:!?-@#$%^*_()[]{}/\\'\"<>"  # ASCII punctuation and symbols
    u"‘’‚‛“”„‟"  # typographic single and double quotes
    u"′″‴‵‶‷"  # primes and reversed primes
    u"‹›«»"  # angle quotation marks
    u"±³¹²°¿©®×£¥"  # assorted Latin-1 symbols
)
def character_iter(text):
    """Yield the lowercased, non-punctuation characters of text.

    A whitespace character that directly follows another whitespace
    character is skipped, so runs of whitespace collapse to their first
    character. Characters found in ``punctuation`` are not yielded, but they
    are still remembered as the previous character when checking for
    consecutive whitespace.
    """
    # Seed with a non-whitespace character so leading whitespace is kept.
    previous = 'A'
    for char in text:
        if not (char.isspace() and previous.isspace()):
            previous = char
            if char not in punctuation:
                yield char.lower()
def characters(text):
    """Return a list of everything character_iter() yields for text."""
    return list(character_iter(text))
def word_iter(text):
    """Yield each whitespace-separated word of text, trimmed and lowercased.

    Characters listed in ``punctuation`` are stripped from both ends of every
    word; punctuation inside a word is left untouched.
    """
    #TODO: Consider replacing punctuation with space before split()
    for token in text.split():
        trimmed = token.strip(punctuation)
        yield trimmed.lower()
def words(text):
    """Return a list of everything word_iter() yields for text."""
    return list(word_iter(text))
54 def sentence_iter(text):
55 """Returns an iterator over the senteces in text."""
56 #TODO: This is very naïve. We really should consider all punctuation,
57 #and return the punctuation with the sentence.
58 #TODO: Search for capital letter start with next sentence to avoid
59 #confusion with abbreviations. And remember Afrikaans "'n" :-)
60 for s in text.split(". "):
61 yield s.strip()
def sentences(text):
    """Return a list of everything sentence_iter() yields for text."""
    return list(sentence_iter(text))