improve extraction of episode's short_title,number
[mygpo-feedservice.git] / feedservice / utils.py
blob635d9b62dc18c05e6a6230a5f5ac7e7295b6a833
2 # This file is part of my.gpodder.org.
4 # my.gpodder.org is free software: you can redistribute it and/or modify it
5 # under the terms of the GNU Affero General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or (at your
7 # option) any later version.
9 # my.gpodder.org is distributed in the hope that it will be useful, but
10 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
11 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
12 # License for more details.
14 # You should have received a copy of the GNU Affero General Public License
15 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
18 import time
19 import re
20 from htmlentitydefs import entitydefs
def parse_time(value):
    """
    Convert a duration into a number of seconds.

    Accepted inputs:
     * an int, which is returned unchanged
     * a string holding a plain number of seconds ('42')
     * a string of the form 'MM:SS' or 'HH:MM:SS'; the fields are not
       range-limited, so '90:00' (90 minutes) is valid
     * any other non-string value int() can convert (e.g. a float)

    Raises ValueError for None, for the empty string and for strings
    that match none of the formats above.

    >>> parse_time(10)
    10

    >>> parse_time('05:10') #5*60+10
    310

    >>> parse_time('1:05:10') #60*60+5*60+10
    3910

    >>> parse_time('90:00') #90*60
    5400
    """
    if value is None:
        raise ValueError('None value in parse_time')

    if isinstance(value, int):
        # Don't need to parse already-converted time value
        return value

    if value == '':
        raise ValueError('Empty string in parse_time')

    try:
        fields = value.split(':')
    except AttributeError:
        # Not a string (e.g. a float): let int() convert or reject it
        return int(value)

    # time.strptime() is deliberately not used here: it parses clock
    # times and therefore rejects durations such as '90:00' or '25:00:00'
    if len(fields) in (2, 3) and all(field.isdigit() for field in fields):
        seconds = 0
        for field in fields:
            seconds = seconds * 60 + int(field)
        return seconds

    # Plain number of seconds; int() raises ValueError for anything else
    return int(value)
54 # taken from gpodder.util
def remove_html_tags(html):
    """
    Remove HTML tags from a string and replace numeric and
    named entities with the corresponding character, so the
    HTML text can be displayed in a simple text view.

    Returns None if html is None, otherwise the stripped text.

    Entity handling:
     * decimal (&#8364;) and hexadecimal (&#x20ac;) references of any
       length are converted; out-of-range values are removed
     * known named entities (&amp;, &euro;, ...) are converted,
       well-formed but unknown names are removed
     * an ampersand that does not start a well-formed reference
       ('Tom & Jerry; ...') is left untouched
     * every reference is decoded exactly once ('&amp;lt;' -> '&lt;')
    """
    if html is None:
        return None

    # Imported locally so that this function works on both
    # Python 2 (htmlentitydefs) and Python 3 (html.entities)
    try:
        from htmlentitydefs import name2codepoint
    except ImportError:
        from html.entities import name2codepoint

    try:
        to_char = unichr
    except NameError:
        # Python 3: chr() already returns unicode characters
        to_char = chr

    # If we would want more speed, we could make these global
    re_strip_tags = re.compile(r'<[^>]*>')
    re_entities = re.compile(
        r'&(#\d+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]{1,7});')
    re_newline_tags = re.compile(r'(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
    re_listing_tags = re.compile(r'<li[^>]*>', re.I)

    def decode_entity(match):
        # Map one matched entity reference to its replacement text
        reference = match.group(1)
        try:
            if not reference.startswith('#'):
                codepoint = name2codepoint.get(reference)
                if codepoint is None:
                    # Unknown entity name: drop it
                    return ''
            elif reference[1] in 'xX':
                codepoint = int(reference[2:], 16)
            else:
                codepoint = int(reference[1:])
            return to_char(codepoint)
        except (ValueError, OverflowError):
            # Code point outside of the range supported by the interpreter
            return ''

    result = html

    # Convert common HTML elements to their text equivalent
    result = re_newline_tags.sub('\n', result)
    result = re_listing_tags.sub('\n * ', result)
    result = re.sub(r'<[Pp]>', '\n\n', result)

    # Remove all HTML/XML tags from the string
    result = re_strip_tags.sub('', result)

    # Convert numeric and named entities in a single pass, so that the
    # output of one conversion is never decoded a second time
    result = re_entities.sub(decode_entity, result)

    # Convert more than two newlines to two newlines
    result = re.sub(r'([\r\n]{2})([\r\n])+', r'\1', result)

    return result.strip()
92 # from http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
93 # this does not increase asymptotical complexity
94 # but can still waste more time than it saves.
def shortest_of(strings):
    """
    Return the element of strings that has the smallest length.

    If several elements share the smallest length, the first one is
    returned. min() raises ValueError if strings is empty.
    """
    shortest = min(strings, key=len)
    return shortest
def longest_substr(strings):
    """
    Returns the longest common substring of the given strings

    strings may be any iterable of strings. If there are several
    common substrings of maximal length, the one occurring first in
    the shortest string is returned. An empty input, or strings
    without any common character, give the empty string.
    """
    # Materialize the input, because it is iterated several times
    strings = list(strings)

    substr = ""
    if not strings:
        return substr

    # A common substring can't be longer than the shortest string, so
    # candidates are taken from it
    reference = min(strings, key=len)
    length = len(reference)

    #find a suitable slice i:j
    for i in range(length):
        #only consider strings long at least len(substr) + 1
        # j is an exclusive slice end, so it has to run up to length
        # (inclusive) for the last character to be considered
        for j in range(i + len(substr) + 1, length + 1):
            candidate = reference[i:j]
            if all(candidate in text for text in strings):
                substr = candidate
            else:
                # Longer candidates starting at i contain this one,
                # so they can't be common either
                break

    return substr