feedservice/utils.py

   1 #
   2 # This file is part of my.gpodder.org.
   3 #
   4 # my.gpodder.org is free software: you can redistribute it and/or modify it
   5 # under the terms of the GNU Affero General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or (at your
   7 # option) any later version.
   8 #
   9 # my.gpodder.org is distributed in the hope that it will be useful, but
  10 # WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  11 # or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public
  12 # License for more details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with my.gpodder.org. If not, see <http://www.gnu.org/licenses/>.
  16 #
  17
  18 import time
  19 import re
  20 from htmlentitydefs import entitydefs
  21
  22
  23 def parse_time(value):
  24     """
  25     >>> parse_time(10)
  26     10
  27
  28     >>> parse_time('05:10') #5*60+10
  29     310
  30
  31     >>> parse_time('1:05:10') #60*60+5*60+10
  32     3910
  33     """
  34     if value is None:
  35         raise ValueError('None value in parse_time')
  36
  37     if isinstance(value, int):
  38         # Don't need to parse already-converted time value
  39         return value
  40
  41     if value == '':
  42         raise ValueError('Empty valueing in parse_time')
  43
  44     for format in ('%H:%M:%S', '%M:%S'):
  45         try:
  46             t = time.strptime(value, format)
  47             return t.tm_hour * 60*60 + t.tm_min * 60 + t.tm_sec
  48         except ValueError, e:
  49             continue
  50
  51     return int(value)
  52
  53
  54 # taken from gpodder.util
  55 def remove_html_tags(html):
  56     """
  57     Remove HTML tags from a string and replace numeric and
  58     named entities with the corresponding character, so the
  59     HTML text can be displayed in a simple text view.
  60     """
  61     if html is None:
  62         return None
  63
  64     # If we would want more speed, we could make these global
  65     re_strip_tags = re.compile('<[^>]*>')
  66     re_unicode_entities = re.compile('&#(\d{2,4});')
  67     re_html_entities = re.compile('&(.{2,8});')
  68     re_newline_tags = re.compile('(<br[^>]*>|<[/]?ul[^>]*>|</li>)', re.I)
  69     re_listing_tags = re.compile('<li[^>]*>', re.I)
  70
  71     result = html
  72
  73     # Convert common HTML elements to their text equivalent
  74     result = re_newline_tags.sub('\n', result)
  75     result = re_listing_tags.sub('\n * ', result)
  76     result = re.sub('<[Pp]>', '\n\n', result)
  77
  78     # Remove all HTML/XML tags from the string
  79     result = re_strip_tags.sub('', result)
  80     # Convert numeric XML entities to their unicode character
  81     result = re_unicode_entities.sub(lambda x: unichr(int(x.group(1))), result)
  82
  83     # Convert named HTML entities to their unicode character
  84     result = re_html_entities.sub(lambda x: unicode(entitydefs.get(x.group(1),''), 'iso-8859-1'), result)
  85
  86     # Convert more than two newlines to two newlines
  87     result = re.sub('([\r\n]{2})([\r\n])+', '\\1', result)
  88
  89     return result.strip()
  90
  91
  92 # from http://stackoverflow.com/questions/2892931/longest-common-substring-from-more-than-two-strings-python
  93 # this does not increase asymptotical complexity
  94 # but can still waste more time than it saves.
  95 def shortest_of(strings):
  96     return min(strings, key=len)
  97
  98 def longest_substr(strings):
  99     """
 100     Returns the longest common substring of the given strings
 101     """
 102
 103     substr = ""
 104     if not strings:
 105         return substr
 106     reference = shortest_of(strings) #strings[0]
 107     length = len(reference)
 108     #find a suitable slice i:j
 109     for i in xrange(length):
 110         #only consider strings long at least len(substr) + 1
 111         for j in xrange(i + len(substr) + 1, length):
 112             candidate = reference[i:j]
 113             if all(candidate in text for text in strings):
 114                 substr = candidate
 115     return substr
 116