get-last-order

   1 #!/usr/bin/python
   2 """Get last order for PKKA from tumblr using his API"""
   3
   4 import xml.dom
   5 import xml.dom.minidom
   6 import urllib2
   7 import re
   8
   9 url='http://orders.ecpsu.ru/api/read'
  10
  11 def remove_html_tags(data):
  12         """Remove all HTML tags except <a>"""
  13         p = re.compile('<(?!\/?a(?=>|\s.*>))\/?.*?>')
  14         return p.sub('', data)
  15
  16 def sanitize_urls(data):
  17         """Get content inside <a></a> tags and "href" attribute"""
  18         patt=r'<a.+?href="(http[^"]+)"(?:>|\s.*?>)([^<]+)</a>'
  19         repl=r'\2: \1'
  20         return re.sub(patt, repl, data)
  21
  22 doc = xml.dom.minidom.parse(urllib2.urlopen(url))
  23 posts = doc.getElementsByTagName('post')
  24 last_post = posts[0]
  25 dirty_body = last_post.getElementsByTagName('regular-body')[0].firstChild.data
  26 print dirty_body
  27 body_with_a = remove_html_tags(dirty_body)
  28 body = sanitize_urls(body_with_a)
  29 print body