[FileItem] Fix mimetype content lookup
[xbmc.git] / addons / metadata.generic.albums / lib / scraper.py
blob95d838806ac2572c40dc49916baaa68def386c90
1 # -*- coding: utf-8 -*-
3 import json
4 import socket
5 import sys
6 import time
7 import urllib.parse
8 import urllib.request
9 import _strptime # https://bugs.python.org/issue7980
10 from socket import timeout
11 from threading import Thread
12 from urllib.error import HTTPError, URLError
13 import xbmc
14 import xbmcaddon
15 import xbmcgui
16 import xbmcplugin
17 from .allmusic import allmusic_albumfind
18 from .allmusic import allmusic_albumdetails
19 from .discogs import discogs_albumfind
20 from .discogs import discogs_albummain
21 from .discogs import discogs_albumdetails
22 from .fanarttv import fanarttv_albumart
23 from .musicbrainz import musicbrainz_albumfind
24 from .musicbrainz import musicbrainz_albumdetails
25 from .musicbrainz import musicbrainz_albumlinks
26 from .musicbrainz import musicbrainz_albumart
27 from .nfo import nfo_geturl
28 from .theaudiodb import theaudiodb_albumdetails
29 from .wikipedia import wikipedia_albumdetails
30 from .utils import *
# add-on metadata, read once at import time from a single Addon handle
_ADDON = xbmcaddon.Addon()
ADDONID = _ADDON.getAddonInfo('id')
ADDONNAME = _ADDON.getAddonInfo('name')
ADDONVERSION = _ADDON.getAddonInfo('version')
def log(txt):
    """Write *txt* to the Kodi log at debug level, prefixed with the add-on id."""
    xbmc.log(msg='%s: %s' % (ADDONID, txt), level=xbmc.LOGDEBUG)
def get_data(url, jsonformat, retry=True):
    """Fetch *url* and return the response body.

    url        -- the address to request
    jsonformat -- when true the body is parsed as json and the parsed object is returned,
                  otherwise the raw bytes are returned
    retry      -- when true, a rate-limit response (HTTP 503 or 429) is retried once
                  after a one second pause

    Returns None when the request fails, times out, keeps hitting the rate limit
    or (with jsonformat) the body is not valid json.
    """
    # throttle requests to the rate-limited apis before hitting the network
    if url.startswith('https://musicbrainz.org/'):
        api_timeout('musicbrainztime')
    elif url.startswith('https://api.discogs.com/'):
        api_timeout('discogstime')
    headers = {'User-Agent': '%s/%s ( http://kodi.tv )' % (ADDONNAME, ADDONVERSION)}
    req = urllib.request.Request(url, headers=headers)
    try:
        # the context manager makes sure the connection is closed again
        with urllib.request.urlopen(req, timeout=5) as resp:
            respdata = resp.read()
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first.
        # urlopen raises it for error statuses, which is where the api limits show up.
        if e.code in (503, 429):
            if e.code == 503:
                log('exceeding musicbrainz api limit')
            else:
                log('exceeding discogs api limit')
            if retry:
                xbmc.sleep(1000)
                # hand the outcome of the second attempt back to the caller
                return get_data(url, jsonformat, retry=False)
            return None
        log('HTTPError: %s - %s' % (e.reason, url))
        return None
    except URLError as e:
        log('URLError: %s - %s' % (e.reason, url))
        return None
    except socket.timeout as e:
        log('socket: %s - %s' % (e, url))
        return None
    if jsonformat:
        try:
            respdata = json.loads(respdata)
        except ValueError as e:
            # malformed or undecodable body: treat it like any other failed request
            log('invalid json: %s - %s' % (e, url))
            return None
    return respdata
def api_timeout(scraper):
    """Keep at least one second between two requests to the same api.

    scraper -- name of the property (on the window with id 10000) that holds the
               time of the previous request in milliseconds
    """
    now_ms = round(time.time() * 1000)
    window = xbmcgui.Window(10000)
    last_ms = window.getProperty(scraper)
    if last_ms:
        elapsed = now_ms - int(last_ms)
        # sleep for whatever is left of the one second interval
        if elapsed < 1000:
            xbmc.sleep(1000 - elapsed)
    # remember when this request was let through
    window.setProperty(scraper, str(round(time.time() * 1000)))
class Scraper():
    """Run a single scraper action and hand the outcome to Kodi.

    All work is done by the constructor: it parses the path settings, dispatches
    on *action* and finishes by calling xbmcplugin.endOfDirectory.
    """

    # order in which the sources are merged, least accurate first:
    # (source, add its 'thumb' to the thumb list, add its 'extras' to the extra art list)
    _MERGE_ORDER = (
        ('discogs', True, False),
        ('wikipedia', False, False),
        ('allmusic', True, False),
        ('theaudiodb', True, True),
        ('musicbrainz', False, False),
        ('coverarchive', True, True),
        # prefer artwork from fanarttv
        ('fanarttv', True, True),
    )

    # (key in the compiled result, listitem properties that receive its value)
    _PROPS_BEFORE_ARTIST = (
        ('mbalbumid', ('album.musicbrainzid', 'album.releaseid')),
        ('mbreleasegroupid', ('album.releasegroupid',)),
        ('scrapedmbid', ('album.scrapedmbid',)),
    )
    _PROPS_AFTER_ARTIST = (
        ('genre', ('album.genre',)),
        ('styles', ('album.styles',)),
        ('moods', ('album.moods',)),
        ('themes', ('album.themes',)),
        ('description', ('album.review',)),
        ('releasedate', ('album.releasedate',)),
        ('originaldate', ('album.originaldate',)),
        ('releasestatus', ('album.releasestatus',)),
        ('artist_description', ('album.artist_description',)),
        ('label', ('album.label',)),
        ('type', ('album.type',)),
        ('compilation', ('album.compilation',)),
        ('year', ('album.year',)),
        ('rating', ('album.rating',)),
        ('votes', ('album.votes',)),
    )

    def __init__(self, action, key, artist, album, url, nfo, settings):
        """Execute *action*.

        action   -- 'resolveid', 'find', 'getdetails' or 'NfoUrl'; any other value
                    only ends the directory
        key      -- musicbrainz album id, used by 'resolveid'
        artist   -- artist name, used by 'find'
        album    -- album title, used by 'find'
        url      -- json string with the album ids, used by 'getdetails'
        nfo      -- nfo data passed to nfo_geturl, used by 'NfoUrl'
        settings -- json string with the path settings
        """
        # parse path settings
        self.parse_settings(settings)
        # this is just for backward compatibility with xml based scrapers https://github.com/xbmc/xbmc/pull/11632
        if action == 'resolveid':
            self.return_resolved(self.resolve_mbid(key))
        # search for artist name / album title matches
        elif action == 'find':
            self._action_find(artist, album)
        # return info id's
        elif action == 'getdetails':
            self._action_getdetails(url)
        # extract the mbalbumid from the provided musicbrainz url
        elif action == 'NfoUrl':
            # check if there is a musicbrainz url in the nfo file
            mbalbumid = nfo_geturl(nfo)
            if mbalbumid:
                self.return_nfourl(self.resolve_mbid(mbalbumid))
        xbmcplugin.endOfDirectory(int(sys.argv[1]))

    def _action_find(self, artist, album):
        """Search musicbrainz first and fall back to discogs; report the first hit list."""
        for site in ('musicbrainz', 'discogs'):
            result = self.find_album(artist, album, site)
            if result:
                self.return_search(result)
                return

    def _action_getdetails(self, url):
        """Scrape all sources for the album described by the json string *url*."""
        details = {}
        links = {}
        threads = []
        request = json.loads(url)
        artist = request.get('artist')
        album = request.get('album')
        mbalbumid = request.get('mbalbumid')
        mbreleasegroupid = request.get('mbreleasegroupid')
        dcid = request.get('dcalbumid')
        # we have musicbrainz album id
        if mbalbumid:
            scrapers = []
            if mbreleasegroupid:
                scrapers.append([mbalbumid, 'musicbrainz'])
            # get the mbreleasegroupid, artist and album if we don't have them
            elif self.get_details(mbalbumid, 'musicbrainz', details):
                mbdata = details['musicbrainz']
                mbreleasegroupid = mbdata['mbreleasegroupid']
                artist = mbdata['artist_description']
                album = mbdata['album']
            else:
                # the lookup failed, give musicbrainz another try in the background
                scrapers.append([mbalbumid, 'musicbrainz'])
            lthread = None
            # everything below is keyed on the release group id, skip it when unknown
            if mbreleasegroupid:
                for site in ('theaudiodb', 'fanarttv', 'coverarchive'):
                    scrapers.append([mbreleasegroupid, site])
                # get musicbrainz links to other metadata sites
                lthread = Thread(target=self.get_links, args=(mbreleasegroupid, links))
                lthread.start()
            for param, site in scrapers:
                threads.append(self._start_scraper(param, site, details))
            # wait for the musicbrainz links to return
            if lthread:
                lthread.join()
            for param, site in self._extra_scrapers(links.get('musicbrainz'), artist, album):
                threads.append(self._start_scraper(param, site, details))
        # we have a discogs id
        else:
            threads.append(self._start_scraper({'url': dcid}, 'discogs', details))
        for thread in threads:
            thread.join()
        result = self.compile_results(details)
        if result:
            self.return_details(result)

    def _start_scraper(self, param, site, details):
        """Start get_details for *site* in a new thread and return the thread."""
        thread = Thread(target=self.get_details, args=(param, site, details))
        thread.start()
        return thread

    def _extra_scrapers(self, sitelinks, artist, album):
        """Return [param, site] pairs for the sources that depend on the musicbrainz links."""
        extrascrapers = []
        if sitelinks is None:
            return extrascrapers
        # only scrape by artistname and albumtitle if explicitly enabled
        by_name = self.inaccurate and artist and album
        # scrape allmusic if we have an url provided by musicbrainz
        if 'allmusic' in sitelinks:
            extrascrapers.append([{'url': sitelinks['allmusic']}, 'allmusic'])
        elif by_name:
            extrascrapers.append([{'artist': artist, 'album': album}, 'allmusic'])
        # scrape discogs if we have an url provided by musicbrainz
        if 'discogs' in sitelinks:
            extrascrapers.append([{'masterurl': sitelinks['discogs']}, 'discogs'])
        elif by_name:
            extrascrapers.append([{'artist': artist, 'album': album}, 'discogs'])
        # scrape wikipedia if we have an url provided by musicbrainz
        if 'wikipedia' in sitelinks:
            extrascrapers.append([sitelinks['wikipedia'], 'wikipedia'])
        elif 'wikidata' in sitelinks:
            extrascrapers.append([sitelinks['wikidata'], 'wikidata'])
        return extrascrapers

    def parse_settings(self, data):
        """Load the scraper preferences from the json string *data* into attributes."""
        settings = json.loads(data)
        # note: path settings are taken from the db, they may not reflect the current settings.xml file
        self.review = settings['review']
        self.genre = settings['genre']
        self.lang = settings['lang']
        self.mood = settings['mood']
        self.rating = settings['rating']
        self.style = settings['style']
        self.theme = settings['theme']
        self.inaccurate = settings['inaccurate']

    def resolve_mbid(self, mbalbumid):
        """Return an album description that only carries the given musicbrainz album id."""
        return {
            'artist_description': '',
            'album': '',
            'mbalbumid': mbalbumid,
            'mbreleasegroupid': '',
        }

    def find_album(self, artist, album, site):
        """Search *site* ('musicbrainz' or 'discogs') for the album.

        Returns whatever the site's find function makes of the response, or None
        when the site is unknown or the request returned nothing.
        """
        # musicbrainz
        if site == 'musicbrainz':
            url = MUSICBRAINZURL % (MUSICBRAINZSEARCH % (urllib.parse.quote_plus(album), urllib.parse.quote_plus(artist), urllib.parse.quote_plus(artist)))
            scraper = musicbrainz_albumfind
        # discogs
        elif site == 'discogs':
            url = DISCOGSURL % (DISCOGSSEARCH % (urllib.parse.quote_plus(album), urllib.parse.quote_plus(artist), DISCOGSKEY, DISCOGSSECRET))
            scraper = discogs_albumfind
        else:
            # unknown site: nothing to search
            return None
        result = get_data(url, True)
        if not result:
            return None
        return scraper(result, artist, album)

    def get_links(self, param, links):
        """Fetch the musicbrainz links for *param* and store them in links['musicbrainz'].

        Returns *links* when links were stored, None otherwise.
        """
        url = MUSICBRAINZURL % (MUSICBRAINZLINKS % param)
        result = get_data(url, True)
        if result:
            links['musicbrainz'] = musicbrainz_albumlinks(result)
            return links
        return None

    def get_details(self, param, site, details):
        """Scrape *site* and store the outcome in details[site].

        param   -- what to look up; an id or page name for most sites, a dict with
                   'url', 'masterurl' or 'artist' + 'album' for discogs and allmusic
                   (a resolved 'url' is written back into that dict)
        site    -- name of the source to scrape
        details -- dict shared by all scrapers; wikidata results are stored under 'wikipedia'

        Returns *details* on success, None when nothing was found or the site is unknown.
        """
        as_json = True
        # theaudiodb
        if site == 'theaudiodb':
            url = AUDIODBURL % (AUDIODBKEY, AUDIODBDETAILS % param)
            albumscraper = theaudiodb_albumdetails
        # musicbrainz
        elif site == 'musicbrainz':
            url = MUSICBRAINZURL % (MUSICBRAINZDETAILS % param)
            albumscraper = musicbrainz_albumdetails
        # fanarttv
        elif site == 'fanarttv':
            url = FANARTVURL % (param, FANARTVKEY)
            albumscraper = fanarttv_albumart
        # coverarchive
        elif site == 'coverarchive':
            url = MUSICBRAINZART % (param)
            albumscraper = musicbrainz_albumart
        # discogs
        elif site == 'discogs':
            if not self._resolve_discogs_url(param):
                return None
            url = DISCOGSURL % (DISCOGSDETAILS % (param['url'], DISCOGSKEY, DISCOGSSECRET))
            albumscraper = discogs_albumdetails
        # wikipedia
        elif site == 'wikipedia':
            url = WIKIPEDIAURL % param
            albumscraper = wikipedia_albumdetails
        elif site == 'wikidata':
            # resolve wikidata to wikipedia url
            result = get_data(WIKIDATAURL % param, as_json)
            try:
                album = result['entities'][param]['sitelinks']['enwiki']['url'].rsplit('/', 1)[1]
            except (KeyError, IndexError, TypeError, AttributeError):
                # no response or no english wikipedia page in it
                return None
            site = 'wikipedia'
            url = WIKIPEDIAURL % album
            albumscraper = wikipedia_albumdetails
        # allmusic
        elif site == 'allmusic':
            # allmusic data is passed to its scraper unparsed
            as_json = False
            if not self._resolve_allmusic_url(param):
                return None
            url = ALLMUSICDETAILS % param['url']
            albumscraper = allmusic_albumdetails
        else:
            # unknown site: nothing to scrape
            return None
        result = get_data(url, as_json)
        if not result:
            return None
        albumresults = albumscraper(result)
        if not albumresults:
            return None
        details[site] = albumresults
        return details

    def _resolve_discogs_url(self, param):
        """Make sure param['url'] is set for a discogs lookup; return False when that fails."""
        # musicbrainz provides a link to the master release, but we need the main release
        if 'masterurl' in param:
            masterdata = get_data(DISCOGSURL % (DISCOGSMASTER % (param['masterurl'], DISCOGSKEY, DISCOGSSECRET)), True)
            if not masterdata:
                return False
            mainurl = discogs_albummain(masterdata)
            if not mainurl:
                return False
            param['url'] = mainurl
        # search by artistname and albumtitle if we do not have an url
        if 'url' not in param:
            searchurl = DISCOGSURL % (DISCOGSSEARCH % (urllib.parse.quote_plus(param['album']), urllib.parse.quote_plus(param['artist']), DISCOGSKEY, DISCOGSSECRET))
            found = get_data(searchurl, True)
            if not found:
                return False
            albums = discogs_albumfind(found, param['artist'], param['album'])
            if not albums:
                return False
            # take the first album with the highest relevance
            param['url'] = max(albums, key=lambda k: k['relevance'])['dcalbumid']
        return True

    def _resolve_allmusic_url(self, param):
        """Make sure param['url'] is set for an allmusic lookup; return False when that fails."""
        # search by artistname and albumtitle if we do not have an url
        if 'url' not in param:
            searchurl = ALLMUSICURL % (ALLMUSICSEARCH % (urllib.parse.quote_plus(param['artist']), urllib.parse.quote_plus(param['album'])))
            found = get_data(searchurl, False)
            if not found:
                return False
            albums = allmusic_albumfind(found, param['artist'], param['album'])
            if not albums:
                return False
            param['url'] = albums[0]['url']
        return True

    def compile_results(self, details):
        """Merge the per-source results in *details* into one album dict.

        Later sources in _MERGE_ORDER overwrite earlier ones for every non-empty
        value. The artwork of all sources is combined into one 'thumb' list and the
        user preferences are applied last.
        """
        result = {}
        thumbs = []
        extras = []
        # merge metadata results, start with the least accurate sources
        for site, with_thumbs, with_extras in self._MERGE_ORDER:
            for k, v in details.get(site, {}).items():
                if not v:
                    continue
                result[k] = v
                if with_thumbs and k == 'thumb':
                    thumbs.append(v)
                if with_extras and k == 'extras':
                    extras.append(v)
        # use musicbrainz artist as it provides the mbartistid (used for resolveid in the artist scraper)
        if 'musicbrainz' in details:
            result['artist'] = details['musicbrainz']['artist']
        # provide artwork from all scrapers for getthumb option
        if result:
            # thumb list from most accurate sources first
            thumbnails = [item for thumblist in reversed(thumbs) for item in thumblist]
            # the order for extra art does not matter, it goes to the end of the thumb list
            thumbnails.extend(item for extralist in extras for item in extralist)
            if thumbnails:
                result['thumb'] = thumbnails
        return self.user_prefs(details, result)

    def user_prefs(self, details, result):
        """Overwrite fields in *result* with the data from the sources the user prefers."""
        # user preferences
        lang = 'description' + self.lang
        if self.review == 'theaudiodb' and 'theaudiodb' in details:
            # use the review in the preferred language, english otherwise
            if lang in details['theaudiodb']:
                result['description'] = details['theaudiodb'][lang]
            elif 'descriptionEN' in details['theaudiodb']:
                result['description'] = details['theaudiodb']['descriptionEN']
        elif (self.review in details) and ('description' in details[self.review]):
            result['description'] = details[self.review]['description']
        preferred = (
            (self.genre, 'genre'),
            (self.style, 'styles'),
            (self.mood, 'moods'),
            (self.theme, 'themes'),
        )
        for source, field in preferred:
            if (source in details) and (field in details[source]):
                result[field] = details[source][field]
        if (self.rating in details) and ('rating' in details[self.rating]):
            result['rating'] = details[self.rating]['rating']
            # a source may provide a rating without a vote count
            if 'votes' in details[self.rating]:
                result['votes'] = details[self.rating]['votes']
        return result

    def return_search(self, data):
        """Add one directory item per search hit in *data*."""
        items = []
        for item in data:
            listitem = xbmcgui.ListItem(item['album'], offscreen=True)
            listitem.setArt({'thumb': item['thumb']})
            listitem.setProperty('album.artist', item['artist_description'])
            for field in ('year', 'type', 'releasestatus', 'label'):
                listitem.setProperty('album.%s' % field, item.get(field, ''))
            listitem.setProperty('relevance', item['relevance'])
            # the url is the json string that 'getdetails' receives later on
            url = {'artist': item['artist_description'], 'album': item['album']}
            if 'mbalbumid' in item:
                url['mbalbumid'] = item['mbalbumid']
                url['mbreleasegroupid'] = item['mbreleasegroupid']
            if 'dcalbumid' in item:
                url['dcalbumid'] = item['dcalbumid']
            items.append((json.dumps(url), listitem, True))
        if items:
            xbmcplugin.addDirectoryItems(handle=int(sys.argv[1]), items=items)

    def return_nfourl(self, item):
        """Add a directory item whose url is *item* as json."""
        listitem = xbmcgui.ListItem(offscreen=True)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=json.dumps(item), listitem=listitem, isFolder=True)

    def return_resolved(self, item):
        """Resolve to a list item whose path is *item* as json."""
        listitem = xbmcgui.ListItem(path=json.dumps(item), offscreen=True)
        xbmcplugin.setResolvedUrl(handle=int(sys.argv[1]), succeeded=True, listitem=listitem)

    def return_details(self, item):
        """Copy the compiled album data in *item* to list item properties and resolve it.

        Nothing is returned to Kodi when *item* has no 'album' entry.
        """
        if 'album' not in item:
            return
        listitem = xbmcgui.ListItem(item['album'], offscreen=True)
        self._set_properties(listitem, item, self._PROPS_BEFORE_ARTIST)
        if 'artist' in item:
            listitem.setProperty('album.artists', str(len(item['artist'])))
            # artist properties are numbered from 1
            for count, artist in enumerate(item['artist'], 1):
                listitem.setProperty('album.artist%i.name' % count, artist['artist'])
                listitem.setProperty('album.artist%i.musicbrainzid' % count, artist.get('mbartistid', ''))
                listitem.setProperty('album.artist%i.sortname' % count, artist.get('artistsort', ''))
        self._set_properties(listitem, item, self._PROPS_AFTER_ARTIST)
        if 'thumb' in item:
            listitem.setProperty('album.thumbs', str(len(item['thumb'])))
            # thumb properties are numbered from 1
            for count, thumb in enumerate(item['thumb'], 1):
                listitem.setProperty('album.thumb%i.url' % count, thumb['image'])
                listitem.setProperty('album.thumb%i.aspect' % count, thumb['aspect'])
                listitem.setProperty('album.thumb%i.preview' % count, thumb['preview'])
        xbmcplugin.setResolvedUrl(handle=int(sys.argv[1]), succeeded=True, listitem=listitem)

    def _set_properties(self, listitem, item, table):
        """Set the properties listed in *table* for every key that is present in *item*."""
        for field, props in table:
            if field in item:
                for prop in props:
                    listitem.setProperty(prop, item[field])