[FileItem] Fix mimetype content lookup
[xbmc.git] / addons / metadata.generic.artists / lib / scraper.py
blob8d007a5f4cd4d412f58178ad6b517ca85b8435bf
1 # -*- coding: utf-8 -*-
3 import json
4 import socket
5 import sys
6 import time
7 import urllib.parse
8 import urllib.request
9 import _strptime # https://bugs.python.org/issue7980
10 from socket import timeout
11 from threading import Thread
12 from urllib.error import HTTPError, URLError
13 import xbmc
14 import xbmcaddon
15 import xbmcgui
16 import xbmcplugin
17 from .allmusic import allmusic_artistfind
18 from .allmusic import allmusic_artistdetails
19 from .allmusic import allmusic_artistalbums
20 from .discogs import discogs_artistfind
21 from .discogs import discogs_artistdetails
22 from .discogs import discogs_artistalbums
23 from .fanarttv import fanarttv_artistart
24 from .musicbrainz import musicbrainz_artistfind
25 from .musicbrainz import musicbrainz_artistdetails
26 from .nfo import nfo_geturl
27 from .theaudiodb import theaudiodb_artistdetails
28 from .theaudiodb import theaudiodb_artistalbums
29 from .theaudiodb import theaudiodb_mvids
30 from .wikipedia import wikipedia_artistdetails
31 from .utils import *
# add-on identity, read once at import time; used as log prefix and in the http user-agent
_ADDON = xbmcaddon.Addon()
ADDONID = _ADDON.getAddonInfo('id')
ADDONNAME = _ADDON.getAddonInfo('name')
ADDONVERSION = _ADDON.getAddonInfo('version')
def log(txt):
    """Write *txt* to the Kodi log at debug level, prefixed with the add-on id."""
    xbmc.log(msg='%s: %s' % (ADDONID, txt), level=xbmc.LOGDEBUG)
def get_data(url, jsonformat, retry=True):
    """Download *url* and return the response body.

    Requests to musicbrainz and discogs are passed through api_timeout()
    before they are sent.

    url        -- address to fetch
    jsonformat -- when true the body is decoded with json.loads(),
                  otherwise the raw bytes are returned
    retry      -- when true, a 503 / 429 response is retried once after
                  waiting one second

    Returns the (decoded) body, or None when the request or the json
    decoding failed.
    """
    try:
        if url.startswith('https://musicbrainz.org/'):
            api_timeout('musicbrainztime')
        elif url.startswith('https://api.discogs.com/'):
            api_timeout('discogstime')
        headers = {'User-Agent': '%s/%s ( http://kodi.tv )' % (ADDONNAME, ADDONVERSION)}
        req = urllib.request.Request(url, headers=headers)
        # the context manager makes sure the connection is closed again
        with urllib.request.urlopen(req, timeout=5) as resp:
            respdata = resp.read()
    # HTTPError is a subclass of URLError, so it has to be handled first.
    # urlopen raises it for every non-2xx status, which means the rate limit
    # responses can only be detected here and never on a returned response.
    except HTTPError as e:
        if e.code in (503, 429):
            if e.code == 503:
                log('exceeding musicbrainz api limit')
            else:
                log('exceeding discogs api limit')
            if retry:
                xbmc.sleep(1000)
                # hand the result of the second attempt back to the caller
                return get_data(url, jsonformat, retry=False)
            return
        log('HTTPError: %s - %s' % (e.reason, url))
        return
    except URLError as e:
        log('URLError: %s - %s' % (e.reason, url))
        return
    except socket.timeout as e:
        log('socket: %s - %s' % (e, url))
        return
    if jsonformat:
        try:
            respdata = json.loads(respdata)
        except ValueError as e:
            # a malformed body is treated like any other failed request
            log('JSONError: %s - %s' % (e, url))
            return
    return respdata
def api_timeout(scraper):
    """Make sure at least one second passes between two calls for *scraper*.

    The time of the last call is kept (in milliseconds) in a property named
    *scraper* on window 10000; it is refreshed on every call.
    """
    now = round(time.time() * 1000)
    last = xbmcgui.Window(10000).getProperty(scraper)
    if last:
        elapsed = now - int(last)
        remaining = 1000 - elapsed
        # sleep for whatever is left of the one second interval
        if remaining > 0:
            xbmc.sleep(remaining)
    stamp = str(round(time.time() * 1000))
    xbmcgui.Window(10000).setProperty(scraper, stamp)
class Scraper():
    """Artist scraper entry point.

    Creating an instance runs the requested action ('resolveid', 'find',
    'getdetails' or 'NfoUrl') and hands the outcome to xbmcplugin, using
    the plugin handle found in sys.argv[1].
    """

    # sites in merge order (least accurate first) and the art keys that are
    # collected from each of them by compile_results()
    _SOURCE_ART = (
        ('discogs', ('thumb',)),
        ('wikipedia', ()),
        ('allmusic', ('thumb',)),
        ('theaudiodb', ('thumb', 'fanart', 'extras')),
        ('musicbrainz', ()),
        ('fanarttv', ('thumb', 'fanart', 'extras')),
    )

    # (key in the result dict, listitem property) pairs set by return_details()
    _DETAIL_PROPERTIES = (
        ('mbartistid', 'artist.musicbrainzid'),
        ('genre', 'artist.genre'),
        ('biography', 'artist.biography'),
        ('gender', 'artist.gender'),
        ('styles', 'artist.styles'),
        ('moods', 'artist.moods'),
        ('instruments', 'artist.instruments'),
        ('disambiguation', 'artist.disambiguation'),
        ('type', 'artist.type'),
        ('sortname', 'artist.sortname'),
        ('active', 'artist.years_active'),
        ('born', 'artist.born'),
        ('formed', 'artist.formed'),
        ('died', 'artist.died'),
        ('disbanded', 'artist.disbanded'),
    )

    def __init__(self, action, key, artist, url, nfo, settings):
        """Run *action* and finish the directory.

        action   -- 'resolveid', 'find', 'getdetails' or 'NfoUrl'
        key      -- musicbrainz artist id (used by 'resolveid')
        artist   -- artist name to search for (used by 'find')
        url      -- json string with 'artist', 'mbartistid' and / or 'dcid'
                    (used by 'getdetails')
        nfo      -- nfo data passed to nfo_geturl() (used by 'NfoUrl')
        settings -- json string with the path settings
        """
        # parse path settings
        self.parse_settings(settings)
        # this is just for backward compatibility with xml based scrapers https://github.com/xbmc/xbmc/pull/11632
        if action == 'resolveid':
            self.return_resolved(self.resolve_mbid(key))
        # search for artist name matches
        elif action == 'find':
            self._find(artist)
        # return info using id's
        elif action == 'getdetails':
            self._scrape_details(json.loads(url))
        elif action == 'NfoUrl':
            # check if there is a musicbrainz url in the nfo file
            mbartistid = nfo_geturl(nfo)
            if mbartistid:
                self.return_nfourl(self.resolve_mbid(mbartistid))
        xbmcplugin.endOfDirectory(int(sys.argv[1]))

    def _find(self, artist):
        """Search for *artist* on musicbrainz, falling back to discogs."""
        result = self.find_artist(artist, 'musicbrainz')
        if not result:
            result = self.find_artist(artist, 'discogs')
        if result:
            self.return_search(result)

    @staticmethod
    def _start_thread(target, *args):
        """Start *target(*args)* in a new thread and return the thread."""
        thread = Thread(target=target, args=args)
        thread.start()
        return thread

    def _scrape_details(self, ids):
        """Scrape all sites for the artist described by the *ids* dict."""
        details = {}
        discography = {}
        mvids = {}
        artist = ids.get('artist')
        mbartistid = ids.get('mbartistid')
        dcid = ids.get('dcid')
        threads = []
        # we have a musicbrainz id
        if mbartistid:
            # musicbrainz has to be the first thread, it is joined early below
            for site in ('musicbrainz', 'theaudiodb', 'fanarttv'):
                threads.append(self._start_thread(self.get_details, mbartistid, site, details))
            # theaudiodb discography
            threads.append(self._start_thread(self.get_discography, mbartistid, 'theaudiodb', discography))
            # theaudiodb mvids
            threads.append(self._start_thread(self.get_videolinks, mbartistid, 'theaudiodb', mvids))
            # wait for musicbrainz to finish, its links decide what else is scraped
            threads[0].join()
            if 'musicbrainz' in details:
                mbdata = details['musicbrainz']
                extrascrapers = []
                discographyscrapers = []
                if not artist:
                    artist = mbdata['artist']
                # scrape allmusic if we have an url provided by musicbrainz
                if 'allmusic' in mbdata:
                    extrascrapers.append([{'url': mbdata['allmusic']}, 'allmusic'])
                    discographyscrapers.append([{'url': mbdata['allmusic']}, 'allmusic'])
                # only scrape allmusic by artistname if explicitly enabled
                elif self.inaccurate and artist:
                    extrascrapers.append([{'artist': artist}, 'allmusic'])
                # scrape wikipedia if we have an url provided by musicbrainz
                if 'wikipedia' in mbdata:
                    extrascrapers.append([mbdata['wikipedia'], 'wikipedia'])
                elif 'wikidata' in mbdata:
                    extrascrapers.append([mbdata['wikidata'], 'wikidata'])
                # scrape discogs if we have an url provided by musicbrainz
                if 'discogs' in mbdata:
                    extrascrapers.append([{'url': mbdata['discogs']}, 'discogs'])
                    discographyscrapers.append([{'url': mbdata['discogs']}, 'discogs'])
                # only scrape discogs by artistname if explicitly enabled
                elif self.inaccurate and artist:
                    extrascrapers.append([{'artist': artist}, 'discogs'])
                for param, site in extrascrapers:
                    threads.append(self._start_thread(self.get_details, param, site, details))
                # get allmusic / discogs discography if we have an url
                for param, site in discographyscrapers:
                    threads.append(self._start_thread(self.get_discography, param, site, discography))
        # we have a discogs id (without any id there is nothing to request)
        elif dcid:
            threads.append(self._start_thread(self.get_details, {'url': dcid}, 'discogs', details))
            threads.append(self._start_thread(self.get_discography, {'url': dcid}, 'discogs', discography))
        for thread in threads:
            thread.join()
        # merge discography and video link items into the details of their site
        for site, albumlist in discography.items():
            details.setdefault(site, {})['albums'] = albumlist
        for site, mvidlist in mvids.items():
            details.setdefault(site, {})['mvids'] = mvidlist
        result = self.compile_results(details)
        if result:
            self.return_details(result)

    def parse_settings(self, data):
        """Store the path settings from the json string *data* on the instance."""
        settings = json.loads(data)
        # note: path settings are taken from the db, they may not reflect the current settings.xml file
        self.bio = settings['bio']
        self.discog = settings['discog']
        self.genre = settings['genre']
        self.lang = settings['lang']
        self.mood = settings['mood']
        self.style = settings['style']
        self.inaccurate = settings['inaccurate']

    def resolve_mbid(self, mbartistid):
        """Return the id dict (empty artist name plus *mbartistid*)."""
        return {'artist': '', 'mbartistid': mbartistid}

    def find_artist(self, artist, site):
        """Search *site* ('musicbrainz' or 'discogs') for *artist*.

        Returns whatever the site specific find function returns, or None
        when the site is unknown or no data could be retrieved.
        """
        # musicbrainz
        if site == 'musicbrainz':
            url = MUSICBRAINZURL % (MUSICBRAINZSEARCH % urllib.parse.quote_plus(artist))
            scraper = musicbrainz_artistfind
        # discogs
        elif site == 'discogs':
            url = DISCOGSURL % (DISCOGSSEARCH % (urllib.parse.quote_plus(artist), DISCOGSKEY, DISCOGSSECRET))
            scraper = discogs_artistfind
        else:
            return
        result = get_data(url, True)
        if not result:
            return
        return scraper(result, artist)

    def get_details(self, param, site, details, discography=None):
        """Scrape the artist details from *site* and store them in *details*.

        param       -- what to look up; a dict with an 'url' or 'artist' key
                       for discogs and allmusic, otherwise the value that is
                       inserted into the site's url
        site        -- name of the site to scrape
        details     -- dict that receives the result under the site name
                       (a 'wikidata' lookup is stored as 'wikipedia')
        discography -- unused, kept for backward compatibility

        When discogs / allmusic is searched by artist name, the url that was
        found is written to param['url'] and the discography is fetched too.
        Returns *details* on success, None otherwise.
        """
        jsonformat = True
        # theaudiodb
        if site == 'theaudiodb':
            url = AUDIODBURL % (AUDIODBKEY, AUDIODBDETAILS % param)
            artistscraper = theaudiodb_artistdetails
        # musicbrainz
        elif site == 'musicbrainz':
            url = MUSICBRAINZURL % (MUSICBRAINZDETAILS % param)
            artistscraper = musicbrainz_artistdetails
        # fanarttv
        elif site == 'fanarttv':
            url = FANARTVURL % (param, FANARTVKEY)
            artistscraper = fanarttv_artistart
        # discogs
        elif site == 'discogs':
            # search by artistname if we do not have an url
            if 'artist' in param:
                url = DISCOGSURL % (DISCOGSSEARCH % (urllib.parse.quote_plus(param['artist']), DISCOGSKEY, DISCOGSSECRET))
                artistresult = get_data(url, jsonformat)
                if not artistresult:
                    return
                artists = discogs_artistfind(artistresult, param['artist'])
                if not artists:
                    return
                # use the first of the most relevant matches
                param['url'] = max(artists, key=lambda k: k['relevance'])['dcid']
            url = DISCOGSURL % (DISCOGSDETAILS % (param['url'], DISCOGSKEY, DISCOGSSECRET))
            artistscraper = discogs_artistdetails
        # wikipedia
        elif site == 'wikipedia':
            url = WIKIPEDIAURL % param
            artistscraper = wikipedia_artistdetails
        elif site == 'wikidata':
            # resolve wikidata to wikipedia url
            result = get_data(WIKIDATAURL % param, jsonformat)
            try:
                artist = result['entities'][param]['sitelinks']['enwiki']['url'].rsplit('/', 1)[1]
            except (KeyError, IndexError, TypeError, AttributeError):
                # no data at all or no english wikipedia page for this entity
                return
            site = 'wikipedia'
            url = WIKIPEDIAURL % artist
            artistscraper = wikipedia_artistdetails
        # allmusic
        elif site == 'allmusic':
            jsonformat = False
            # search by artistname if we do not have an url
            if 'artist' in param:
                url = ALLMUSICURL % urllib.parse.quote_plus(param['artist'])
                artistresult = get_data(url, jsonformat)
                if not artistresult:
                    return
                artists = allmusic_artistfind(artistresult, param['artist'])
                if not artists:
                    return
                param['url'] = artists[0]['url']
            url = param['url']
            artistscraper = allmusic_artistdetails
        else:
            return
        result = get_data(url, jsonformat)
        if not result:
            return
        artistresults = artistscraper(result)
        if not artistresults:
            return
        details[site] = artistresults
        # get allmusic / discogs discography if we searched by artistname
        if site in ('discogs', 'allmusic') and 'artist' in param:
            albums = self.get_discography(param, site, {})
            if albums:
                details[site]['albums'] = albums[site]
        return details

    def get_discography(self, param, site, discography):
        """Scrape the album list from *site* and store it in *discography*.

        param is the value inserted into the theaudiodb url, or a dict with
        an 'url' key for discogs and allmusic. Returns *discography* on
        success, None otherwise.
        """
        jsonformat = True
        if site == 'theaudiodb':
            # theaudiodb - discography
            albumsurl = AUDIODBURL % (AUDIODBKEY, AUDIODBDISCOGRAPHY % param)
            scraper = theaudiodb_artistalbums
        elif site == 'discogs':
            # discogs - discography
            albumsurl = DISCOGSURL % (DISCOGSDISCOGRAPHY % (param['url'], DISCOGSKEY, DISCOGSSECRET))
            scraper = discogs_artistalbums
        elif site == 'allmusic':
            # allmusic - discography
            jsonformat = False
            albumsurl = param['url'] + '/discography'
            scraper = allmusic_artistalbums
        else:
            return
        albumdata = get_data(albumsurl, jsonformat)
        if not albumdata:
            return
        albumresults = scraper(albumdata)
        if not albumresults:
            return
        discography[site] = albumresults
        return discography

    def get_videolinks(self, param, site, videolinks):
        """Scrape the music video links from *site* and store them in *videolinks*.

        Only 'theaudiodb' is supported. Returns *videolinks* on success,
        None otherwise.
        """
        if site == 'theaudiodb':
            # theaudiodb mvid links
            mvidurl = AUDIODBURL % (AUDIODBKEY, AUDIODBMVIDS % param)
            scraper = theaudiodb_mvids
        else:
            return
        mviddata = get_data(mvidurl, True)
        if not mviddata:
            return
        mvidresults = scraper(mviddata)
        if not mvidresults:
            return
        videolinks[site] = mvidresults
        return videolinks

    def compile_results(self, details):
        """Merge the per-site *details* into a single result dict.

        Sites are merged in the order of _SOURCE_ART, so non-empty values of
        later sites replace those of earlier ones. The final 'thumb' list is
        built from all collected thumbs, then extras, then fanart, each
        with the last merged site first. The user preferences are applied
        before the result is returned.
        """
        result = {}
        art = {'thumb': [], 'fanart': [], 'extras': []}
        # merge metadata results, start with the least accurate sources
        for site, artkeys in self._SOURCE_ART:
            if site not in details:
                continue
            for k, v in details[site].items():
                if not v:
                    continue
                result[k] = v
                if k in artkeys:
                    art[k].append(v)
        # merge artwork from all scrapers
        if result:
            thumbnails = []
            # extra art and fanart are added to the end of the thumb list
            for kind in ('thumb', 'extras', 'fanart'):
                # artwork from most accurate sources first
                for artlist in reversed(art[kind]):
                    thumbnails.extend(artlist)
            if thumbnails:
                result['thumb'] = thumbnails
        return self.user_prefs(details, result)

    def user_prefs(self, details, result):
        """Overwrite fields of *result* with data from the user's preferred sites.

        Returns the modified *result*.
        """
        lang = 'biography' + self.lang
        if self.bio == 'theaudiodb' and 'theaudiodb' in details:
            # prefer the configured language, fall back to english
            if lang in details['theaudiodb']:
                result['biography'] = details['theaudiodb'][lang]
            elif 'biographyEN' in details['theaudiodb']:
                result['biography'] = details['theaudiodb']['biographyEN']
        elif (self.bio in details) and ('biography' in details[self.bio]):
            result['biography'] = details[self.bio]['biography']
        # (preferred site, key to take from that site)
        preferences = (
            (self.discog, 'albums'),
            (self.genre, 'genre'),
            (self.style, 'styles'),
            (self.mood, 'moods'),
        )
        for site, field in preferences:
            if (site in details) and (field in details[site]):
                result[field] = details[site][field]
        return result

    def return_search(self, data):
        """Add one directory item per search result in *data*."""
        items = []
        for item in data:
            listitem = xbmcgui.ListItem(item['artist'], offscreen=True)
            listitem.setArt({'thumb': item['thumb']})
            listitem.setProperty('artist.genre', item['genre'])
            listitem.setProperty('artist.born', item['born'])
            listitem.setProperty('relevance', item['relevance'])
            for field in ('type', 'gender', 'disambiguation'):
                if field in item:
                    listitem.setProperty('artist.%s' % field, item[field])
            # the ids are passed back to us as the url of the 'getdetails' action
            url = {'artist': item['artist']}
            for field in ('mbartistid', 'dcid'):
                if field in item:
                    url[field] = item[field]
            items.append((json.dumps(url), listitem, True))
        if items:
            xbmcplugin.addDirectoryItems(handle=int(sys.argv[1]), items=items)

    def return_nfourl(self, item):
        """Add a directory item whose url is the json encoded *item*."""
        listitem = xbmcgui.ListItem(offscreen=True)
        xbmcplugin.addDirectoryItem(handle=int(sys.argv[1]), url=json.dumps(item), listitem=listitem, isFolder=True)

    def return_resolved(self, item):
        """Resolve to a listitem whose path is the json encoded *item*."""
        listitem = xbmcgui.ListItem(path=json.dumps(item), offscreen=True)
        xbmcplugin.setResolvedUrl(handle=int(sys.argv[1]), succeeded=True, listitem=listitem)

    def return_details(self, item):
        """Resolve to a listitem carrying the artist details in *item*.

        Nothing is returned to Kodi when *item* has no 'artist' key.
        """
        if 'artist' not in item:
            return
        listitem = xbmcgui.ListItem(item['artist'], offscreen=True)
        for field, prop in self._DETAIL_PROPERTIES:
            if field in item:
                listitem.setProperty(prop, item[field])
        # list properties are numbered starting at 1
        if 'thumb' in item:
            listitem.setProperty('artist.thumbs', str(len(item['thumb'])))
            for count, thumb in enumerate(item['thumb'], 1):
                listitem.setProperty('artist.thumb%i.url' % count, thumb['image'])
                listitem.setProperty('artist.thumb%i.preview' % count, thumb['preview'])
                listitem.setProperty('artist.thumb%i.aspect' % count, thumb['aspect'])
        if 'albums' in item:
            listitem.setProperty('artist.albums', str(len(item['albums'])))
            for count, album in enumerate(item['albums'], 1):
                listitem.setProperty('artist.album%i.title' % count, album['title'])
                listitem.setProperty('artist.album%i.year' % count, album['year'])
                if 'musicbrainzreleasegroupid' in album:
                    listitem.setProperty('artist.album%i.musicbrainzreleasegroupid' % count, album['musicbrainzreleasegroupid'])
        if 'mvids' in item:
            listitem.setProperty('artist.videolinks', str(len(item['mvids'])))
            for count, mvid in enumerate(item['mvids'], 1):
                listitem.setProperty('artist.videolink%i.title' % count, mvid['title'])
                listitem.setProperty('artist.videolink%i.mbtrackid' % count, mvid['mbtrackid'])
                listitem.setProperty('artist.videolink%i.url' % count, mvid['url'])
                listitem.setProperty('artist.videolink%i.thumb' % count, mvid['thumb'])
        xbmcplugin.setResolvedUrl(handle=int(sys.argv[1]), succeeded=True, listitem=listitem)