fix other mandelbrot variants
[mu.git] / browse-slack / convert_slack.py
blob2faf6d36af583ee48ce6bd7124876bff62524dc2
1 # Import JSON from a Slack admin export into a disk image Mu can load.
3 # Dependencies: python, wget, awk, sed, netpbm
5 # Step 1: download a Slack archive and unpack it to some directory
7 # Step 2: download user avatars to subdirectory images/ and convert them to PPM in subdirectory images/ppm/
8 # grep image_72 . -r |grep -v users.json |awk '{print $3}' |sort |uniq |sed 's/?.*//' |sed 's,\\,,g' |sed 's/"//' |sed 's/",$//' > images.list
9 # mkdir images
10 # cd images
11 # wget -i ../images.list --wait=0.1
12 # # fix some lying images
13 # for f in $(file *.jpg |grep PNG |sed 's/:.*//'); do mv -i $f $(echo $f |sed 's/\.jpg$/.png/'); done
14 # #
15 # mkdir ppm
16 # for f in *.jpg; do jpegtopnm $f |pnmtoplainpnm > ppm/$(echo $f |sed 's/\.jpg$//').ppm; done
17 # for f in *.png; do pngtopnm $f |pnmtoplainpnm > ppm/$(echo $f |sed 's/\.png$//').ppm; done
19 # (Depending on your OS, you may need to replace pnmtoplainpnm with `pnmtopnm -plain`. Some places also have a pnm2pnm.
20 # I don't understand it either.)
22 # Step 3: construct a disk image out of the archives and avatars
23 # cd .. # go back to the top-level archive directory
24 # dd if=/dev/zero of=data.img count=201600 # 100MB
25 # python path/to/convert_slack.py > data.out 2> data.err
26 # dd if=data.out of=data.img conv=notrunc
27 # Currently this process yields errors for ~300 items (~70 posts and their comments)
28 # on the Future of Coding group (https://futureofcoding.org/community). We fail to load those.
30 # Notes on input format:
31 # Redundant 'type' field that's always 'message'. Probably an "enterprise" feature.
33 from sys import argv, stderr
34 import json
35 from os import listdir
36 from os.path import isfile, join, basename, splitext
37 from urllib.parse import urlparse
38 import traceback
def look_up_ppm_image(url):
    """Return the contents of the converted PPM avatar for `url`, if present.

    The file is looked up as images/ppm/<stem>.ppm (relative to the current
    directory), where <stem> is the last component of the URL's path with its
    extension removed; any query string is ignored.  Returns None when no such
    file exists.
    """
    url_path = urlparse(url).path
    stem, _extension = splitext(basename(url_path))
    ppm_path = f"images/ppm/{stem}.ppm"
    if not isfile(ppm_path):
        return None
    with open(ppm_path) as ppm_file:
        return ppm_file.read()
# Emit one record per user in users.json:
#   (id "@name" real-name [avatar PPM text, empty if no converted avatar])
# and remember each user's position in the file so that items can refer to
# their author by index.
user_idx = {}
with open('users.json') as users_file:
    users = json.load(users_file)
for idx, user in enumerate(users):
    user.setdefault('real_name', '')  # treat a missing real name as empty
    user_id = json.dumps(user['id'])
    handle = user['name']
    real_name = json.dumps(user['real_name'])
    avatar = look_up_ppm_image(user['profile']['image_72']) or ''
    print(f"({user_id} \"@{handle}\" {real_name} [{avatar}])")
    user_idx[user['id']] = idx
def by(item):
    """Return the index in user_idx of whoever posted `item`.

    A bot message that carries a 'username' is attributed to that name, which
    is given the next unused index the first time it is seen.  Any other item
    is attributed to item['user']; KeyError propagates if that key is missing
    or the user is not in user_idx.
    """
    is_named_bot = item.get('subtype') == 'bot_message' and 'username' in item
    if not is_named_bot:
        return user_idx[item['user']]
    # setdefault's arguments are evaluated before any insertion, so a new name
    # receives the current size of the table as its index.
    return user_idx.setdefault(item['username'], len(user_idx))
# Maps the 'ts' of an item to the index it was emitted at.
item_idx = {}


def parent(item):
    """Return the index of the item that `item` is a comment on, or -1.

    An item is a comment when it has a 'thread_ts' that differs from its own
    'ts'; its parent is then looked up in item_idx by that 'thread_ts', and
    KeyError propagates if the parent has not been recorded there.
    """
    if 'thread_ts' not in item or item['thread_ts'] == item['ts']:
        return -1  # not a comment
    return item_idx[item['thread_ts']]
# Gather every message from every channel listed in channels.json.  Each
# channel has a directory named after it; every file in that directory is read
# (in sorted filename order) as a JSON array of messages, and each message is
# tagged with the name of the channel it came from.
items = []
# Read the channel list inside a `with` so the file handle is closed promptly
# rather than left for the garbage collector.
with open('channels.json') as f:
    channels = json.load(f)
for channel in channels:
    for filename in sorted(listdir(channel['name'])):
        with open(join(channel['name'], filename)) as f:
            for item in json.load(f):
                item['channel_name'] = channel['name']
                items.append(item)
# Emit one record per item, in ascending order of 'ts':
#   (ts parent-index channel-name user-index text)
# An item whose record cannot be built because of a KeyError (a missing field,
# or a lookup failure in parent() or by()) is reported on stderr and skipped.
idx = 0
for item in sorted(items, key=lambda entry: entry['ts']):
    try:
        record = f"({json.dumps(item['ts'])} {parent(item)} {json.dumps(item['channel_name'])} {by(item)} {json.dumps(item['text'])})"
    except KeyError:
        traceback.print_exc(file=stderr)
        stderr.write(repr(item)+'\n')
        continue
    print(record)
    item_idx[item['ts']] = idx
    idx += 1  # skipped items never consume an index