Add files via upload
[LoveSoStrong.git] / parse_message_file.py
blob477a2404255ecb964cba799b0facd93e80ecd620
1 from __future__ import absolute_import, division, print_function, unicode_literals
2 import json
3 import gzip
4 import bz2
5 import sys
6 import io
8 try:
9 import lzma
10 except ImportError:
11 try:
12 from backports import lzma
13 except ImportError:
14 lzma = None
16 try:
17 from io import StringIO
18 except ImportError:
19 try:
20 from cStringIO import StringIO
21 except ImportError:
22 from StringIO import StringIO
24 PY2 = sys.version_info[0] == 2
def open_compressed_file(filename):
    """Return a UTF-8 text reader for *filename*, chosen by its extension.

    ``.gz`` is opened with gzip, ``.bz2`` with bz2, ``.xz``/``.lzma`` with
    lzma, and anything else as a plain text file.

    Raises:
        ImportError: for ``.xz``/``.lzma`` names when no lzma module could
            be imported.
    """
    if filename.endswith(('.xz', '.lzma')):
        if not lzma:
            raise ImportError("lzma module is not available")
        opener, mode = lzma.open, 'rt'
    elif filename.endswith('.bz2'):
        opener, mode = bz2.open, 'rt'
    elif filename.endswith('.gz'):
        opener, mode = gzip.open, 'rt'
    else:
        opener, mode = io.open, 'r'
    return opener(filename, mode, encoding='utf-8')
def save_compressed_file(data, filename):
    """Write the text *data* to *filename* as UTF-8, compressing by extension.

    ``.gz`` uses gzip, ``.bz2`` uses bz2, ``.xz`` uses the XZ container and
    ``.lzma`` uses the legacy LZMA-alone container; any other name is written
    as plain text.

    Raises:
        ImportError: for ``.xz``/``.lzma`` names when no lzma module could
            be imported.
    """
    if filename.endswith('.gz'):
        target = gzip.open(filename, 'wt', encoding='utf-8')
    elif filename.endswith('.bz2'):
        target = bz2.open(filename, 'wt', encoding='utf-8')
    elif filename.endswith(('.xz', '.lzma')):
        if not lzma:
            raise ImportError("lzma module is not available")
        if filename.endswith('.lzma'):
            # lzma.open() writes the XZ container by default; a ".lzma" name
            # promises the legacy LZMA-alone format, so request it explicitly.
            target = lzma.open(filename, 'wt', format=lzma.FORMAT_ALONE,
                               encoding='utf-8')
        else:
            target = lzma.open(filename, 'wt', encoding='utf-8')
    else:
        target = io.open(filename, 'w', encoding='utf-8')
    with target as file:
        file.write(data)
def parse_line(line):
    """Split a ``key: value`` line at its first colon.

    Returns the stripped ``(key, value)`` pair, or ``(None, None)`` when the
    line contains no colon at all.
    """
    name, colon, rest = line.partition(":")
    if not colon:
        return None, None
    return name.strip(), rest.strip()
def validate_non_negative_integer(value, variable_name, line_number):
    """Convert *value* to an ``int`` and require it to be zero or greater.

    Args:
        value: the raw value to convert (normally the text after the colon).
        variable_name: field name used in the error message.
        line_number: input line number used in the error message.

    Returns:
        The converted integer.

    Raises:
        ValueError: if *value* cannot be converted to an integer or is
            negative.
    """
    try:
        int_value = int(value)
    except (TypeError, ValueError):
        # TypeError covers values such as None, which int() rejects with a
        # different exception type than malformed text.
        int_value = None
    if int_value is None or int_value < 0:
        raise ValueError("{0} on line {1} should be a non-negative integer, but got '{2}'.".format(variable_name, line_number, value))
    return int_value
def parse_file(filename, validate_only=False, verbose=False):
    """Read *filename* (optionally compressed) and parse its lines.

    The file is opened with ``open_compressed_file`` and its lines are handed
    to ``parse_lines`` together with the two flags; whatever ``parse_lines``
    returns is returned unchanged.
    """
    with open_compressed_file(filename) as source:
        return parse_lines(list(source), validate_only, verbose)
def parse_string(data, validate_only=False, verbose=False):
    """Parse archive text held in the string *data*.

    The text is split into lines through a ``StringIO`` buffer and passed to
    ``parse_lines`` with the two flags; its result is returned unchanged.
    """
    buffered_lines = list(StringIO(data))
    return parse_lines(buffered_lines, validate_only, verbose)
def parse_lines(lines, validate_only=False, verbose=False):
    """Parse archive text, given as a list of lines, into service dictionaries.

    The input is a sequence of ``--- ... ---`` section markers and
    ``key: value`` lines.  Each ``--- Start Archive Service ---`` /
    ``--- End Archive Service ---`` pair yields one dictionary with the keys
    ``Users``, ``MessageThreads``, ``Categories``, ``Interactions`` and
    ``Categorization`` (plus ``Entry`` / ``Service`` when present).

    Args:
        lines: list of text lines; each line is stripped before inspection.
        validate_only: when true, return a status tuple instead of the data.
        verbose: when true, print a trace line for each recognised item.

    Returns:
        The list of parsed services, or, when *validate_only* is true, a
        ``(ok, message, offending_line)`` tuple: ``(True, "", "")`` on
        success, ``(False, "Error: ...", line)`` on failure.

    Raises:
        Exception: any parsing error is re-raised unchanged unless
            *validate_only* is true.
    """
    services = []
    current_service = None
    in_user_list = False
    in_message_list = False
    in_message_thread = False
    in_user_info = False
    in_message_post = False
    in_bio_body = False
    in_message_body = False
    in_comment_section = False
    in_include_service = False
    in_include_users = False
    in_include_messages = False
    in_category_list = False
    in_include_categories = False
    in_categorization_list = False
    include_files = []
    user_id = None
    current_bio = None
    current_message = None
    current_thread = None
    current_category = None
    categorization_values = []
    category_ids = {'Categories': set(), 'Forums': set()}

    def parse_include_files(file_list):
        """Parse every listed file and return all their services in order."""
        included_services = []
        for include_file in file_list:
            # validate_only is deliberately not forwarded: in that mode
            # parse_file returns a status tuple, not services.  Errors in an
            # included file still propagate as exceptions and are reported by
            # the handler at the bottom of this function.
            included_services.extend(parse_file(include_file, False, verbose))
        return included_services

    def parse_include_users(file_list):
        """Merge the ``Users`` mappings of all services in the listed files."""
        users = {}
        for service in parse_include_files(file_list):
            users.update(service['Users'])
        return users

    def parse_include_messages(file_list):
        """Concatenate the ``MessageThreads`` of all services in the listed files."""
        messages = []
        for service in parse_include_files(file_list):
            messages.extend(service['MessageThreads'])
        return messages

    def parse_include_categories(file_list):
        """Concatenate the ``Categories`` of all services in the listed files."""
        categories = []
        for service in parse_include_files(file_list):
            categories.extend(service['Categories'])
        return categories

    try:
        for line_number, line in enumerate(lines, 1):
            line = line.strip()
            if line == "--- Include Service Start ---":
                in_include_service = True
                include_files = []
                if verbose:
                    print("Line {0}: {1} (Starting include service section)".format(line_number, line))
                continue
            elif line == "--- Include Service End ---":
                in_include_service = False
                if verbose:
                    print("Line {0}: {1} (Ending include service section)".format(line_number, line))
                services.extend(parse_include_files(include_files))
                continue
            elif in_include_service:
                include_files.append(line)
                if verbose:
                    print("Line {0}: {1} (Including file for service)".format(line_number, line))
                continue
            elif line == "--- Include Users Start ---":
                in_include_users = True
                include_files = []
                if verbose:
                    print("Line {0}: {1} (Starting include users section)".format(line_number, line))
                continue
            elif line == "--- Include Users End ---":
                in_include_users = False
                if verbose:
                    print("Line {0}: {1} (Ending include users section)".format(line_number, line))
                if current_service:
                    current_service['Users'].update(parse_include_users(include_files))
                continue
            elif in_include_users:
                include_files.append(line)
                if verbose:
                    print("Line {0}: {1} (Including file for users)".format(line_number, line))
                continue
            elif line == "--- Include Messages Start ---":
                in_include_messages = True
                include_files = []
                if verbose:
                    print("Line {0}: {1} (Starting include messages section)".format(line_number, line))
                continue
            elif line == "--- Include Messages End ---":
                in_include_messages = False
                if verbose:
                    print("Line {0}: {1} (Ending include messages section)".format(line_number, line))
                if current_service:
                    current_service['MessageThreads'].extend(parse_include_messages(include_files))
                continue
            elif in_include_messages:
                include_files.append(line)
                if verbose:
                    print("Line {0}: {1} (Including file for messages)".format(line_number, line))
                continue
            elif line == "--- Include Categories Start ---":
                in_include_categories = True
                include_files = []
                if verbose:
                    print("Line {0}: {1} (Starting include categories section)".format(line_number, line))
                continue
            elif line == "--- Include Categories End ---":
                in_include_categories = False
                if verbose:
                    print("Line {0}: {1} (Ending include categories section)".format(line_number, line))
                if current_service:
                    current_service['Categories'].extend(parse_include_categories(include_files))
                    for category in current_service['Categories']:
                        category_ids[category['Type']].add(category['ID'])
                continue
            elif in_include_categories:
                include_files.append(line)
                if verbose:
                    print("Line {0}: {1} (Including file for categories)".format(line_number, line))
                continue
            elif line == "--- Start Archive Service ---":
                current_service = {'Users': {}, 'MessageThreads': [], 'Categories': [], 'Interactions': [], 'Categorization': []}
                if verbose:
                    print("Line {0}: {1} (Starting new archive service)".format(line_number, line))
                continue
            elif line == "--- End Archive Service ---":
                services.append(current_service)
                current_service = None
                if verbose:
                    print("Line {0}: {1} (Ending archive service)".format(line_number, line))
                continue
            elif line == "--- Start Comment Section ---":
                in_comment_section = True
                if verbose:
                    print("Line {0}: {1} (Starting comment section)".format(line_number, line))
                continue
            elif line == "--- End Comment Section ---":
                in_comment_section = False
                if verbose:
                    print("Line {0}: {1} (Ending comment section)".format(line_number, line))
                continue
            elif in_comment_section:
                if verbose:
                    print("Line {0}: {1} (Comment)".format(line_number, line))
                continue
            elif line == "--- Start Category List ---":
                in_category_list = True
                current_category = {}
                if verbose:
                    print("Line {0}: {1} (Starting category list)".format(line_number, line))
                continue
            elif line == "--- End Category List ---":
                in_category_list = False
                if current_category:
                    # "Kind" carries "<type>, <level>".
                    kind_split = current_category.get('Kind', '').split(",")
                    current_category['Type'] = kind_split[0].strip()
                    current_category['Level'] = kind_split[1].strip()
                    if current_category['Type'] not in categorization_values:
                        raise ValueError("Invalid 'Type' value '{0}' on line {1}. Expected one of {2}.".format(current_category['Type'], line_number, categorization_values))
                    if current_category['InSub'] != 0 and current_category['InSub'] not in category_ids[current_category['Type']]:
                        raise ValueError("InSub value '{0}' on line {1} does not match any existing ID values.".format(current_category['InSub'], line_number))
                    current_service['Categories'].append(current_category)
                    category_ids[current_category['Type']].add(current_category['ID'])
                current_category = None
                if verbose:
                    print("Line {0}: {1} (Ending category list)".format(line_number, line))
                continue
            elif line == "--- Start Categorization List ---":
                in_categorization_list = True
                current_service['Categorization'] = {}
                if verbose:
                    print("Line {0}: {1} (Starting categorization list)".format(line_number, line))
                continue
            elif line == "--- End Categorization List ---":
                in_categorization_list = False
                if verbose:
                    print("Line {0}: {1} (Ending categorization list)".format(line_number, line))
                categorization_values = set(current_service['Categorization'].keys())
                continue
            elif current_service is not None:
                # Free text inside a bio or message body must not be read as a
                # "key: value" header (a body line such as "Title: ..." would
                # otherwise overwrite thread or message fields).  The two
                # conditions mirror exactly when the body-append branches
                # further down are reachable.
                collecting_bio = (in_user_list and in_user_info
                                  and in_bio_body and current_bio is not None)
                collecting_message = (not (in_user_list and in_user_info)
                                      and in_message_list and in_message_thread
                                      and in_message_body
                                      and current_message is not None
                                      and 'Message' in current_message)
                if collecting_bio or collecting_message:
                    key, value = None, None
                else:
                    key, value = parse_line(line)

                if key == "Entry":
                    current_service['Entry'] = validate_non_negative_integer(value, "Entry", line_number)
                elif key == "Service":
                    current_service['Service'] = value
                elif key == "Categories":
                    current_service['Categorization']['Categories'] = [category.strip() for category in value.split(",")]
                    if verbose:
                        print("Line {0}: Categories set to {1}".format(line_number, current_service['Categorization']['Categories']))
                elif key == "Forums":
                    current_service['Categorization']['Forums'] = [forum.strip() for forum in value.split(",")]
                    if verbose:
                        print("Line {0}: Forums set to {1}".format(line_number, current_service['Categorization']['Forums']))
                elif in_category_list:
                    if key == "Kind":
                        current_category['Kind'] = value
                    elif key == "ID":
                        current_category['ID'] = validate_non_negative_integer(value, "ID", line_number)
                    elif key == "InSub":
                        current_category['InSub'] = validate_non_negative_integer(value, "InSub", line_number)
                    elif key == "Headline":
                        current_category['Headline'] = value
                    elif key == "Description":
                        current_category['Description'] = value
                elif line == "--- Start User List ---":
                    in_user_list = True
                    if verbose:
                        print("Line {0}: {1} (Starting user list)".format(line_number, line))
                    continue
                elif line == "--- End User List ---":
                    in_user_list = False
                    if verbose:
                        print("Line {0}: {1} (Ending user list)".format(line_number, line))
                    continue
                elif line == "--- Start User Info ---":
                    in_user_info = True
                    if verbose:
                        print("Line {0}: {1} (Starting user info)".format(line_number, line))
                    continue
                elif line == "--- End User Info ---":
                    in_user_info = False
                    user_id = None
                    # Drop an unterminated bio so it cannot leak into the
                    # next user's record.
                    in_bio_body = False
                    current_bio = None
                    if verbose:
                        print("Line {0}: {1} (Ending user info)".format(line_number, line))
                    continue
                elif line == "--- Start Message List ---":
                    in_message_list = True
                    if verbose:
                        print("Line {0}: {1} (Starting message list)".format(line_number, line))
                    continue
                elif line == "--- End Message List ---":
                    in_message_list = False
                    if verbose:
                        print("Line {0}: {1} (Ending message list)".format(line_number, line))
                    continue
                elif line == "--- Start Message Thread ---":
                    in_message_thread = True
                    current_thread = {'Title': '', 'Messages': []}
                    if verbose:
                        print("Line {0}: {1} (Starting message thread)".format(line_number, line))
                    continue
                elif line == "--- End Message Thread ---":
                    in_message_thread = False
                    current_service['MessageThreads'].append(current_thread)
                    current_thread = None
                    if verbose:
                        print("Line {0}: {1} (Ending message thread)".format(line_number, line))
                    continue
                elif line == "--- Start Message Post ---":
                    in_message_post = True
                    current_message = {}
                    if verbose:
                        print("Line {0}: {1} (Starting message post)".format(line_number, line))
                    continue
                elif line == "--- End Message Post ---":
                    in_message_post = False
                    if current_message:
                        current_thread['Messages'].append(current_message)
                    current_message = None
                    if verbose:
                        print("Line {0}: {1} (Ending message post)".format(line_number, line))
                    continue
                elif in_message_list and key == "Interactions":
                    current_service['Interactions'] = [interaction.strip() for interaction in value.split(",")]
                    if verbose:
                        print("Line {0}: Interactions set to {1}".format(line_number, current_service['Interactions']))

                if in_user_list and in_user_info:
                    if key == "User":
                        user_id = validate_non_negative_integer(value, "User", line_number)
                        current_service['Users'][user_id] = {'Bio': ""}
                        if verbose:
                            print("Line {0}: User ID set to {1}".format(line_number, user_id))
                    elif key == "Name":
                        if user_id is not None:
                            current_service['Users'][user_id]['Name'] = value
                            if verbose:
                                print("Line {0}: Name set to {1}".format(line_number, value))
                    elif key == "Handle":
                        if user_id is not None:
                            current_service['Users'][user_id]['Handle'] = value
                            if verbose:
                                print("Line {0}: Handle set to {1}".format(line_number, value))
                    elif key == "Location":
                        if user_id is not None:
                            current_service['Users'][user_id]['Location'] = value
                            if verbose:
                                print("Line {0}: Location set to {1}".format(line_number, value))
                    elif key == "Joined":
                        if user_id is not None:
                            current_service['Users'][user_id]['Joined'] = value
                            if verbose:
                                print("Line {0}: Joined date set to {1}".format(line_number, value))
                    elif key == "Birthday":
                        if user_id is not None:
                            current_service['Users'][user_id]['Birthday'] = value
                            if verbose:
                                print("Line {0}: Birthday set to {1}".format(line_number, value))
                    elif line == "--- Start Bio Body ---":
                        if user_id is not None:
                            current_bio = []
                            in_bio_body = True
                            if verbose:
                                print("Line {0}: Starting bio body".format(line_number))
                    elif line == "--- End Bio Body ---":
                        if user_id is not None and current_bio is not None:
                            current_service['Users'][user_id]['Bio'] = "\n".join(current_bio)
                            current_bio = None
                            in_bio_body = False
                            if verbose:
                                print("Line {0}: Ending bio body".format(line_number))
                    elif in_bio_body and current_bio is not None:
                        current_bio.append(line)
                        if verbose:
                            print("Line {0}: Adding to bio body: {1}".format(line_number, line))
                elif in_message_list and in_message_thread:
                    if key == "Thread":
                        current_thread['Thread'] = validate_non_negative_integer(value, "Thread", line_number)
                        if verbose:
                            print("Line {0}: Thread ID set to {1}".format(line_number, value))
                    elif key == "Category":
                        current_thread['Category'] = [category.strip() for category in value.split(",")]
                        if verbose:
                            print("Line {0}: Category set to {1}".format(line_number, current_thread['Category']))
                    elif key == "Forum":
                        current_thread['Forum'] = [forum.strip() for forum in value.split(",")]
                        if verbose:
                            print("Line {0}: Forum set to {1}".format(line_number, current_thread['Forum']))
                    elif key == "Title":
                        current_thread['Title'] = value
                        if verbose:
                            print("Line {0}: Title set to {1}".format(line_number, value))
                    elif key == "Author":
                        current_message['Author'] = value
                        if verbose:
                            print("Line {0}: Author set to {1}".format(line_number, value))
                    elif key == "Time":
                        current_message['Time'] = value
                        if verbose:
                            print("Line {0}: Time set to {1}".format(line_number, value))
                    elif key == "Date":
                        current_message['Date'] = value
                        if verbose:
                            print("Line {0}: Date set to {1}".format(line_number, value))
                    elif key == "Type":
                        message_type = value
                        if message_type not in current_service['Interactions']:
                            raise ValueError("Unexpected message type '{0}' found on line {1}. Expected one of {2}".format(message_type, line_number, current_service['Interactions']))
                        current_message['Type'] = message_type
                        if verbose:
                            print("Line {0}: Type set to {1}".format(line_number, message_type))
                    elif key == "Post":
                        post_value = validate_non_negative_integer(value, "Post", line_number)
                        current_message['Post'] = post_value
                        # The thread records every Post id seen so far (as a
                        # set) so later "Nested" references can be checked.
                        if 'post_ids' not in current_thread:
                            current_thread['post_ids'] = set()
                        current_thread['post_ids'].add(post_value)
                        if verbose:
                            print("Line {0}: Post ID set to {1}".format(line_number, post_value))
                    elif key == "Nested":
                        nested_value = validate_non_negative_integer(value, "Nested", line_number)
                        # 0 means "not nested"; anything else must name a
                        # Post already seen in this thread.
                        if nested_value != 0 and nested_value not in current_thread.get('post_ids', set()):
                            raise ValueError(
                                "Nested value '{0}' on line {1} does not match any existing Post values in the current thread. Existing Post IDs: {2}".format(
                                    nested_value, line_number, list(current_thread.get('post_ids', set()))))
                        current_message['Nested'] = nested_value
                        if verbose:
                            print("Line {0}: Nested set to {1}".format(line_number, nested_value))
                    elif line == "--- Start Message Body ---":
                        if current_message is not None:
                            current_message['Message'] = []
                            in_message_body = True
                            if verbose:
                                print("Line {0}: Starting message body".format(line_number))
                    elif line == "--- End Message Body ---":
                        if current_message is not None and 'Message' in current_message:
                            current_message['Message'] = "\n".join(current_message['Message'])
                            in_message_body = False
                            if verbose:
                                print("Line {0}: Ending message body".format(line_number))
                    elif in_message_body and current_message is not None and 'Message' in current_message:
                        current_message['Message'].append(line)
                        if verbose:
                            print("Line {0}: Adding to message body: {1}".format(line_number, line))
    except Exception as e:
        if validate_only:
            # Report the failure together with the raw line being processed.
            return False, "Error: {0}".format(str(e)), lines[line_number - 1]
        else:
            raise

    if validate_only:
        return True, "", ""

    return services
def display_services(services):
    """Print a human-readable summary of every service to standard output.

    For each service this prints the entry number, service name,
    interactions, the categorization mapping (when non-empty), then the
    category list, the user list and the message threads.

    Args:
        services: list of service dictionaries in the shape produced by the
            parser; the keys read here must be present or ``KeyError`` is
            raised.
    """
    for service in services:
        print("Service Entry: {0}".format(service['Entry']))
        print("Service: {0}".format(service['Service']))
        print("Interactions: {0}".format(', '.join(service['Interactions'])))
        if 'Categorization' in service and service['Categorization']:
            for category_type, category_levels in service['Categorization'].items():
                # Show "<type>: <levels>"; the second field is the joined levels.
                print("{0}: {1}".format(category_type, ', '.join(category_levels)))
        print("Category List:")
        for category in service['Categories']:
            print(" Type: {0}, Level: {1}".format(category['Type'], category['Level']))
            print(" ID: {0}".format(category['ID']))
            print(" InSub: {0}".format(category['InSub']))
            print(" Headline: {0}".format(category['Headline']))
            print(" Description: {0}".format(category['Description'].strip()))
            print("")
        print("User List:")
        for user_id, user_info in service['Users'].items():
            print(" User ID: {0}".format(user_id))
            print(" Name: {0}".format(user_info['Name']))
            print(" Handle: {0}".format(user_info['Handle']))
            print(" Location: {0}".format(user_info.get('Location', '')))
            print(" Joined: {0}".format(user_info.get('Joined', '')))
            print(" Birthday: {0}".format(user_info.get('Birthday', '')))
            print(" Bio: {0}".format(user_info.get('Bio', '').strip()))
            print("")
        print("Message Threads:")
        for idx, thread in enumerate(service['MessageThreads']):
            print(" --- Message Thread {0} ---".format(idx+1))
            if thread['Title']:
                print(" Title: {0}".format(thread['Title']))
            if 'Category' in thread:
                print(" Category: {0}".format(', '.join(thread['Category'])))
            if 'Forum' in thread:
                print(" Forum: {0}".format(', '.join(thread['Forum'])))
            for message in thread['Messages']:
                print(" {0} ({1} on {2}): [{3}] Post ID: {4} Nested: {5}".format(
                    message['Author'], message['Time'], message['Date'], message['Type'], message['Post'], message['Nested']))
                print(" {0}".format(message['Message'].strip()))
                print("")
def _json_default(obj):
    """Encode sets as lists for ``json.dumps``; reject every other type."""
    if isinstance(obj, (set, frozenset)):
        try:
            return sorted(obj)
        except TypeError:
            # Mixed, unorderable members: keep them all, in arbitrary order.
            return list(obj)
    raise TypeError("Object of type {0} is not JSON serializable".format(type(obj).__name__))


def to_json(services):
    """Convert the services data structure to a JSON string (2-space indent).

    Parsed threads carry their seen post ids as a ``set`` under
    ``post_ids``; sets are not JSON types, so they are written as lists
    (sorted when the members can be ordered).
    """
    return json.dumps(services, indent=2, default=_json_default)
def from_json(json_str):
    """Decode *json_str* and return the resulting services data structure."""
    decoded = json.loads(json_str)
    return decoded
def load_from_json_file(json_filename):
    """Read *json_filename* (optionally compressed) and decode its JSON.

    The file is opened through ``open_compressed_file``; the decoded
    services data structure is returned.
    """
    with open_compressed_file(json_filename) as source:
        text = source.read()
    return json.loads(text)
def save_to_json_file(services, json_filename):
    """Serialise *services* as indented JSON and write it to *json_filename*.

    The file is written through ``save_compressed_file``, so the extension
    selects the compression.  Parsed threads carry a ``set`` under
    ``post_ids``; sets are not JSON types, so they are written as lists
    (sorted when the members can be ordered).
    """
    def encode_set(obj):
        # json.dumps calls this only for objects it cannot encode itself.
        if isinstance(obj, (set, frozenset)):
            try:
                return sorted(obj)
            except TypeError:
                return list(obj)
        raise TypeError("Object of type {0} is not JSON serializable".format(type(obj).__name__))

    json_data = json.dumps(services, indent=2, default=encode_set)
    save_compressed_file(json_data, json_filename)
def services_to_string(services, line_ending="lf"):
    """Render *services* back into the archive text format.

    Each service is written as an archive-service section containing, in
    order: entry and service name, the user list, the categorization list
    (if non-empty), one category list per category, and the message list.

    Args:
        services: list of service dictionaries.
        line_ending: ``"lf"``, ``"cr"`` or ``"crlf"``; any other value falls
            back to ``"\\n"``.

    Returns:
        The rendered text, lines joined by the chosen separator (no trailing
        separator).
    """
    out = []
    add = out.append

    for service in services:
        add("--- Start Archive Service ---")
        add("Entry: {0}".format(service['Entry']))
        add("Service: {0}".format(service['Service']))

        add("--- Start User List ---")
        for user_id, user_info in service['Users'].items():
            add("--- Start User Info ---")
            add("User: {0}".format(user_id))
            # Name and Handle are mandatory; the remaining fields are optional.
            for field in ('Name', 'Handle'):
                add("{0}: {1}".format(field, user_info[field]))
            for field in ('Location', 'Joined', 'Birthday'):
                if field in user_info:
                    add("{0}: {1}".format(field, user_info[field]))
            if 'Bio' in user_info:
                add("Bio:")
                add("--- Start Bio Body ---")
                out.extend(user_info['Bio'].split("\n"))
                add("--- End Bio Body ---")
            add("--- End User Info ---")
        add("--- End User List ---")

        categorization = service.get('Categorization')
        if categorization:
            add("--- Start Categorization List ---")
            for category_type, category_levels in categorization.items():
                add("{0}: {1}".format(category_type, ', '.join(category_levels)))
            add("--- End Categorization List ---")

        for category in service.get('Categories') or []:
            add("--- Start Category List ---")
            add("Kind: {0}, {1}".format(category['Type'], category['Level']))
            for field in ('ID', 'InSub', 'Headline', 'Description'):
                add("{0}: {1}".format(field, category[field]))
            add("--- End Category List ---")

        add("--- Start Message List ---")
        add("Interactions: {0}".format(', '.join(service['Interactions'])))
        for thread in service['MessageThreads']:
            add("--- Start Message Thread ---")
            add("Thread: {0}".format(thread['Thread']))
            for field in ('Category', 'Forum'):
                if field in thread:
                    add("{0}: {1}".format(field, ', '.join(thread[field])))
            if 'Title' in thread:
                add("Title: {0}".format(thread['Title']))
            for message in thread['Messages']:
                add("--- Start Message Post ---")
                for field in ('Author', 'Time', 'Date', 'Type', 'Post', 'Nested'):
                    add("{0}: {1}".format(field, message[field]))
                add("Message:")
                add("--- Start Message Body ---")
                out.extend(message['Message'].split("\n"))
                add("--- End Message Body ---")
                add("--- End Message Post ---")
            add("--- End Message Thread ---")
        add("--- End Message List ---")

        add("--- End Archive Service ---")

    separators = {"lf": "\n", "cr": "\r", "crlf": "\r\n"}
    return separators.get(line_ending, "\n").join(out)
def save_services_to_file(services, filename, line_ending="lf"):
    """Write *services* to *filename* in the archive text format.

    The text is produced by ``services_to_string`` with the given
    *line_ending* and written through ``save_compressed_file``.
    """
    rendered = services_to_string(services, line_ending)
    save_compressed_file(rendered, filename)