bin/crashreportScraper.py

   1 #!/usr/bin/env python3
   2
   3 # This file is part of the LibreOffice project.
   4 #
   5 # This Source Code Form is subject to the terms of the Mozilla Public
   6 # License, v. 2.0. If a copy of the MPL was not distributed with this
   7 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
   8
   9 # Use this script to retrieve information from https://crashreport.libreoffice.org
  10 # about a specific version of LibreOffice
  11 # Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/
  12
  13 import argparse
  14 import requests
  15 from bs4 import BeautifulSoup
  16 import sys
  17 import os
  18 from datetime import datetime
  19 import urllib.parse
  20 import re
  21
  22 def convert_str_to_date(value):
  23     value = value.replace('.', '')
  24     value = value.replace('March', 'Mar')
  25     value = value.replace('April', 'Apr')
  26     value = value.replace('June', 'Jun')
  27     value = value.replace('July', 'Jul')
  28     value = value.replace('Sept', 'Sep')
  29     # reset the time leaving the date
  30     value = ", ".join(value.split(", ")[:-1])
  31     return datetime.strptime(value, '%b %d, %Y')
  32
  33 def parse_version_url(url):
  34     crashReports = {}
  35
  36     try:
  37         html_text = requests.get(url, timeout=200).text
  38         soup = BeautifulSoup(html_text, 'html.parser')
  39     except requests.exceptions.Timeout:
  40         print("Timeout requesting " + url)
  41         sys.exit(1)
  42
  43     table = soup.find("table", {"id": "data-table"}).tbody
  44     for tr in table.find_all("tr"):
  45         td_list = tr.find_all("td")
  46         crashName = td_list[0].a.text.strip()
  47         crashNumber = int(td_list[1].text.strip())
  48         firstCrashDate = convert_str_to_date(td_list[5].text.strip())
  49         lastCrashDate = convert_str_to_date(td_list[6].text.strip())
  50         crashReports[crashName] = [crashNumber, firstCrashDate, lastCrashDate]
  51
  52     return crashReports
  53
  54 def parse_reports_and_get_most_recent_report_from_last_page(url):
  55     try:
  56         html_text = requests.get(url, timeout=200).text
  57         soup = BeautifulSoup(html_text, 'html.parser')
  58     except requests.exceptions.Timeout:
  59         print("Timeout")
  60         raise
  61
  62     count = 0
  63     try:
  64         os_tab = soup.find("table", {"id": "os_tab"}).tbody
  65     except AttributeError:
  66         print("os_tab not found")
  67         raise
  68
  69     tr_list = os_tab.find_all("tr")
  70     for tr in tr_list:
  71         td_list = tr.find_all("td")
  72         count += int(td_list[1].text.strip())
  73
  74     reports = soup.find("div", {"id": "reports"}).tbody
  75     ID, currentID = "", ""
  76     version, currentVersion = 0, 0
  77     OS, currentOS = "", ""
  78
  79     tr_list = reports.find_all("tr")
  80     for tr in tr_list:
  81         td_list = tr.find_all("td")
  82
  83         currentID = td_list[0].a.text.strip()
  84         currentVersion = int(''.join(re.findall("\d+", td_list[2].text)))
  85         currentOS = td_list[3].text.strip()
  86
  87         # get most recent version
  88         # symbols on linux are not very informative generally
  89         if currentOS == "windows" and currentVersion > version:
  90             version = currentVersion
  91             ID = currentID
  92             OS = currentOS
  93
  94     if not ID:
  95         ID = currentID
  96
  97     if not OS:
  98         OS = currentOS
  99
 100     return count, ID, OS
 101
 102 def parse_details_and_get_info(url, gitRepo):
 103     try:
 104         html_text = requests.get(url, timeout=200).text
 105         soup = BeautifulSoup(html_text, 'html.parser')
 106     except requests.exceptions.Timeout:
 107         print("Timeout")
 108         raise
 109
 110     details = soup.find("div", {"id": "details"}).tbody
 111     tr_list = details.find_all("tr")
 112     reason = tr_list[8].td.text.strip()
 113
 114     stack = ""
 115     codeLine = ""
 116
 117     count = 0
 118     frames = soup.find("div", {"id": "frames"}).tbody
 119     for tr in frames.find_all("tr"):
 120         td_list = tr.find_all("td")
 121         source = td_list[3].text.strip()
 122         if source and count <= 10:
 123             source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")
 124             stack += source + "\n"
 125             count += 1
 126
 127             codeFile = source.split(":")[0]
 128             codeNumber = source.split(":")[1]
 129             try:
 130                 with open(os.path.join(gitRepo, codeFile)) as f:
 131                     lines = f.readlines()
 132                     for index, line in enumerate(lines):
 133                         if index + 1 == int(codeNumber):
 134                             codeLine += line.strip().replace("\"", "'") + "\n"
 135             except FileNotFoundError:
 136                 codeLine += "\n"
 137                 continue
 138
 139     if stack:
 140         #multiline
 141         stack = "\"" + stack + "\""
 142
 143     if codeLine:
 144         #multiline
 145         codeLine = "\"" + codeLine + "\""
 146
 147     metadata = soup.find("div", {"id": "metadata"}).tbody
 148     tr_list = metadata.find_all("tr")
 149     unoCommands = ""
 150     for tr in tr_list:
 151         if tr.th.text.strip() == "Last-4-Uno-Commands":
 152             unoCommands = tr.td.text.strip()
 153
 154     return reason, stack, codeLine, unoCommands
 155
 156
 157 if __name__ == '__main__':
 158
 159     parser = argparse.ArgumentParser()
 160
 161     parser.add_argument('--version', action='store', dest="version", required=True)
 162     parser.add_argument('--repository', action="store", dest="repository", required=True)
 163
 164     args = parser.parse_args()
 165
 166     crashes = parse_version_url(
 167             "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")
 168
 169     print(str(len(crashes)) + " crash reports in version " + args.version)
 170
 171     crashesInFile = []
 172     fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
 173     print("Using " + fileName)
 174
 175     bInsertHeader = False
 176     if os.path.exists(fileName):
 177         with open(fileName, "r") as f:
 178             lines = f.readlines()
 179             for line in lines:
 180                 crashesInFile.append(line.split("\t")[0])
 181     else:
 182         bInsertHeader = True
 183
 184     with open(fileName, "a") as f:
 185         if bInsertHeader:
 186             line = '\t'.join(["Name", "Ratio", "Count", "First report", "Last Report",
 187                 "ID", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n'])
 188             f.write(line)
 189             f.flush()
 190
 191         for k, lDate in crashes.items():
 192             if k not in crashesInFile:
 193                 print("Parsing " + k)
 194                 try:
 195                     crashCount, crashID, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
 196                             "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
 197                     crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
 198                             "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
 199                     ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
 200                     line = '\t'.join([k, str(ratio), str(crashCount) , lDate[1].strftime('%y/%m/%d'), lDate[2].strftime('%y/%m/%d'),
 201                             crashID, crashReason, crashOS, crashStack, codeLine, unoCommands, '\n'])
 202                     f.write(line)
 203                     f.flush()
 204                 except (requests.exceptions.Timeout, AttributeError):
 205                     continue