Update git submodules
[LibreOffice.git] / bin / crashreportScraper.py
blob876570d3a028a7829d60dfec0185ca8239fc4935
1 #!/usr/bin/env python3
3 # This file is part of the LibreOffice project.
5 # This Source Code Form is subject to the terms of the Mozilla Public
6 # License, v. 2.0. If a copy of the MPL was not distributed with this
7 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 # Use this script to retrieve information from https://crashreport.libreoffice.org
10 # about a specific version of LibreOffice
11 # Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/
13 import argparse
14 import requests
15 from bs4 import BeautifulSoup
16 import sys
17 import os
18 import math
19 from datetime import datetime
20 import urllib.parse
def convert_str_to_date(value):
    """Parse a crashreport date string such as 'March 14, 2021, 10:55 a.m.'
    into a datetime carrying only the date part."""
    # Drop the periods Django inserts in abbreviations ('Sept.', 'a.m.').
    cleaned = value.replace('.', '')
    # Map Django's long (or oddly abbreviated) month names onto the
    # three-letter forms that strptime's %b understands.
    for long_name, abbrev in (('March', 'Mar'), ('April', 'Apr'),
                              ('June', 'Jun'), ('July', 'Jul'),
                              ('Sept', 'Sep')):
        cleaned = cleaned.replace(long_name, abbrev)
    # Discard the trailing time component, keeping only 'Mon DD, YYYY'.
    cleaned = ", ".join(cleaned.split(", ")[:-1])
    return datetime.strptime(cleaned, '%b %d, %Y')
def parse_version_url(url):
    """Scrape the per-version overview page and return a dict mapping each
    crash signature to [report count, first report date, last report date].

    Exits the process on a request timeout (this is the script's first
    network access, so there is nothing to fall back to).
    """
    try:
        page = requests.get(url, timeout=200).text
        soup = BeautifulSoup(page, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        sys.exit(1)

    reports = {}
    body = soup.find("table", {"id": "data-table"}).tbody
    for row in body.find_all("tr"):
        cells = row.find_all("td")
        signature = cells[0].a.text.strip()
        reports[signature] = [
            int(cells[1].text.strip()),                  # total report count
            convert_str_to_date(cells[5].text.strip()),  # first report date
            convert_str_to_date(cells[6].text.strip()),  # last report date
        ]
    return reports
def _version_key(version):
    """Sort key comparing dotted version strings numerically, component by
    component ('7.10.0.1' > '7.2.0.4'); non-numeric components count as 0."""
    return [int(part) if part.isdigit() else 0
            for part in version.split('.')]

def parse_reports_and_get_most_recent_report_from_last_page(url):
    """Scrape a crash-signature page.

    Sums the report count across all OSes, then jumps to the last pagination
    page (50 reports per page) and picks the report with the highest
    LibreOffice version, preferring Windows reports because symbols on Linux
    are not very informative generally.

    Returns a tuple (total count, report ID, version string, OS name).
    Raises requests.exceptions.Timeout on a request timeout and
    AttributeError when the page lacks the expected structure.
    """
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    count = 0
    try:
        os_tab = soup.find("table", {"id": "os_tab"}).tbody
    except AttributeError:
        print("os_tab not found")
        raise

    for tr in os_tab.find_all("tr"):
        td_list = tr.find_all("td")
        count += int(td_list[1].text.strip())

    # There are 50 reports on each page.
    # Go to the last page based on the total count to get a recent report.
    last_page = math.ceil(count / 50)
    if last_page > 1:
        url = url + "?page=" + str(last_page)
        try:
            html_text = requests.get(url, timeout=200).text
            soup = BeautifulSoup(html_text, 'html.parser')
        except requests.exceptions.Timeout:
            print("Timeout")
            raise

    reports = soup.find("div", {"id": "reports"}).tbody
    ID, currentID = "", ""
    version, currentVersion = "", ""
    OS, currentOS = "", ""

    for tr in reports.find_all("tr"):
        td_list = tr.find_all("td")
        currentID = td_list[0].a.text.strip()
        currentVersion = td_list[2].text.strip().split(': ')[1]
        currentOS = td_list[3].text.strip()

        # Keep the Windows report with the highest version. Compare versions
        # numerically: a plain string compare would rank '7.2' above '7.10'.
        if currentOS == "windows" and _version_key(currentVersion) > _version_key(version):
            version = currentVersion
            ID = currentID
            OS = currentOS

    # No Windows report found: fall back to the last report on the page.
    if not version:
        version = currentVersion
    if not ID:
        ID = currentID
    if not OS:
        OS = currentOS

    return count, ID, version, OS
def parse_details_and_get_info(url, gitRepo):
    """Scrape one crash_details page.

    Returns (reason, stack, codeLine, unoCommands): the crash reason text,
    the repo-relative source frames of the stack (quoted as one multi-line
    CSV field), the corresponding source lines read from the local git
    checkout at *gitRepo* (also quoted), and the 'Last-4-Uno-Commands'
    metadata value (empty string when absent).

    Raises requests.exceptions.Timeout on a request timeout and
    AttributeError when the page lacks the expected structure.
    """
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    details = soup.find("div", {"id": "details"}).tbody
    tr_list = details.find_all("tr")
    reason = tr_list[8].td.text.strip()

    stack = ""
    codeLine = ""

    count = 0
    frames = soup.find("div", {"id": "frames"}).tbody
    for tr in frames.find_all("tr"):
        td_list = tr.find_all("td")
        source = td_list[3].text.strip()
        # NOTE(review): 'count <= 10' keeps up to 11 frames with a source,
        # not 10 — presumably intended as "roughly the top of the stack".
        if source and count <= 10:
            # Strip the Windows buildslave prefix to get a repo-relative path.
            source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")
            stack += source + "\n"
            count += 1

            codeFile = source.split(":")[0]
            codeNumber = source.split(":")[1]
            try:
                with open(os.path.join(gitRepo, codeFile)) as f:
                    lines = f.readlines()
                # Index the wanted line directly instead of scanning the whole
                # file; an out-of-range line number contributes nothing, which
                # matches the previous scan-based behavior.
                lineno = int(codeNumber)
                if 1 <= lineno <= len(lines):
                    # Turn double quotes into single quotes so the line can
                    # live inside a quoted CSV field.
                    codeLine += lines[lineno - 1].strip().replace("\"", "'") + "\n"
            except FileNotFoundError:
                # Generated files etc. aren't in the checkout; keep the line
                # counts of 'stack' and 'codeLine' aligned.
                codeLine += "\n"
                continue

    if stack:
        # Quote so the embedded newlines survive as one CSV field.
        stack = "\"" + stack + "\""

    if codeLine:
        # Quote so the embedded newlines survive as one CSV field.
        codeLine = "\"" + codeLine + "\""

    metadata = soup.find("div", {"id": "metadata"}).tbody
    tr_list = metadata.find_all("tr")
    unoCommands = ""
    for tr in tr_list:
        if tr.th.text.strip() == "Last-4-Uno-Commands":
            unoCommands = tr.td.text.strip()

    return reason, stack, codeLine, unoCommands
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--version', action='store', dest="version", required=True)
    parser.add_argument('--repository', action="store", dest="repository", required=True)
    args = parser.parse_args()

    # Fetch the crash overview for this version (up to 1000 signatures,
    # last 30 days).
    crashes = parse_version_url(
        "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")

    print(str(len(crashes)) + " crash reports in version " + args.version)

    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
    print("Using " + fileName)

    # Remember the signatures already written so a rerun only appends
    # new ones; a fresh file gets the header row.
    crashesInFile = []
    bInsertHeader = not os.path.exists(fileName)
    if not bInsertHeader:
        with open(fileName, "r") as f:
            crashesInFile = [row.split("\t")[0] for row in f.readlines()]

    with open(fileName, "a") as f:
        if bInsertHeader:
            f.write('\t'.join(["Name", "Ratio", "Count", "First report", "Last Report",
                    "ID", "Version", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n']))
            f.flush()

        for signature, dates in crashes.items():
            if signature in crashesInFile:
                continue
            print("Parsing " + signature)
            try:
                crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                    "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(signature))
                crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
                    "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                # Average reports per day over the signature's lifetime.
                ratio = round(crashCount / ((dates[2] - dates[1]).days + 1), 2)
                f.write('\t'.join([signature, str(ratio), str(crashCount),
                        dates[1].strftime('%y/%m/%d'), dates[2].strftime('%y/%m/%d'),
                        crashID, crashVersion, crashReason, crashOS,
                        crashStack, codeLine, unoCommands, '\n']))
                f.flush()
            except (requests.exceptions.Timeout, AttributeError):
                # Skip signatures whose pages time out or fail to parse;
                # they can be retried on the next run.
                continue