#!/usr/bin/env python3
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# Use this script to retrieve information from https://crashreport.libreoffice.org
# about a specific version of LibreOffice.
# Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/

import argparse
import math
import os
import sys
import urllib.parse

import requests
from bs4 import BeautifulSoup
from datetime import datetime

def convert_str_to_date(value):
    value = value.replace('.', '')
    value = value.replace('March', 'Mar')
    value = value.replace('April', 'Apr')
    value = value.replace('June', 'Jun')
    value = value.replace('July', 'Jul')
    value = value.replace('Sept', 'Sep')
    # reset the time, leaving only the date
    value = ", ".join(value.split(", ")[:-1])
    dtDate = datetime.strptime(value, '%b %d, %Y')

    return dtDate.strftime('%y/%m/%d')
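
# Illustrative conversion (the site's exact date format is an assumption):
#   "Jan. 10, 2023, 10:15 a.m." -> periods stripped and time dropped -> "Jan 10, 2023"
#   -> parsed with '%b %d, %Y' and returned as "23/01/10".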

def parse_version_url(url):
    crashReports = {}

    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        sys.exit(1)

    table = soup.find("table", {"id": "data-table"}).tbody
    for tr in table.find_all("tr"):
        td_list = tr.find_all("td")
        crashName = td_list[0].a.text.strip()
        crashNumber = int(td_list[1].text.strip())
        firstCrashDate = convert_str_to_date(td_list[5].text.strip())
        lastCrashDate = convert_str_to_date(td_list[6].text.strip())
        crashReports[crashName] = [crashNumber, firstCrashDate, lastCrashDate]

    return crashReports
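
# For reference (illustrative shapes, matching how the __main__ block uses the result):
#   crashes = parse_version_url("https://crashreport.libreoffice.org/stats/version/7.2.0.4?limit=1000&days=30")
#   crashes["<signature>"] -> [crash_count, "YY/MM/DD first seen", "YY/MM/DD last seen"]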

def parse_reports_and_get_most_recent_report_from_last_page(url):
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    count = 0
    try:
        os_tab = soup.find("table", {"id": "os_tab"}).tbody
    except AttributeError:
        print("os_tab not found")
        raise

    tr_list = os_tab.find_all("tr")
    for tr in tr_list:
        td_list = tr.find_all("td")
        count += int(td_list[1].text.strip())

    # There are 50 reports on each page.
    # Go to the last page based on the total count to get a recent report.
    last_page = math.ceil(count / 50)
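    # Worked example: 120 reports across all pages -> math.ceil(120 / 50) = 3,
    # so the most recent reports are fetched from "?page=3".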

    if last_page > 1:
        url = url + "?page=" + str(last_page)

    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    reports = soup.find("div", {"id": "reports"}).tbody
    ID, currentID = "", ""
    version, currentVersion = "", ""
    OS, currentOS = "", ""

    tr_list = reports.find_all("tr")
    for tr in tr_list:
        td_list = tr.find_all("td")

        currentID = td_list[0].a.text.strip()
        currentVersion = td_list[2].text.strip().split(': ')[1]
        currentOS = td_list[3].text.strip()

        # Prefer the most recent Windows report:
        # symbols on Linux are generally not very informative.
        if currentOS == "windows" and currentVersion > version:
            version = currentVersion
            ID = currentID
            OS = currentOS

    # Fall back to the last report seen if no Windows report was found.
    if not version:
        version = currentVersion
    if not ID:
        ID = currentID
    if not OS:
        OS = currentOS

    return count, ID, version, OS

def parse_details_and_get_info(url, gitRepo):
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    details = soup.find("div", {"id": "details"}).tbody
    tr_list = details.find_all("tr")
    reason = tr_list[8].td.text.strip()

    stack = ""
    codeLine = ""

    count = 0
    frames = soup.find("div", {"id": "frames"}).tbody
    for tr in frames.find_all("tr"):
        td_list = tr.find_all("td")
        source = td_list[3].text.strip()
        if source and count <= 10:
            source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")
            stack += source + "\n"
            count += 1

            codeFile = source.split(":")[0]
            codeNumber = source.split(":")[1]
            try:
                with open(os.path.join(gitRepo, codeFile)) as f:
                    lines = f.readlines()
                    for index, line in enumerate(lines):
                        if index + 1 == int(codeNumber):
                            codeLine += line.strip().replace("\"", "'") + "\n"
            except FileNotFoundError:
                codeLine += "\n"
                continue

    if stack:
        # multiline field: quote it so the TSV stays one record per crash
        stack = "\"" + stack + "\""

    if codeLine:
        # multiline field: quote it so the TSV stays one record per crash
        codeLine = "\"" + codeLine + "\""

    metadata = soup.find("div", {"id": "metadata"}).tbody
    tr_list = metadata.find_all("tr")
    unoCommands = ""
    for tr in tr_list:
        if tr.th.text.strip() == "Last-4-Uno-Commands":
            unoCommands = tr.td.text.strip()

    return reason, stack, codeLine, unoCommands

if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('--version', action='store', dest="version", required=True)
    parser.add_argument('--repository', action="store", dest="repository", required=True)

    args = parser.parse_args()

    crashes = parse_version_url(
            "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")

    print(str(len(crashes)) + " crash reports in version " + args.version)

    crashesInFile = []
    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
    print("Using " + fileName)

    bInsertHeader = False
    if os.path.exists(fileName):
        # Remember the signatures already in the file so they are skipped below.
        with open(fileName, "r") as f:
            lines = f.readlines()
            for line in lines:
                crashesInFile.append(line.split("\t")[0])
    else:
        bInsertHeader = True

    with open(fileName, "a") as f:
        if bInsertHeader:
            line = '\t'.join(["Name", "Count", "First report", "Last Report",
                "ID", "Version", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n'])
            f.write(line)
            f.flush()

        for k, lDate in crashes.items():
            if k not in crashesInFile:
                print("Parsing " + k)
                try:
                    crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                            "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
                    crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
                            "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                    line = '\t'.join([k, str(crashCount), lDate[1], lDate[2],
                        crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, unoCommands, '\n'])
                    f.write(line)
                    f.flush()
                except (requests.exceptions.Timeout, AttributeError):
                    continue
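
# Example run (version number is illustrative):
#   ./crashreportScraper.py --version 7.2.0.4 --repository ~/git/libreoffice
# appends one tab-separated row per new crash signature to crashes_7_2_0_4.csv.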