Update git submodules
[LibreOffice.git] / bin / crashreportScraper.py
blob876570d3a028a7829d60dfec0185ca8239fc4935
1 #!/usr/bin/env python3
3 # This file is part of the LibreOffice project.
5 # This Source Code Form is subject to the terms of the Mozilla Public
6 # License, v. 2.0. If a copy of the MPL was not distributed with this
7 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 # Use this script to retrieve information from https://crashreport.libreoffice.org
10 # about a specific version of LibreOffice
11 # Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/
13 import argparse
14 import requests
15 from bs4 import BeautifulSoup
16 import sys
17 import os
18 import math
19 from datetime import datetime
20 import urllib.parse
def convert_str_to_date(value):
    """Parse a crashreport date string such as 'March 14, 2021, 10:55 a.m.'
    into a datetime carrying only the date part."""
    # Drop the periods Django inserts in abbreviations ('Sept.', 'a.m.').
    cleaned = value.replace('.', '')
    # Map Django's long (or oddly abbreviated) month names onto the
    # three-letter forms that strptime's %b understands.
    for long_name, abbrev in (('March', 'Mar'), ('April', 'Apr'),
                              ('June', 'Jun'), ('July', 'Jul'),
                              ('Sept', 'Sep')):
        cleaned = cleaned.replace(long_name, abbrev)
    # Discard the trailing time component, keeping only 'Mon DD, YYYY'.
    cleaned = ", ".join(cleaned.split(", ")[:-1])
    return datetime.strptime(cleaned, '%b %d, %Y')
def parse_version_url(url):
    """Scrape the per-version overview page and return a dict mapping each
    crash signature to [report count, first report date, last report date].

    Exits the process on a request timeout (this is the script's first
    network access, so there is nothing to fall back to).
    """
    try:
        page = requests.get(url, timeout=200).text
        soup = BeautifulSoup(page, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        sys.exit(1)

    reports = {}
    body = soup.find("table", {"id": "data-table"}).tbody
    for row in body.find_all("tr"):
        cells = row.find_all("td")
        signature = cells[0].a.text.strip()
        reports[signature] = [
            int(cells[1].text.strip()),                  # total report count
            convert_str_to_date(cells[5].text.strip()),  # first report date
            convert_str_to_date(cells[6].text.strip()),  # last report date
        ]
    return reports
def _version_key(version):
    """Sort key comparing dotted version strings numerically, component by
    component ('7.10.0.1' > '7.2.0.4'); non-numeric components count as 0."""
    return [int(part) if part.isdigit() else 0
            for part in version.split('.')]

def parse_reports_and_get_most_recent_report_from_last_page(url):
    """Scrape a crash-signature page.

    Sums the report count across all OSes, then jumps to the last pagination
    page (50 reports per page) and picks the report with the highest
    LibreOffice version, preferring Windows reports because symbols on Linux
    are not very informative generally.

    Returns a tuple (total count, report ID, version string, OS name).
    Raises requests.exceptions.Timeout on a request timeout and
    AttributeError when the page lacks the expected structure.
    """
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    count = 0
    try:
        os_tab = soup.find("table", {"id": "os_tab"}).tbody
    except AttributeError:
        print("os_tab not found")
        raise

    for tr in os_tab.find_all("tr"):
        td_list = tr.find_all("td")
        count += int(td_list[1].text.strip())

    # There are 50 reports on each page.
    # Go to the last page based on the total count to get a recent report.
    last_page = math.ceil(count / 50)
    if last_page > 1:
        url = url + "?page=" + str(last_page)
        try:
            html_text = requests.get(url, timeout=200).text
            soup = BeautifulSoup(html_text, 'html.parser')
        except requests.exceptions.Timeout:
            print("Timeout")
            raise

    reports = soup.find("div", {"id": "reports"}).tbody
    ID, currentID = "", ""
    version, currentVersion = "", ""
    OS, currentOS = "", ""

    for tr in reports.find_all("tr"):
        td_list = tr.find_all("td")
        currentID = td_list[0].a.text.strip()
        currentVersion = td_list[2].text.strip().split(': ')[1]
        currentOS = td_list[3].text.strip()

        # Keep the Windows report with the highest version. Compare versions
        # numerically: a plain string compare would rank '7.2' above '7.10'.
        if currentOS == "windows" and _version_key(currentVersion) > _version_key(version):
            version = currentVersion
            ID = currentID
            OS = currentOS

    # No Windows report found: fall back to the last report on the page.
    if not version:
        version = currentVersion
    if not ID:
        ID = currentID
    if not OS:
        OS = currentOS

    return count, ID, version, OS
def parse_details_and_get_info(url, gitRepo):
    """Scrape one crash_details page.

    Returns (reason, stack, codeLine, unoCommands): the crash reason text,
    the repo-relative source frames of the stack (quoted as one multi-line
    CSV field), the corresponding source lines read from the local git
    checkout at *gitRepo* (also quoted), and the 'Last-4-Uno-Commands'
    metadata value (empty string when absent).

    Raises requests.exceptions.Timeout on a request timeout and
    AttributeError when the page lacks the expected structure.
    """
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    details = soup.find("div", {"id": "details"}).tbody
    tr_list = details.find_all("tr")
    reason = tr_list[8].td.text.strip()

    stack = ""
    codeLine = ""

    count = 0
    frames = soup.find("div", {"id": "frames"}).tbody
    for tr in frames.find_all("tr"):
        td_list = tr.find_all("td")
        source = td_list[3].text.strip()
        # NOTE(review): 'count <= 10' keeps up to 11 frames with a source,
        # not 10 — presumably intended as "roughly the top of the stack".
        if source and count <= 10:
            # Strip the Windows buildslave prefix to get a repo-relative path.
            source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")
            stack += source + "\n"
            count += 1

            codeFile = source.split(":")[0]
            codeNumber = source.split(":")[1]
            try:
                with open(os.path.join(gitRepo, codeFile)) as f:
                    lines = f.readlines()
                # Index the wanted line directly instead of scanning the whole
                # file; an out-of-range line number contributes nothing, which
                # matches the previous scan-based behavior.
                lineno = int(codeNumber)
                if 1 <= lineno <= len(lines):
                    # Turn double quotes into single quotes so the line can
                    # live inside a quoted CSV field.
                    codeLine += lines[lineno - 1].strip().replace("\"", "'") + "\n"
            except FileNotFoundError:
                # Generated files etc. aren't in the checkout; keep the line
                # counts of 'stack' and 'codeLine' aligned.
                codeLine += "\n"
                continue

    if stack:
        # Quote so the embedded newlines survive as one CSV field.
        stack = "\"" + stack + "\""

    if codeLine:
        # Quote so the embedded newlines survive as one CSV field.
        codeLine = "\"" + codeLine + "\""

    metadata = soup.find("div", {"id": "metadata"}).tbody
    tr_list = metadata.find_all("tr")
    unoCommands = ""
    for tr in tr_list:
        if tr.th.text.strip() == "Last-4-Uno-Commands":
            unoCommands = tr.td.text.strip()

    return reason, stack, codeLine, unoCommands
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--version', action='store', dest="version", required=True)
    parser.add_argument('--repository', action="store", dest="repository", required=True)
    args = parser.parse_args()

    # Fetch the crash overview for this version (up to 1000 signatures,
    # last 30 days).
    crashes = parse_version_url(
        "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")

    print(str(len(crashes)) + " crash reports in version " + args.version)

    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
    print("Using " + fileName)

    # Remember the signatures already written so a rerun only appends
    # new ones; a fresh file gets the header row.
    crashesInFile = []
    bInsertHeader = not os.path.exists(fileName)
    if not bInsertHeader:
        with open(fileName, "r") as f:
            crashesInFile = [row.split("\t")[0] for row in f.readlines()]

    with open(fileName, "a") as f:
        if bInsertHeader:
            f.write('\t'.join(["Name", "Ratio", "Count", "First report", "Last Report",
                    "ID", "Version", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n']))
            f.flush()

        for signature, dates in crashes.items():
            if signature in crashesInFile:
                continue
            print("Parsing " + signature)
            try:
                crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                    "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(signature))
                crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
                    "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                # Average reports per day over the signature's lifetime.
                ratio = round(crashCount / ((dates[2] - dates[1]).days + 1), 2)
                f.write('\t'.join([signature, str(ratio), str(crashCount),
                        dates[1].strftime('%y/%m/%d'), dates[2].strftime('%y/%m/%d'),
                        crashID, crashVersion, crashReason, crashOS,
                        crashStack, codeLine, unoCommands, '\n']))
                f.flush()
            except (requests.exceptions.Timeout, AttributeError):
                # Skip signatures whose pages time out or fail to parse;
                # they can be retried on the next run.
                continue