update credits
[LibreOffice.git] / bin / crashreportScraper.py
blob37636f2c462eadcda0c9d4dfa5c23a622fff4e20
1 #!/usr/bin/env python3
3 # This file is part of the LibreOffice project.
5 # This Source Code Form is subject to the terms of the Mozilla Public
6 # License, v. 2.0. If a copy of the MPL was not distributed with this
7 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
9 # Use this script to retrieve information from https://crashreport.libreoffice.org
10 # about a specific version of LibreOffice
11 # Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/
13 import argparse
14 import requests
15 from bs4 import BeautifulSoup
16 import sys
17 import os
18 from datetime import datetime
19 import urllib.parse
20 import re
def convert_str_to_date(value):
    """Parse a timestamp string into a datetime that carries only the date.

    Handles text such as "March 5, 2021, 3:45 p.m.": dots are removed,
    spelled-out month names are shortened so that '%b' can match them, and
    the last ", "-separated component (the time) is dropped before parsing.

    Raises ValueError when what remains does not match '%b %d, %Y'.
    """
    # Applied in this order; the dot removal has to come first so that
    # forms like "Sept." end up as "Sep".
    substitutions = (
        ('.', ''),
        ('March', 'Mar'),
        ('April', 'Apr'),
        ('June', 'Jun'),
        ('July', 'Jul'),
        ('Sept', 'Sep'),
    )
    for old, new in substitutions:
        value = value.replace(old, new)

    # reset the time leaving the date
    date_parts = value.split(", ")[:-1]
    return datetime.strptime(", ".join(date_parts), '%b %d, %Y')
def parse_version_url(url):
    """Fetch the per-version statistics page and collect its crash signatures.

    Returns a dict mapping each crash name (first column of the table with
    id "data-table") to a list [crash count, first crash date, last crash
    date], taken from columns 1, 5 and 6 of the same row.

    Prints a message and exits the process with status 1 when the request
    times out.
    """
    try:
        response = requests.get(url, timeout=200)
        soup = BeautifulSoup(response.text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        sys.exit(1)

    rows = soup.find("table", {"id": "data-table"}).tbody.find_all("tr")

    crashReports = {}
    for row in rows:
        cells = row.find_all("td")
        signature = cells[0].a.text.strip()
        crashReports[signature] = [
            int(cells[1].text.strip()),
            convert_str_to_date(cells[5].text.strip()),
            convert_str_to_date(cells[6].text.strip()),
        ]

    return crashReports
def _version_key(text):
    """Return the numeric components found in *text* as a tuple of ints.

    "7.2.0.4" becomes (7, 2, 0, 4), so versions compare component by
    component instead of as one concatenated number.
    """
    return tuple(int(number) for number in re.findall(r"\d+", text))


def parse_reports_and_get_most_recent_report_from_last_page(url):
    """Fetch a signature page and pick the report to inspect in detail.

    Returns a tuple (count, ID, OS):
      * count - sum of the second column over all rows of the "os_tab" table
      * ID    - report id of the Windows report with the highest version, or
                the id of the last listed report when there is no Windows one
      * OS    - OS of that same report (same fallback)

    Raises requests.exceptions.Timeout when the request times out and
    AttributeError when an expected element is missing from the page; both
    are announced on stdout before being re-raised where the original did so.
    """
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    try:
        os_tab = soup.find("table", {"id": "os_tab"}).tbody
    except AttributeError:
        print("os_tab not found")
        raise

    count = 0
    for tr in os_tab.find_all("tr"):
        td_list = tr.find_all("td")
        count += int(td_list[1].text.strip())

    reports = soup.find("div", {"id": "reports"}).tbody
    ID, currentID = "", ""
    OS, currentOS = "", ""
    # The empty tuple sorts below every real version tuple.
    version = ()

    for tr in reports.find_all("tr"):
        td_list = tr.find_all("td")

        currentID = td_list[0].a.text.strip()
        # Compare versions as tuples of ints: joining the digits into a
        # single number would rank 7.1.10.2 (71102) above 7.2.0.4 (7204).
        currentVersion = _version_key(td_list[2].text)
        currentOS = td_list[3].text.strip()

        # get most recent version
        # symbols on linux are not very informative generally
        if currentOS == "windows" and currentVersion > version:
            version = currentVersion
            ID = currentID
            OS = currentOS

    # No Windows report at all: fall back to the last row that was read.
    if not ID:
        ID = currentID

    if not OS:
        OS = currentOS

    return count, ID, OS
def _read_source_line(path, line_number):
    """Return line *line_number* (1-based) of *path*, stripped, or "" if unavailable."""
    try:
        with open(path) as f:
            for number, line in enumerate(f, start=1):
                if number == line_number:
                    return line.strip()
    except (OSError, UnicodeDecodeError):
        # Best effort: the frame may point at a file that is missing from
        # this checkout, is not a regular file, or cannot be decoded.
        return ""
    # The file has fewer than line_number lines.
    return ""


def parse_details_and_get_info(url, gitRepo):
    """Fetch a crash-details page and extract the data written to the report.

    url     - address of the crash details page
    gitRepo - path of a local source checkout used to look up code lines

    Returns a tuple (reason, stack, codeLine, unoCommands):
      * reason      - text of the cell in row 8 of the "details" table
      * stack       - source locations of the recorded frames, one per line,
                      wrapped in double quotes when non-empty
      * codeLine    - for every recorded frame, the matching line of source
                      code read from gitRepo (empty when it cannot be read),
                      one per line, wrapped in double quotes when non-empty
      * unoCommands - value of the "Last-4-Uno-Commands" metadata row, or ""

    Raises requests.exceptions.Timeout when the request times out and
    AttributeError when an expected element is missing from the page.
    """
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout")
        raise

    details = soup.find("div", {"id": "details"}).tbody
    tr_list = details.find_all("tr")
    # NOTE(review): row 8 is assumed to be the crash reason - confirm
    # against the current page layout.
    reason = tr_list[8].td.text.strip()

    stack = ""
    codeLine = ""

    count = 0
    frames = soup.find("div", {"id": "frames"}).tbody
    for tr in frames.find_all("tr"):
        td_list = tr.find_all("td")
        source = td_list[3].text.strip()
        # Only frames with source information are recorded, and only while
        # count <= 10 (i.e. at most eleven of them).
        if not source or count > 10:
            continue

        source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")
        stack += source + "\n"
        count += 1

        # Split on the LAST colon so that a path still carrying a drive
        # letter ("C:/...") is not cut in the wrong place, and tolerate
        # entries without a usable line number.
        codeFile, _, codeNumber = source.rpartition(":")
        text = ""
        if codeFile and codeNumber.isdecimal():
            text = _read_source_line(os.path.join(gitRepo, codeFile), int(codeNumber))
        # Always emit exactly one line per recorded frame so that codeLine
        # stays aligned with stack.
        codeLine += text.replace("\"", "'") + "\n"

    if stack:
        #multiline
        stack = "\"" + stack + "\""

    if codeLine:
        #multiline
        codeLine = "\"" + codeLine + "\""

    metadata = soup.find("div", {"id": "metadata"}).tbody
    tr_list = metadata.find_all("tr")
    unoCommands = ""
    for tr in tr_list:
        if tr.th.text.strip() == "Last-4-Uno-Commands":
            unoCommands = tr.td.text.strip()

    return reason, stack, codeLine, unoCommands
if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('--version', action='store', dest="version", required=True)
    parser.add_argument('--repository', action="store", dest="repository", required=True)

    args = parser.parse_args()

    crashes = parse_version_url(
            "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")

    print(str(len(crashes)) + " crash reports in version " + args.version)

    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
    print("Using " + fileName)

    # Signatures already present in the output file are not fetched again.
    # The first tab-separated field of every physical line is collected; a
    # set keeps the membership test in the loop below cheap.
    crashesInFile = set()
    bInsertHeader = not os.path.exists(fileName)
    if not bInsertHeader:
        with open(fileName, "r") as f:
            for line in f:
                crashesInFile.add(line.split("\t")[0])

    with open(fileName, "a") as f:
        if bInsertHeader:
            line = '\t'.join(["Name", "Ratio", "Count", "First report", "Last Report",
                "ID", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n'])
            f.write(line)
            f.flush()

        for k, lDate in crashes.items():
            if k in crashesInFile:
                continue

            print("Parsing " + k)
            try:
                crashCount, crashID, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                        "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
                crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
                        "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                # Average number of reports per day between first and last report.
                ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
                line = '\t'.join([k, str(ratio), str(crashCount), lDate[1].strftime('%y/%m/%d'), lDate[2].strftime('%y/%m/%d'),
                        crashID, crashReason, crashOS, crashStack, codeLine, unoCommands, '\n'])
                # Flush after every row so an interrupted run keeps its progress.
                f.write(line)
                f.flush()
            except (requests.exceptions.RequestException, AttributeError, IndexError, ValueError) as e:
                # One unreachable or unexpectedly shaped page must not abort
                # the whole run; report the signature and carry on.
                print("Skipping " + k + ": " + repr(e))
                continue