# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Use this script to retrieve information from https://crashreport.libreoffice.org
# about a specific version of LibreOffice
# Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/

import argparse
import os
import re
import urllib.parse

import requests
from bs4 import BeautifulSoup
from datetime import datetime
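
# The results are appended as tab-separated values to crashes_<version>.csv in
# the current directory, one row per crash signature, matching the header
# written in the __main__ block below.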

def convert_str_to_date(value):
    value = value.replace('.', '')
    value = value.replace('March', 'Mar')
    value = value.replace('April', 'Apr')
    value = value.replace('June', 'Jun')
    value = value.replace('July', 'Jul')
    value = value.replace('Sept', 'Sep')
    # reset the time leaving the date
    value = ", ".join(value.split(", ")[:-1])
    return datetime.strptime(value, '%b %d, %Y')
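
# Illustrative input, based on the replacements above: a table cell such as
# "Sept. 21, 2021, 10:32 a.m." becomes "Sep 21, 2021" once the dots, the long
# month names and the trailing time component are stripped, which the
# '%b %d, %Y' format can then parse.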

def parse_version_url(url):
    crashReports = {}

    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        # give up on this version if the overview page cannot be fetched
        return crashReports

    table = soup.find("table", {"id": "data-table"}).tbody
    for tr in table.find_all("tr"):
        td_list = tr.find_all("td")
        crashName = td_list[0].a.text.strip()
        crashNumber = int(td_list[1].text.strip())
        firstCrashDate = convert_str_to_date(td_list[5].text.strip())
        lastCrashDate = convert_str_to_date(td_list[6].text.strip())
        crashReports[crashName] = [crashNumber, firstCrashDate, lastCrashDate]

    return crashReports
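
# The returned mapping is keyed by crash signature (illustrative shape only):
#   {"<signature>": [crashNumber, firstCrashDate, lastCrashDate], ...}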

def parse_reports_and_get_most_recent_report_from_last_page(url):
    count = 0

    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        # let the caller's exception handler skip this signature
        raise

    try:
        os_tab = soup.find("table", {"id": "os_tab"}).tbody
    except AttributeError:
        print("os_tab not found")
        raise

    # total number of reports, summed over all operating systems
    tr_list = os_tab.find_all("tr")
    for tr in tr_list:
        td_list = tr.find_all("td")
        count += int(td_list[1].text.strip())

    reports = soup.find("div", {"id": "reports"}).tbody
    ID, currentID = "", ""
    version, currentVersion = 0, 0
    OS, currentOS = "", ""

    tr_list = reports.find_all("tr")
    for tr in tr_list:
        td_list = tr.find_all("td")

        currentID = td_list[0].a.text.strip()
        currentVersion = int(''.join(re.findall(r"\d+", td_list[2].text)))
        currentOS = td_list[3].text.strip()

        # get most recent version
        # symbols on linux are not very informative generally
        if currentOS == "windows" and currentVersion > version:
            version = currentVersion
            ID = currentID
            OS = currentOS

    # fall back to the last report on the page if no Windows report was found
    if not ID:
        ID = currentID
        OS = currentOS

    return count, ID, OS
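
# Illustrative call, mirroring the __main__ block below (the signature string
# is hypothetical):
#   count, reportID, reportOS = parse_reports_and_get_most_recent_report_from_last_page(
#       "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote("SfxItemSet::Get"))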

def parse_details_and_get_info(url, gitRepo):
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    details = soup.find("div", {"id": "details"}).tbody
    tr_list = details.find_all("tr")
    reason = tr_list[8].td.text.strip()

    count = 0
    stack = ""
    codeLine = ""

    frames = soup.find("div", {"id": "frames"}).tbody
    for tr in frames.find_all("tr"):
        td_list = tr.find_all("td")
        source = td_list[3].text.strip()
        if source and count <= 10:
            source = source.replace("\\", "/").replace(
                    "C:/cygwin64/home/buildslave/source/libo-core/", "")
            stack += source + "\n"
            count += 1

            codeFile = source.split(":")[0]
            codeNumber = source.split(":")[1]
            try:
                with open(os.path.join(gitRepo, codeFile)) as f:
                    lines = f.readlines()
                    for index, line in enumerate(lines):
                        if index + 1 == int(codeNumber):
                            codeLine += line.strip().replace("\"", "'") + "\n"
            except FileNotFoundError:
                # keep one (empty) line so stack and code lines stay aligned
                codeLine += "\n"

    if stack:
        # quote the multi-line value so it stays in a single TSV field
        stack = "\"" + stack + "\""

    if codeLine:
        codeLine = "\"" + codeLine + "\""

    unoCommands = ""
    metadata = soup.find("div", {"id": "metadata"}).tbody
    tr_list = metadata.find_all("tr")
    for tr in tr_list:
        if tr.th.text.strip() == "Last-4-Uno-Commands":
            unoCommands = tr.td.text.strip()

    return reason, stack, codeLine, unoCommands
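
# The tuple returned above holds, in order: the crash reason string, the first
# few "file:line" frames (quoted so the embedded newlines survive as one TSV
# field), the matching source lines read from the local repository, and the
# "Last-4-Uno-Commands" metadata value (empty if that row is absent).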

if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('--version', action='store', dest="version", required=True)
    parser.add_argument('--repository', action="store", dest="repository", required=True)

    args = parser.parse_args()

    crashes = parse_version_url(
            "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")

    print(str(len(crashes)) + " crash reports in version " + args.version)

    crashesInFile = []
    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
    print("Using " + fileName)

    bInsertHeader = False
    if os.path.exists(fileName):
        # remember which signatures are already in the file so they are skipped
        with open(fileName, "r") as f:
            lines = f.readlines()
            for line in lines:
                crashesInFile.append(line.split("\t")[0])
    else:
        bInsertHeader = True

    with open(fileName, "a") as f:
        if bInsertHeader:
            line = '\t'.join(["Name", "Ratio", "Count", "First report", "Last Report",
                    "ID", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n'])
            f.write(line)
            f.flush()

        for k, lDate in crashes.items():
            if k not in crashesInFile:
                print("Parsing " + k)
                try:
                    crashCount, crashID, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                            "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
                    crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
                            "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                    # average number of crashes per day between the first and last report
                    ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
                    line = '\t'.join([k, str(ratio), str(crashCount), lDate[1].strftime('%y/%m/%d'), lDate[2].strftime('%y/%m/%d'),
                            crashID, crashReason, crashOS, crashStack, codeLine, unoCommands, '\n'])
                    f.write(line)
                    f.flush()
                except (requests.exceptions.Timeout, AttributeError):
                    continue