#!/usr/bin/env python3
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#
# Use this script to retrieve information from https://crashreport.libreoffice.org
# about a specific version of LibreOffice
# Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/
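#
# Results are appended to a tab-separated file, crashes_<version>.csv, in the
# current working directory; crash signatures already listed there are skipped
# when the script is run again.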

import argparse
import math
import os
import sys
import urllib.parse
from datetime import datetime

import requests
from bs4 import BeautifulSoup


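# Dates on the crash report pages use full month names and carry a trailing
# time component; normalize them so they parse with a single '%b %d, %Y'
# format.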
def convert_str_to_date(value):
    value = value.replace('.', '')
    value = value.replace('March', 'Mar')
    value = value.replace('April', 'Apr')
    value = value.replace('June', 'Jun')
    value = value.replace('July', 'Jul')
    value = value.replace('Sept', 'Sep')
    # reset the time leaving the date
    value = ", ".join(value.split(", ")[:-1])
    return datetime.strptime(value, '%b %d, %Y')


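# Fetch the per-version overview table and return a dict mapping each crash
# signature to [report count, first crash date, last crash date].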
def parse_version_url(url):
    crashReports = {}

    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        sys.exit(1)

    table = soup.find("table", {"id": "data-table"}).tbody
    for tr in table.find_all("tr"):
        td_list = tr.find_all("td")
        crashName = td_list[0].a.text.strip()
        crashNumber = int(td_list[1].text.strip())
        firstCrashDate = convert_str_to_date(td_list[5].text.strip())
        lastCrashDate = convert_str_to_date(td_list[6].text.strip())
        crashReports[crashName] = [crashNumber, firstCrashDate, lastCrashDate]

    return crashReports


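# For one crash signature: sum the per-OS report counts, jump to the last page
# of results and pick the most recent report, preferring Windows reports since
# Linux symbols are generally less informative.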
def parse_reports_and_get_most_recent_report_from_last_page(url):
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    count = 0
    try:
        os_tab = soup.find("table", {"id": "os_tab"}).tbody
    except AttributeError:
        print("os_tab not found")
        raise

    tr_list = os_tab.find_all("tr")
    for tr in tr_list:
        td_list = tr.find_all("td")
        count += int(td_list[1].text.strip())

    # There are 50 reports on each page.
    # Go to the last page based on the total count to get a recent report
    last_page = math.ceil(count / 50)

    url = url + "?page=" + str(last_page)
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    reports = soup.find("div", {"id": "reports"}).tbody
    ID, currentID = "", ""
    version, currentVersion = "", ""
    OS, currentOS = "", ""

    tr_list = reports.find_all("tr")
    for tr in tr_list:
        td_list = tr.find_all("td")

        currentID = td_list[0].a.text.strip()
        currentVersion = td_list[2].text.strip().split(': ')[1]
        currentOS = td_list[3].text.strip()

        # get most recent version
        # symbols on linux are not very informative generally
        if currentOS == "windows" and currentVersion > version:
            version = currentVersion
            ID = currentID
            OS = currentOS

    # fall back to the last report seen if no Windows report was found
    if not version:
        version = currentVersion
    if not ID:
        ID = currentID
    if not OS:
        OS = currentOS

    return count, ID, version, OS


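# Fetch the crash_details page of a single report and extract the crash
# reason, the top stack frames, the corresponding source lines from the local
# repository checkout and the last UNO commands recorded in the metadata.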
def parse_details_and_get_info(url, gitRepo):
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    details = soup.find("div", {"id": "details"}).tbody
    tr_list = details.find_all("tr")
    reason = tr_list[8].td.text.strip()

    stack = ""
    codeLine = ""

    count = 0
    frames = soup.find("div", {"id": "frames"}).tbody
    for tr in frames.find_all("tr"):
        td_list = tr.find_all("td")
        source = td_list[3].text.strip()
        if source and count <= 10:
            source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")
            stack += source + "\n"
            count += 1

            codeFile = source.split(":")[0]
            codeNumber = source.split(":")[1]
            try:
                with open(os.path.join(gitRepo, codeFile)) as f:
                    lines = f.readlines()
                    for index, line in enumerate(lines):
                        if index + 1 == int(codeNumber):
                            codeLine += line.strip().replace("\"", "'") + "\n"
            except FileNotFoundError:
                # source file not present in the local checkout; keep the
                # stack and code-line columns aligned
                codeLine += "\n"
                continue

    # quote the multi-line fields so they stay inside a single TSV cell
    if stack:
        stack = "\"" + stack + "\""

    if codeLine:
        codeLine = "\"" + codeLine + "\""

    metadata = soup.find("div", {"id": "metadata"}).tbody
    tr_list = metadata.find_all("tr")
    unoCommands = ""
    for tr in tr_list:
        if tr.th.text.strip() == "Last-4-Uno-Commands":
            unoCommands = tr.td.text.strip()

    return reason, stack, codeLine, unoCommands


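# Entry point: list the crash signatures reported for the requested version
# and append one TSV row per signature that is not already in the output file.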
if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('--version', action='store', dest="version", required=True)
    parser.add_argument('--repository', action="store", dest="repository", required=True)

    args = parser.parse_args()

    crashes = parse_version_url(
            "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")

    print(str(len(crashes)) + " crash reports in version " + args.version)

    crashesInFile = []
    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
    print("Using " + fileName)

    bInsertHeader = False
    if os.path.exists(fileName):
        with open(fileName, "r") as f:
            lines = f.readlines()
            for line in lines:
                crashesInFile.append(line.split("\t")[0])
    else:
        bInsertHeader = True

    with open(fileName, "a") as f:
        if bInsertHeader:
            line = '\t'.join(["Name", "Ratio", "Count", "First report", "Last Report",
                "ID", "Version", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n'])
            f.write(line)
            f.flush()

        for k, lDate in crashes.items():
            if k not in crashesInFile:
                print("Parsing " + k)
                try:
                    crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                            "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
                    crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
                            "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                    ratio = round(crashCount / ((lDate[2] - lDate[1]).days + 1), 2)
                    line = '\t'.join([k, str(ratio), str(crashCount), lDate[1].strftime('%y/%m/%d'), lDate[2].strftime('%y/%m/%d'),
                        crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, unoCommands, '\n'])
                    f.write(line)
                    f.flush()
                except (requests.exceptions.Timeout, AttributeError):
                    continue