#!/usr/bin/env python3
#
# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
#

# Use this script to retrieve information from https://crashreport.libreoffice.org
# about a specific version of LibreOffice.
# Usage sample: ./crashreportScraper.py --version 7.2.0.4 --repository /path/to/libreoffice/repository/

import argparse
import math
import os
import sys
import urllib.parse

import requests
from bs4 import BeautifulSoup
from datetime import datetime

def convert_str_to_date(value):
    value = value.replace('.', '')
    value = value.replace('March', 'Mar')
    value = value.replace('April', 'Apr')
    value = value.replace('June', 'Jun')
    value = value.replace('July', 'Jul')
    value = value.replace('Sept', 'Sep')
    # reset the time, leaving only the date
    value = ", ".join(value.split(", ")[:-1])
    dtDate = datetime.strptime(value, '%b %d, %Y')

    return dtDate.strftime('%y/%m/%d')
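
# Illustrative conversion (the site's exact date format is an assumption):
#   "Jan. 10, 2023, 10:15 a.m." -> periods stripped and time dropped -> "Jan 10, 2023"
#   -> parsed with '%b %d, %Y' and returned as "23/01/10".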

def parse_version_url(url):
    crashReports = {}

    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        sys.exit(1)

    table = soup.find("table", {"id": "data-table"}).tbody
    for tr in table.find_all("tr"):
        td_list = tr.find_all("td")
        crashName = td_list[0].a.text.strip()
        crashNumber = int(td_list[1].text.strip())
        firstCrashDate = convert_str_to_date(td_list[5].text.strip())
        lastCrashDate = convert_str_to_date(td_list[6].text.strip())
        crashReports[crashName] = [crashNumber, firstCrashDate, lastCrashDate]

    return crashReports
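
# For reference (illustrative shapes, matching how the __main__ block uses the result):
#   crashes = parse_version_url("https://crashreport.libreoffice.org/stats/version/7.2.0.4?limit=1000&days=30")
#   crashes["<signature>"] -> [crash_count, "YY/MM/DD first seen", "YY/MM/DD last seen"]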

def parse_reports_and_get_most_recent_report_from_last_page(url):
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    count = 0
    try:
        os_tab = soup.find("table", {"id": "os_tab"}).tbody
    except AttributeError:
        print("os_tab not found")
        raise

    tr_list = os_tab.find_all("tr")
    for tr in tr_list:
        td_list = tr.find_all("td")
        count += int(td_list[1].text.strip())

    # There are 50 reports on each page.
    # Go to the last page based on the total count to get a recent report.
    last_page = math.ceil(count / 50)
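    # Worked example: 120 reports across all pages -> math.ceil(120 / 50) = 3,
    # so the most recent reports are fetched from "?page=3".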

    if last_page > 1:
        url = url + "?page=" + str(last_page)

    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    reports = soup.find("div", {"id": "reports"}).tbody
    ID, currentID = "", ""
    version, currentVersion = "", ""
    OS, currentOS = "", ""

    tr_list = reports.find_all("tr")
    for tr in tr_list:
        td_list = tr.find_all("td")

        currentID = td_list[0].a.text.strip()
        currentVersion = td_list[2].text.strip().split(': ')[1]
        currentOS = td_list[3].text.strip()

        # Prefer the most recent Windows report:
        # symbols on Linux are generally not very informative.
        if currentOS == "windows" and currentVersion > version:
            version = currentVersion
            ID = currentID
            OS = currentOS

    # Fall back to the last report seen if no Windows report was found.
    if not version:
        version = currentVersion
    if not ID:
        ID = currentID
    if not OS:
        OS = currentOS

    return count, ID, version, OS

def parse_details_and_get_info(url, gitRepo):
    try:
        html_text = requests.get(url, timeout=200).text
        soup = BeautifulSoup(html_text, 'html.parser')
    except requests.exceptions.Timeout:
        print("Timeout requesting " + url)
        raise

    details = soup.find("div", {"id": "details"}).tbody
    tr_list = details.find_all("tr")
    reason = tr_list[8].td.text.strip()

    stack = ""
    codeLine = ""

    count = 0
    frames = soup.find("div", {"id": "frames"}).tbody
    for tr in frames.find_all("tr"):
        td_list = tr.find_all("td")
        source = td_list[3].text.strip()
        if source and count <= 10:
            source = source.replace("\\", "/").replace("C:/cygwin64/home/buildslave/source/libo-core/", "")
            stack += source + "\n"
            count += 1

            codeFile = source.split(":")[0]
            codeNumber = source.split(":")[1]
            try:
                with open(os.path.join(gitRepo, codeFile)) as f:
                    lines = f.readlines()
                    for index, line in enumerate(lines):
                        if index + 1 == int(codeNumber):
                            codeLine += line.strip().replace("\"", "'") + "\n"
            except FileNotFoundError:
                codeLine += "\n"
                continue

    if stack:
        # multiline field: quote it so the TSV stays one record per crash
        stack = "\"" + stack + "\""

    if codeLine:
        # multiline field: quote it so the TSV stays one record per crash
        codeLine = "\"" + codeLine + "\""

    metadata = soup.find("div", {"id": "metadata"}).tbody
    tr_list = metadata.find_all("tr")
    unoCommands = ""
    for tr in tr_list:
        if tr.th.text.strip() == "Last-4-Uno-Commands":
            unoCommands = tr.td.text.strip()

    return reason, stack, codeLine, unoCommands

if __name__ == '__main__':

    parser = argparse.ArgumentParser()

    parser.add_argument('--version', action='store', dest="version", required=True)
    parser.add_argument('--repository', action="store", dest="repository", required=True)

    args = parser.parse_args()

    crashes = parse_version_url(
            "https://crashreport.libreoffice.org/stats/version/" + args.version + "?limit=1000&days=30")

    print(str(len(crashes)) + " crash reports in version " + args.version)

    crashesInFile = []
    fileName = "crashes_" + args.version.replace(".", "_") + ".csv"
    print("Using " + fileName)

    bInsertHeader = False
    if os.path.exists(fileName):
        # Remember the signatures already in the file so they are skipped below.
        with open(fileName, "r") as f:
            lines = f.readlines()
            for line in lines:
                crashesInFile.append(line.split("\t")[0])
    else:
        bInsertHeader = True

    with open(fileName, "a") as f:
        if bInsertHeader:
            line = '\t'.join(["Name", "Count", "First report", "Last Report",
                "ID", "Version", "Reason", "OS", "Stack", "Code Lines", "Last 4 UNO Commands", '\n'])
            f.write(line)
            f.flush()

        for k, lDate in crashes.items():
            if k not in crashesInFile:
                print("Parsing " + k)
                try:
                    crashCount, crashID, crashVersion, crashOS = parse_reports_and_get_most_recent_report_from_last_page(
                            "https://crashreport.libreoffice.org/stats/signature/" + urllib.parse.quote(k))
                    crashReason, crashStack, codeLine, unoCommands = parse_details_and_get_info(
                            "https://crashreport.libreoffice.org/stats/crash_details/" + crashID, args.repository)
                    line = '\t'.join([k, str(crashCount), lDate[1], lDate[2],
                        crashID, crashVersion, crashReason, crashOS, crashStack, codeLine, unoCommands, '\n'])
                    f.write(line)
                    f.flush()
                except (requests.exceptions.Timeout, AttributeError):
                    continue
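
# Example run (version number is illustrative):
#   ./crashreportScraper.py --version 7.2.0.4 --repository ~/git/libreoffice
# appends one tab-separated row per new crash signature to crashes_7_2_0_4.csv.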