configure already sets TMPDIR to mixed path
[LibreOffice.git] / bin / find-can-be-private-symbols.py
blobda28310196f765f0e53e00a1940f9c726924b9ff
1 #!/usr/bin/python3
# Find exported symbols that can be made non-exported.
#
# Note that (a) parsing the output of these commands is a pain because it is quite irregular, and
# (b) I'm fumbling in the dark here, trying to guess what exactly constitutes an "import" vs. an
# "export" of a symbol; Linux linking is rather complex.
#
# Takes about 5 minutes to run on a decent machine.
#
# The standalone function analysis is reasonably reliable, but the class/method analysis is less so
# (something to do with destructor thunks not showing up in my results?).
#
# Also, the class/method analysis will not catch problems like
#   'dynamic_cast from 'Foo' with hidden type visibility to 'Bar' with default type visibility'
# but loplugin:dyncastvisibility will do that for you.
19 import subprocess
20 import re
# Symbol names exactly as reported by the binary tools (still encoded).
exported_symbols1, imported_symbols1 = set(), set()
# The same symbols after decoding; filled in once the raw sets are complete.
exported_symbols2, imported_symbols2 = set(), set()

# all names that exist in the source code
#all_source_names = set()
#
#subprocess_find_all_source_names = subprocess.Popen("git grep -oh -P '\\b\\w\\w\\w+\\b' -- '*.h*' | sort -u",
#    stdout=subprocess.PIPE, stderr=subprocess.STDOUT, shell=True)
#with subprocess_find_all_source_names.stdout as txt:
#    for line in txt:
#        line = line.strip()
#        all_source_names.add(line)
#subprocess_find_all_source_names.terminate()
# Find all our shared libs and record, for each one, the symbols it exports
# (into exported_symbols1) and the symbols it imports (into imported_symbols1).
#
# All commands are run with an argument list instead of a shell string, so
# that the "*.so" pattern reaches find unexpanded and library paths that
# contain spaces or shell metacharacters are passed through verbatim.
# Each Popen is used as a context manager so its pipe is closed and the
# child is waited for, rather than being left behind as a zombie.

# We are looking for nm lines something like:
#     0000000000036ed0 T flash_component_getFactory
# (compiled once here instead of once per library)
line_regex = re.compile(r'^[0-9a-fA-F]+ T ')

with subprocess.Popen(
        ["find", "./instdir", "./workdir/LinkTarget/CppunitTest", "-name", "*.so"],
        stdout=subprocess.PIPE) as subprocess_find:
    for line in subprocess_find.stdout:
        sharedlib = line.strip()
        if not sharedlib:
            continue

        # look for exported symbols
        with subprocess.Popen(["nm", "-D", sharedlib], stdout=subprocess.PIPE) as subprocess_nm:
            for line2_bytes in subprocess_nm.stdout:
                line2 = line2_bytes.strip().decode("utf-8")
                if line_regex.match(line2):
                    sym = line2.split(" ")[2].strip()
                    exported_symbols1.add(sym)

        # look for imported symbols
        with subprocess.Popen(["objdump", "-T", sharedlib], stdout=subprocess.PIPE) as subprocess_objdump:
            # We are looking for lines something like:
            #     0000000000000000      DF *UND*  0000000000000000  _ZN16FilterConfigItem10WriteInt32ERKN3rtl8OUStringEi
            # The header lines objdump prints first are not skipped explicitly;
            # they are dropped by the "*UND*" test like every other
            # uninteresting line.
            for line2_bytes in subprocess_objdump.stdout:
                line2 = line2_bytes.strip().decode("utf-8")
                if "*UND*" not in line2:
                    continue
                # the symbol name is the last column
                sym = line2.split(" ")[-1].strip()
                imported_symbols1.add(sym)
# Look for imported symbols in executables and add them to imported_symbols1.
#
# The commands are run with argument lists (no shell), so the "*.bin"
# pattern reaches find unexpanded and executable paths are passed verbatim.
# The former "| grep -w U" shell pipeline is replaced by an explicit check
# of the symbol-type column, which also avoids indexing into lines that do
# not have the expected two columns.
with subprocess.Popen(["find", "./instdir", "-name", "*.bin"],
                      stdout=subprocess.PIPE) as subprocess_find:
    for line in subprocess_find.stdout:
        executable = line.strip()
        if not executable:
            continue
        # look for undefined (i.e. imported) symbols
        with subprocess.Popen(["nm", "-D", executable], stdout=subprocess.PIPE) as subprocess_nm:
            # We are looking for lines something like:
            #                  U sal_detail_deinitialize
            for line2_bytes in subprocess_nm.stdout:
                tokens = line2_bytes.decode("utf-8").split()
                if len(tokens) == 2 and tokens[0] == "U":
                    imported_symbols1.add(tokens[1])
#progress = 0;
#for sym in sorted(imported_symbols - exported_symbols):
#    progress += 1
#    if (progress % 128 == 0): print( str(int(progress * 100 / len(diff))) + "%")
#    filtered_sym = subprocess.check_output(["c++filt", sym]).strip().decode("utf-8")
#    if filtered_sym.startswith("non-virtual thunk to "): filtered_sym = filtered_sym[21:]
#    elif filtered_sym.startswith("virtual thunk to "): filtered_sym = filtered_sym[17:]
#    print("Symbol imported but not exported? " + filtered_sym)

# Now we have to symbolize before comparing because sometimes (due to thunks) two
# different encoded names symbolize to the same method/func name


def _demangle(sym):
    """Return the output of c++filt for sym, without a leading thunk marker.

    A "non-virtual thunk to " or "virtual thunk to " prefix is removed so
    that a thunk and the method it forwards to end up with the same name.
    Raises subprocess.CalledProcessError if c++filt exits with an error.
    """
    filtered_sym = subprocess.check_output(["c++filt", sym]).strip().decode("utf-8")
    for thunk_prefix in ("non-virtual thunk to ", "virtual thunk to "):
        if filtered_sym.startswith(thunk_prefix):
            return filtered_sym[len(thunk_prefix):]
    return filtered_sym


# The progress counter runs across both sets: progress_max_len is the
# combined size, so the counter must not be reset between the two passes
# (otherwise the percentage restarts and never gets close to 100%).
progress = 0
progress_max_len = len(imported_symbols1) + len(exported_symbols1)
for encoded_symbols, decoded_symbols in ((imported_symbols1, imported_symbols2),
                                         (exported_symbols1, exported_symbols2)):
    for sym in encoded_symbols:
        progress += 1
        if progress % 128 == 0:
            print(str(int(progress * 100 / progress_max_len)) + "%")
        decoded_symbols.add(_demangle(sym))
# Everything that some library exports but that nothing we scanned imports.
unused_exports = exported_symbols2.difference(imported_symbols2)

# Summary of the decoded symbol counts.
for label, symbols in (("exported = ", exported_symbols2),
                       ("imported = ", imported_symbols2),
                       ("unused_exports = ", unused_exports)):
    print(label, len(symbols), sep="")
#def extractFunctionNameFromSignature(sym):
#    i = sym.find("(")
#    if i == -1: return sym
#    return sym[:i]


def _enclosing_scope(sym):
    """Return the part of a decoded symbol before its last "::", or None.

    Only the text in front of the parameter list is searched, so a "::"
    inside an argument type (e.g. the "std::" in
    "Foo::bar(std::vector<int> const&)") is not mistaken for the separator
    between the class and the method name. Returns None when the name has
    no "::" qualifier at all.
    """
    # "(anonymous namespace)" contains parentheses that do not start a
    # parameter list; mask them with a same-length placeholder so that the
    # indices computed below remain valid for the original string.
    masked = sym.replace("(anonymous namespace)", "{anonymous namespace}")
    paren = masked.find("(")
    head = masked if paren == -1 else masked[:paren]
    sep = head.rfind("::")
    if sep == -1:
        return None
    return sym[:sep]


# for each class, count how many symbols will become hidden if we mark the class as hidden:
# every exported symbol of the class counts +1, every imported one counts -1
can_be_hidden_count = dict()
for sym in exported_symbols2:
    clz = _enclosing_scope(sym)
    if clz is None:
        continue
    can_be_hidden_count[clz] = can_be_hidden_count.get(clz, 0) + 1
for sym in imported_symbols2:
    clz = _enclosing_scope(sym)
    if clz is None:
        continue
    can_be_hidden_count[clz] = can_be_hidden_count.get(clz, 0) - 1

# convert to list, and sort the results in descending order
can_be_hidden_list = [(cnt, clz) for clz, cnt in can_be_hidden_count.items() if cnt > 0]
can_be_hidden_list.sort(reverse=True)

with open("bin/find-can-be-private-symbols.classes.results", "wt") as f:
    for cnt, clz in can_be_hidden_list:
        # the list is sorted by count, so nothing after this entry can reach 10 either
        if cnt < 10:
            break
        f.write(str(cnt) + " " + clz + "\n")
# Filter out most of the noise.
# No idea where these are coming from, but not our code.
_not_our_code_prefixes = (
    "CERT_", "DER_", "FORM_", "FPDF", "HASH_", "Hunspell_", "LL_", "LP_",
    "LU", "MIP", "MPS", "NSS", "NSC_", "PK11", "PL_", "PQ", "PBE_", "PORT_",
    "PRP_", "PR_", "PT_", "QS_", "REPORT_", "RSA_", "SEC", "SGN", "SOS",
    "SSL_", "VFY_", "_PR_", "ber_", "bfp_", "ldap_", "ne_", "opj_", "pg_",
    "pq", "presolve_", "sqlite3_", "libepubgen::", "lucene::", "Hunspell::",
    "sk_", "_Z",
)

# dynamically loaded
_dynamically_loaded_suffixes = ("get_implementation", "component_getFactory")
_dynamically_loaded_names = {
    "CreateUnoWrapper",
    "ExportDOC",
    "ExportRTF",
    "GetSaveWarningOfMSVBAStorage_ww8",
    "GetSpecialCharsForEdit",
}
_dynamically_loaded_prefixes = (
    "Import", "Java_com_sun_star_", "TestImport", "getAllCalendars_",
    "getAllCurrencies_", "getAllFormats", "getBreakIteratorRules_",
    "getCollationOptions_", "getCollatorImplementation_",
    "getContinuousNumberingLevels_", "getDateAcceptancePatterns_",
    "getForbiddenCharacters_", "getIndexAlgorithm_", "getLCInfo_",
    "getLocaleItem_", "getOutlineNumberingLevels_", "getReservedWords_",
    "getSTC_", "getSearchOptions_", "getTransliterations_",
    "getUnicodeScripts_", "lok_",
)

# UDK API
_udk_api_prefixes = ("osl_", "rtl_", "typelib_", "typereg_", "uno_")

_skipped_prefixes = _not_our_code_prefixes + _dynamically_loaded_prefixes + _udk_api_prefixes

# Write the unused exports that survive the filters above, one per line, in sorted order.
with open("bin/find-can-be-private-symbols.functions.results", "wt") as f:
    for sym in sorted(unused_exports):
        if sym.startswith(_skipped_prefixes):
            continue
        if sym.endswith(_dynamically_loaded_suffixes):
            continue
        if sym in _dynamically_loaded_names:
            continue
        # remove things we found that do not exist in our source code, they're not ours
        #if not(extractFunctionNameFromSignature(sym) in all_source_names): continue
        f.write(sym + "\n")