output actual characters instead of codepoints
[evolve-layout.git] / ngrams.py
blob33a2af34258b0d3bb4bfae74875852cad77405f6
1 #!/usr/bin/env python3
2 # encoding: utf-8
4 """Calculating ngram distributions (letters, bigrams, trigrams) from text or getting them from precomputed files."""
6 from layout_base import NEO_LAYOUT, key_to_finger, read_file, find_key
8 def split_uppercase_repeats(reps, layout=NEO_LAYOUT):
9 """Split uppercase repeats into two to three lowercase repeats.
11 TODO: treat left and right shift differently. Currently we always use both shifts (⇧ and ⇗) and half the value (but stay in integers => 1 stays 1). Needs major refactoring, since it needs knowledge of the layout. Temporary fix: always use both shifts. → Almost completely done in finger repeats evaluation. Only remaining: ⇧⇗ and ⇗⇧, but these aren’t relevant to finger collisions, only to handswitching (and there we ignore them, as the difference is at most one more letter without switching). Also remaining: very rare repeats are now counted more strongly, since
13 Shift und die Taste werden gleichzeitig gedrückt => in einem bigramm, in dem der erste Buchstabe groß ist, gibt es sowohl die Fingerwiederholung Shift-Buchstabe 1, als auch Shift-Buchstabe2. => einfach verdoppeln. - done
15 TODO: aB should be counted about 2x, Ab only 0.5 times, because shift is pressed and released a short time before the key.
17 Ab -> shift-a, shift-b, a-b.
18 aB -> a-shift, shift-b, a-b.
19 AB -> shift-a, shift-b, a-b, 0.5*(shift_L-shift_R, shift_R-shift_L)
21 Jeweils sowohl rechts als auch links.
24 >>> reps = [(12, "ab"), (6, "Ab"), (4, "aB"), (1, "AB")]
25 >>> split_uppercase_repeats(reps)
26 [(12, 'ab'), (6, '⇗b'), (6, 'ab'), (4, 'a⇧'), (4, 'ab'), (1, '⇧⇗'), (1, '⇗⇧'), (1, '⇗b'), (1, 'a⇧'), (1, 'ab')]
27 """
28 # replace uppercase by ⇧ + char1 and char1 + char2 and ⇧ + char2
29 # char1 and shift are pressed at the same time
30 upper = [(num, rep) for num, rep in reps if
31 (find_key(rep[0], layout=layout) == 1 or find_key(rep[1], layout=layout) == 1 or not rep == rep.lower())
33 reps = [(num, rep) for (num, rep) in reps if not
34 (find_key(rep[0], layout=layout) == 1 or find_key(rep[1], layout=layout) == 1 or not rep == rep.lower())
36 up = []
37 for num, rep in upper: # Ab = ab,⇗b aB = a⇧,ab AB = a⇧,⇗b,ab (A links, B rechts)
38 # calculate the lowercase only once, as it grows quite expensive in the bulk
39 rep0 = rep[0]
40 rep0_lower = rep0.lower()
41 rep1 = rep[1]
42 rep1_lower = rep1.lower()
44 # use both shifts, but half weight each
45 if not rep0 == rep0_lower and not rep1 == rep1_lower: # AB
46 up.append((max(1, num//2), "⇗⇧"))
47 up.append((max(1, num//2), "⇧⇗"))
48 if not rep0 == rep0_lower: # Ab od. AB
49 finger = key_to_finger(rep0_lower, layout=layout)
50 if finger and finger[-1] == "L":
51 up.append((num, "⇗"+rep1_lower))
52 elif finger and finger[-1] == "R":
53 up.append((num, "⇧"+rep1_lower))
54 if not rep1 == rep1_lower: # aB od. AB
55 finger = key_to_finger(rep1_lower, layout=layout)
56 if finger and finger[-1] == "L":
57 up.append((num, rep0_lower + "⇗"))
58 elif finger and finger[-1] == "R":
59 up.append((num, rep0_lower + "⇧"))
61 up.append((num, rep0_lower+rep1_lower))
63 reps.extend(up)
64 # cleanup
65 reps = [(int(num), r) for num, r in reps if r[1:]]
66 reps.sort()
67 reps.reverse()
68 return reps
70 def repeats_in_file(data):
71 """Sort the repeats in a file by the number of occurrances.
73 >>> data = read_file("testfile")
74 >>> repeats_in_file(data)[:3]
75 [(2, 'a\\n'), (2, 'Aa'), (1, 'ui')]
76 """
77 repeats = {}
78 for i in range(len(data)-1):
79 rep = data[i] + data[i+1]
80 if rep in repeats:
81 repeats[rep] += 1
82 else:
83 repeats[rep] = 1
84 sorted_repeats = [(repeats[i], i) for i in repeats]
85 sorted_repeats.sort()
86 sorted_repeats.reverse()
87 #reps = split_uppercase_repeats(sorted_repeats) # wrong place
88 return sorted_repeats
90 def split_uppercase_letters(reps, layout):
91 """Split uppercase letters into two lowercase letters (with shift).
93 >>> letters = [(4, "a"), (3, "A")]
94 >>> split_uppercase_letters(letters, layout=NEO_LAYOUT)
95 [(4, 'a'), (3, '⇗'), (3, 'a')]
96 """
97 # replace uppercase by ⇧ and char1
98 upper = [(num, rep) for num, rep in reps if not rep == rep.lower()]
99 reps = [rep for rep in reps if not rep in upper]
100 up = []
102 for num, rep in upper:
103 fing = key_to_finger(rep.lower(), layout=layout)
104 try:
105 hand = fing[-1]
106 if hand == "L":
107 up.append((num, "⇗"))
108 elif hand == "R":
109 up.append((num, "⇧"))
110 except IndexError:
111 # not in there (special letters not on keyboard layer 1)
112 pass
113 up.append((num, rep.lower()))
115 reps.extend(up)
116 reps = [(int(num), r) for num, r in reps]
117 reps.sort()
118 reps.reverse()
119 return reps
121 def letters_in_file(data):
122 """Sort the repeats in a file by the number of occurrances.
124 >>> data = read_file("testfile")
125 >>> letters_in_file(data)[:3]
126 [(5, 'a'), (4, '\\n'), (2, 'r')]
128 letters = {}
129 for letter in data:
130 if letter in letters:
131 letters[letter] += 1
132 else:
133 letters[letter] = 1
134 sort = [(letters[i], i) for i in letters]
135 sort.sort()
136 sort.reverse()
137 return sort
139 def unique_sort(liste):
140 """Count the occurrence of each item in a list.
142 >>> unique_sort([1, 2, 1])
143 [(1, 2), (2, 1)]
145 counter = {}
146 for i in liste:
147 if i in counter:
148 counter[i] += 1
149 else:
150 counter[i] = 1
152 sorted_repeats = [(counter[i], i) for i in counter]
153 sorted_repeats.sort()
154 return sorted_repeats
156 def repeats_in_file_sorted(data):
157 """Sort the repeats in a file by the number of occurrances.
159 >>> data = read_file("testfile")
160 >>> repeats_in_file_sorted(data)[:2]
161 [(1, '\\na'), (1, '\\ne')]
163 repeats = repeats_in_file(data)
164 repeats.reverse()
165 return repeats
167 def repeats_in_file_precalculated(data):
168 """Get the repeats from a precalculated file.
170 >>> data = read_file("2gramme.txt")
171 >>> repeats_in_file_precalculated(data)[:2]
172 [(10159250, 'en'), (10024681, 'er')]
174 reps = [line.lstrip().split(" ", 1) for line in data.splitlines() if line.split()[1:]]
175 reps = [(int(num), r) for num, r in reps if r[1:]]
176 #reps = split_uppercase_repeats(reps) # wrong place, don’t yet know the layout
178 return reps
181 def split_uppercase_trigrams(trigs):
182 """Split uppercase repeats into two to three lowercase repeats.
184 Here we don’t care about shift-collisions with the “second” letter, because we only use it for handswitching and the shift will always mean a handswitch afterwards (opposing shift). ⇒ Ab → Sh-ab, ignoring a-Sh-b. ⇒ for handswitching ignore trigrams with any of the shifts.
186 >>> trigs = [(8, "abc"), (7, "Abc"), (6, "aBc"), (5, "abC"), (4, "ABc"), (3, "aBC"), (2, "AbC"), (1, "ABC")]
187 >>> split_uppercase_trigrams(trigs)
188 [(8, 'abc'), (7, 'abc'), (3, '⇧bc'), (3, '⇧ab'), (3, '⇗bc'), (3, '⇗ab'), (3, 'a⇧b'), (3, 'a⇗b'), (2, '⇧bc'), (2, '⇗bc'), (2, 'b⇧c'), (2, 'b⇗c'), (2, 'a⇧b'), (2, 'a⇗b'), (2, 'ab⇧'), (2, 'ab⇗'), (1, '⇧b⇧'), (1, '⇧b⇧'), (1, '⇧b⇗'), (1, '⇧b⇗'), (1, '⇧a⇧'), (1, '⇧a⇧'), (1, '⇧a⇗'), (1, '⇧a⇗'), (1, '⇧ab'), (1, '⇗b⇧'), (1, '⇗b⇧'), (1, '⇗b⇗'), (1, '⇗b⇗'), (1, '⇗a⇧'), (1, '⇗a⇧'), (1, '⇗a⇗'), (1, '⇗a⇗'), (1, '⇗ab'), (1, 'b⇧c'), (1, 'b⇧c'), (1, 'b⇧c'), (1, 'b⇗c'), (1, 'b⇗c'), (1, 'b⇗c'), (1, 'a⇧b'), (1, 'a⇧b'), (1, 'a⇗b'), (1, 'a⇗b'), (1, 'ab⇧'), (1, 'ab⇗')]
189 >>> #[(8, 'abc'), (7, '⇧ab'), (7, 'abc'), (6, '⇧bc'), (6, 'a⇧b'), (5, 'b⇧c'), (5, 'ab⇧'), (4, '⇧a⇧'), (4, 'a⇧b'), (4, '⇧bc'), (3, 'a⇧b'), (3, '⇧b⇧'), (3, 'b⇧c'), (2, '⇧ab'), (2, 'ab⇧'), (2, 'b⇧c'), (1, '⇧a⇧'), (1, 'a⇧b'), (1, '⇧b⇧'), (1, 'b⇧c')]
191 # replace uppercase by ⇧ + char1 and char1 + char2
192 upper = [(num, trig) for num, trig in trigs if not trig == trig.lower()]
193 # and remove them temporarily from the list of trigrams - don’t compare list with list, else this takes ~20min!
194 trigs = [(num, trig) for num, trig in trigs if trig == trig.lower()]
195 up = []
196 # since this gets a bit more complex and the chance to err is high,
197 # we do this dumbly, just checking for the exact cases.
198 # TODO: Do it more elegantly: Replace every uppercase letter by "⇧"+lowercase
199 # and then turn the x-gram into multiple 3grams (four[:-1], four[1:]; five… ).
200 for num, trig in upper:
201 # Abc
202 if not trig[0] == trig[0].lower() and trig[1] == trig[1].lower() and trig[2] == trig[2].lower():
203 up.append((max(1, num//2), "⇧"+trig[:2].lower()))
204 up.append((max(1, num//2), "⇗"+trig[:2].lower()))
205 up.append((num, trig.lower()))
206 # aBc
207 elif trig[0] == trig[0].lower() and not trig[1] == trig[1].lower() and trig[2] == trig[2].lower():
208 up.append((max(1, num//2), "⇧"+trig[1:].lower()))
209 up.append((max(1, num//2), "⇗"+trig[1:].lower()))
210 up.append((max(1, num//2), trig[0].lower()+"⇧"+trig[1].lower()))
211 up.append((max(1, num//2), trig[0].lower()+"⇗"+trig[1].lower()))
213 # abC
214 elif trig[0] == trig[0].lower() and trig[1] == trig[1].lower() and not trig[2] == trig[2].lower():
215 up.append((max(1, num//2), trig[:2].lower() + "⇧"))
216 up.append((max(1, num//2), trig[:2].lower() + "⇗"))
217 up.append((max(1, num//2), trig[1].lower()+"⇧"+trig[2].lower()))
218 up.append((max(1, num//2), trig[1].lower()+"⇗"+trig[2].lower()))
220 # ABc (4, '⇧a⇧'), (4, 'a⇧b'), (4, '⇧bc')
221 elif not trig[0] == trig[0].lower() and not trig[1] == trig[1].lower() and trig[2] == trig[2].lower():
222 up.append((max(1, num//4), "⇧"+trig[0].lower()+"⇧"))
223 up.append((max(1, num//2), trig[0].lower()+"⇧"+trig[1].lower()))
224 up.append((max(1, num//2), "⇧" + trig[1:].lower()))
226 up.append((max(1, num//4), "⇗"+trig[0].lower()+"⇧"))
227 up.append((max(1, num//4), "⇧"+trig[0].lower()+"⇗"))
228 up.append((max(1, num//4), "⇗"+trig[0].lower()+"⇗"))
230 up.append((max(1, num//2), trig[0].lower()+"⇗"+trig[1].lower()))
231 up.append((max(1, num//2), "⇗" + trig[1:].lower()))
233 # aBC (3, 'a⇧b'), (3, '⇧b⇧'), (3, 'b⇧c')
234 elif trig[0] == trig[0].lower() and not trig[1] == trig[1].lower() and not trig[2] == trig[2].lower():
235 up.append((max(1, num//4), "⇧"+trig[1].lower()+"⇧"))
236 up.append((max(1, num//2), trig[0].lower()+"⇧"+trig[1].lower()))
237 up.append((max(1, num//2), trig[1].lower()+"⇧"+trig[2].lower()))
239 up.append((max(1, num//4), "⇗"+trig[1].lower()+"⇧"))
240 up.append((max(1, num//4), "⇧"+trig[1].lower()+"⇗"))
241 up.append((max(1, num//4), "⇗"+trig[1].lower()+"⇗"))
243 up.append((max(1, num//2), trig[0].lower()+"⇗"+trig[1].lower()))
244 up.append((max(1, num//2), trig[1].lower()+"⇗"+trig[2].lower()))
246 # AbC (2, '⇧ab'), (2, 'ab⇧'), (2, 'b⇧c')
247 elif not trig[0] == trig[0].lower() and trig[1] == trig[1].lower() and not trig[2] == trig[2].lower():
248 up.append((max(1, num//2), "⇧" + trig[:2].lower()))
249 up.append((max(1, num//2), trig[:2].lower() + "⇧"))
250 up.append((max(1, num//2), trig[1].lower()+"⇧"+trig[2].lower()))
252 up.append((max(1, num//2), "⇗" + trig[:2].lower()))
253 up.append((max(1, num//2), trig[:2].lower() + "⇗"))
254 up.append((max(1, num//2), trig[1].lower()+"⇗"+trig[2].lower()))
256 # ABC (1, '⇧a⇧'), (1, 'a⇧b'), (1, '⇧b⇧'), (1, 'b⇧c')
257 elif not trig[0] == trig[0].lower() and not trig[1] == trig[1].lower() and not trig[2] == trig[2].lower():
258 up.append((max(1, num//4), "⇧"+trig[0].lower()+"⇧"))
259 up.append((max(1, num//2), trig[0].lower()+"⇧"+trig[1].lower()))
260 up.append((max(1, num//4), "⇧"+trig[1].lower()+"⇧"))
261 up.append((max(1, num//2), trig[1].lower()+"⇧"+trig[2].lower()))
263 up.append((max(1, num//4), "⇗"+trig[0].lower()+"⇧"))
264 up.append((max(1, num//4), "⇧"+trig[0].lower()+"⇗"))
265 up.append((max(1, num//4), "⇗"+trig[0].lower()+"⇗"))
267 up.append((max(1, num//4), "⇗"+trig[1].lower()+"⇧"))
268 up.append((max(1, num//4), "⇧"+trig[1].lower()+"⇗"))
269 up.append((max(1, num//4), "⇗"+trig[1].lower()+"⇗"))
271 up.append((max(1, num//2), trig[0].lower()+"⇗"+trig[1].lower()))
272 up.append((max(1, num//2), trig[1].lower()+"⇗"+trig[2].lower()))
275 trigs.extend(up)
276 trigs = [(int(num), r) for num, r in trigs if r[1:]]
277 trigs.sort()
278 trigs.reverse()
279 return trigs
282 def trigrams_in_file(data):
283 """Sort the trigrams in a file by the number of occurrances.
285 >>> data = read_file("testfile")
286 >>> trigrams_in_file(data)[:12]
287 [(1, '⇧aa'), (1, '⇧aa'), (1, '⇧aa'), (1, '⇧aa'), (1, '⇗aa'), (1, '⇗aa'), (1, '⇗aa'), (1, '⇗aa'), (1, 'uia'), (1, 't⇧a'), (1, 't⇧a'), (1, 't⇗a')]
289 trigs = {}
290 for i in range(len(data)-2):
291 trig = data[i] + data[i+1] + data[i+2]
292 if trig in trigs:
293 trigs[trig] += 1
294 else:
295 trigs[trig] = 1
296 sorted_trigs = [(trigs[i], i) for i in trigs]
297 sorted_trigs.sort()
298 sorted_trigs.reverse()
299 trigs = split_uppercase_trigrams(sorted_trigs)
300 return trigs
302 def trigrams_in_file_precalculated(data):
303 """Get the repeats from a precalculated file.
305 CAREFUL: SLOW!
307 >>> data = read_file("3gramme.txt")
308 >>> trigrams_in_file_precalculated(data)[:6]
309 [(5678513, 'en '), (4414826, 'er '), (2891228, ' de'), (2302691, 'der'), (2272020, 'ie '), (2039215, 'ich')]
311 trigs = [line.lstrip().split(" ", 1) for line in data.splitlines() if line.split()[1:]]
312 trigs = [(int(num), r) for num, r in trigs if r[1:]]
313 trigs = split_uppercase_trigrams(trigs)
315 return trigs
317 def letters_in_file_precalculated(data):
318 """Get the repeats from a precalculated file.
320 >>> data = read_file("1gramme.txt")
321 >>> letters_in_file_precalculated(data)[:2]
322 [(44021504, 'e'), (26999087, 'n')]
324 letters = [line.lstrip().split(" ", 1) for line in data.splitlines() if line.split()[1:]]
325 return [(int(num), let) for num, let in letters]
328 def get_all_data(data=None, letters=None, repeats=None, number_of_letters=None, number_of_bigrams=None, trigrams=None, number_of_trigrams=None):
329 """Get letters, bigrams and trigrams.
331 @param data: a string of text.
332 @return: letters, number_of_letters, bigrams, number_of_bigrams, trigrams, number_of_trigrams
334 #data = read_file("/tmp/sskreszta")
336 # if we get a datastring, we use it for everything.
337 if data is not None:
338 letters = letters_in_file(data)
339 repeats = bigrams = repeats_in_file(data)
340 trigrams = trigrams_in_file(data)
341 number_of_letters = sum([i for i, s in letters])
342 number_of_bigrams = sum([i for i, s in bigrams])
343 number_of_trigrams = sum([i for i, s in trigrams])
345 # otherwise we get the missing values from the predefined files.
346 if letters is None or number_of_letters is None:
347 letterdata = read_file("1gramme.txt")
348 letters = letters_in_file_precalculated(letterdata)
349 #letters = letters_in_file(data)
350 number_of_letters = sum([i for i, s in letters])
352 if repeats is None or number_of_bigrams is None:
353 bigramdata = read_file("2gramme.txt")
354 bigrams = repeats_in_file_precalculated(bigramdata)
355 #repeats = repeats_in_file(data)
356 number_of_bigrams = sum([i for i, s in bigrams])
357 else: bigrams = repeats
359 if trigrams is None or number_of_trigrams is None:
360 trigramdata = read_file("3gramme.txt")
361 trigrams = trigrams_in_file_precalculated(trigramdata)
362 number_of_trigrams = sum([i for i, s in trigrams])
364 return letters, number_of_letters, bigrams, number_of_bigrams, trigrams, number_of_trigrams