"""Calculate n-gram distributions (letters, bigrams, trigrams) from text, or load them from precomputed files."""
from collections import Counter

from layout_base import NEO_LAYOUT, key_to_finger, read_file, find_key
def split_uppercase_repeats(reps, layout=NEO_LAYOUT):
    """Split uppercase repeats into two to three lowercase repeats.

    Shift and the key are pressed at the same time, so a bigram containing an
    uppercase letter is expanded into the bigrams of the keys actually hit:

        Ab -> shift-b, a-b.
        aB -> a-shift, a-b.
        AB -> shift-b, a-shift, a-b, 0.5*(shift_L-shift_R, shift_R-shift_L)

    The shift used for an uppercase letter is the one on the hand opposite to
    the finger that key_to_finger reports for the lowercase key (⇗ for a key
    on the left hand, ⇧ for a key on the right hand).  If no finger is
    reported, no shift bigram is added for that letter.

    TODO: aB should be counted about 2x, Ab only 0.5 times, because shift is
    pressed and released a short time before the key.

    Remaining inaccuracy: the ⇧⇗ / ⇗⇧ bigrams of AB get half the count each
    but never less than 1, so very rare repeats are counted more strongly.

    @param reps: iterable of (count, bigram) tuples.
    @param layout: the layout passed on to find_key and key_to_finger.
    @return: list of (int(count), bigram) tuples, sorted in descending order;
             entries whose string is shorter than two characters are dropped.

    >>> reps = [(12, "ab"), (6, "Ab"), (4, "aB"), (1, "AB")]
    >>> split_uppercase_repeats(reps)
    [(12, 'ab'), (6, '⇗b'), (6, 'ab'), (4, 'a⇧'), (4, 'ab'), (1, '⇧⇗'), (1, '⇗⇧'), (1, '⇗b'), (1, 'a⇧'), (1, 'ab')]
    """
    def needs_split(rep):
        # NOTE(review): a find_key result of 1 also selects the bigram for
        # splitting - presumably it flags characters typed with shift;
        # confirm against layout_base.
        return (find_key(rep[0], layout=layout) == 1
                or find_key(rep[1], layout=layout) == 1
                or not rep == rep.lower())

    def opposing_shift(lowercase_key):
        """Return the shift on the other hand than lowercase_key, or None."""
        finger = key_to_finger(lowercase_key, layout=layout)
        if finger and finger[-1] == "L":
            return "⇗"
        if finger and finger[-1] == "R":
            return "⇧"
        return None

    # Partition in a single pass, so the (expensive) predicate runs only
    # once per bigram.
    plain = []
    upper = []
    for num, rep in reps:
        (upper if needs_split(rep) else plain).append((num, rep))

    split = []
    for num, rep in upper:
        # calculate the lowercase only once, as it grows quite expensive in the bulk
        first, second = rep[0], rep[1]
        first_lower, second_lower = first.lower(), second.lower()
        first_is_upper = first != first_lower
        second_is_upper = second != second_lower

        # AB: use both shift orders, but half weight each (never below 1).
        if first_is_upper and second_is_upper:
            half = max(1, num // 2)
            split.append((half, "⇗⇧"))
            split.append((half, "⇧⇗"))
        # Ab or AB: the shift for the first letter collides with the second key.
        if first_is_upper:
            shift = opposing_shift(first_lower)
            if shift is not None:
                split.append((num, shift + second_lower))
        # aB or AB: the first key is followed by the shift for the second letter.
        if second_is_upper:
            shift = opposing_shift(second_lower)
            if shift is not None:
                split.append((num, first_lower + shift))
        # The plain lowercase bigram is always counted.
        split.append((num, first_lower + second_lower))

    plain.extend(split)
    return sorted(((int(num), r) for num, r in plain if r[1:]), reverse=True)
def repeats_in_file(data):
    """Count the bigrams (repeats) in a text and sort them by number of occurrences.

    Uppercase bigrams are not split here, because the layout is not yet known.

    @param data: a string of text.
    @return: list of (count, bigram) tuples in descending order.

    >>> data = read_file("testfile")
    >>> repeats_in_file(data)[:3]
    [(2, 'a\\n'), (2, 'Aa'), (1, 'ui')]
    """
    # Every pair of neighbouring characters is one bigram.
    counts = Counter(first + second for first, second in zip(data, data[1:]))
    return sorted(((num, rep) for rep, num in counts.items()), reverse=True)
def split_uppercase_letters(reps, layout):
    """Split uppercase letters into two lowercase letters (with shift).

    Every uppercase letter is replaced by the shift key on the hand opposite
    to the finger key_to_finger reports for the lowercase key (⇗ for a left
    hand key, ⇧ for a right hand key) plus the lowercase letter itself.
    Letters for which no finger is reported (special letters not on keyboard
    layer 1) only contribute their lowercase form.

    @param reps: iterable of (count, letter) tuples.
    @param layout: the layout passed on to key_to_finger.
    @return: list of (int(count), letter) tuples in descending order.

    >>> letters = [(4, "a"), (3, "A")]
    >>> split_uppercase_letters(letters, layout=NEO_LAYOUT)
    [(4, 'a'), (3, '⇗'), (3, 'a')]
    """
    # Partition by the case test itself instead of checking list membership,
    # which would be quadratic in the number of letters.
    lowercase = []
    uppercase = []
    for num, rep in reps:
        (lowercase if rep == rep.lower() else uppercase).append((num, rep))

    split = []
    for num, rep in uppercase:
        lowered = rep.lower()
        fing = key_to_finger(lowered, layout=layout)
        if fing and fing[-1] == "L":
            split.append((num, "⇗"))
        elif fing and fing[-1] == "R":
            split.append((num, "⇧"))
        split.append((num, lowered))

    lowercase.extend(split)
    return sorted(((int(num), r) for num, r in lowercase), reverse=True)
def letters_in_file(data):
    """Count the letters in a text and sort them by number of occurrences.

    @param data: a string of text.
    @return: list of (count, letter) tuples in descending order.

    >>> data = read_file("testfile")
    >>> letters_in_file(data)[:3]
    [(5, 'a'), (4, '\\n'), (2, 'r')]
    """
    counts = Counter(data)
    return sorted(((num, letter) for letter, num in counts.items()), reverse=True)
def unique_sort(liste):
    """Count the occurrence of each item in a list.

    @param liste: an iterable of hashable items.
    @return: list of (count, item) tuples in ascending order.

    >>> unique_sort([1, 2, 1])
    [(1, 2), (2, 1)]
    """
    return sorted((num, item) for item, num in Counter(liste).items())
def repeats_in_file_sorted(data):
    """Count the bigrams (repeats) in a text, rarest first.

    @param data: a string of text.
    @return: the (count, bigram) tuples of repeats_in_file in ascending order.

    >>> data = read_file("testfile")
    >>> repeats_in_file_sorted(data)[:2]
    [(1, '\\na'), (1, '\\ne')]
    """
    return sorted(repeats_in_file(data))
def repeats_in_file_precalculated(data):
    """Get the repeats (bigrams) from a precalculated file.

    Each line has the form "<count> <bigram>".  Only the first space
    separates count and bigram, so the bigram itself may contain spaces.
    Lines with fewer than two whitespace separated fields and entries whose
    text is shorter than two characters are skipped.  The entries are
    returned in file order; uppercase bigrams are not split here, because
    the layout is not yet known.

    @param data: the content of the bigram file as a string.
    @return: list of (int(count), bigram) tuples.

    >>> data = read_file("2gramme.txt")
    >>> repeats_in_file_precalculated(data)[:2]
    [(10159250, 'en'), (10024681, 'er')]
    """
    fields = [line.lstrip().split(" ", 1)
              for line in data.splitlines()
              if line.split()[1:]]
    return [(int(num), rep) for num, rep in fields if rep[1:]]
def split_uppercase_trigrams(trigs):
    """Split trigrams with uppercase letters into lowercase trigrams with shifts.

    Here we don’t care about shift-collisions with the “second” letter,
    because we only use it for handswitching and the shift will always mean a
    handswitch afterwards (opposing shift). ⇒ Ab → Sh-ab, ignoring a-Sh-b.
    ⇒ for handswitching ignore trigrams with any of the shifts.

    The layout is not known here, so both shifts (⇧ and ⇗) are always used:
    a trigram with one shift gets half the count per shift, a trigram with
    two shifts a quarter of the count per shift combination.  Counts stay
    integers and never drop below 1.

    @param trigs: iterable of (count, trigram) tuples.
    @return: list of (int(count), trigram) tuples in descending order;
             entries whose string is shorter than two characters are dropped.

    >>> trigs = [(8, "abc"), (7, "Abc"), (6, "aBc"), (5, "abC"), (4, "ABc"), (3, "aBC"), (2, "AbC"), (1, "ABC")]
    >>> split_uppercase_trigrams(trigs)
    [(8, 'abc'), (7, 'abc'), (3, '⇧bc'), (3, '⇧ab'), (3, '⇗bc'), (3, '⇗ab'), (3, 'a⇧b'), (3, 'a⇗b'), (2, '⇧bc'), (2, '⇗bc'), (2, 'b⇧c'), (2, 'b⇗c'), (2, 'a⇧b'), (2, 'a⇗b'), (2, 'ab⇧'), (2, 'ab⇗'), (1, '⇧b⇧'), (1, '⇧b⇧'), (1, '⇧b⇗'), (1, '⇧b⇗'), (1, '⇧a⇧'), (1, '⇧a⇧'), (1, '⇧a⇗'), (1, '⇧a⇗'), (1, '⇧ab'), (1, '⇗b⇧'), (1, '⇗b⇧'), (1, '⇗b⇗'), (1, '⇗b⇗'), (1, '⇗a⇧'), (1, '⇗a⇧'), (1, '⇗a⇗'), (1, '⇗a⇗'), (1, '⇗ab'), (1, 'b⇧c'), (1, 'b⇧c'), (1, 'b⇧c'), (1, 'b⇗c'), (1, 'b⇗c'), (1, 'b⇗c'), (1, 'a⇧b'), (1, 'a⇧b'), (1, 'a⇗b'), (1, 'a⇗b'), (1, 'ab⇧'), (1, 'ab⇗')]
    """
    shifts = ("⇧", "⇗")
    shift_pairs = [(before, after) for before in shifts for after in shifts]

    # Partition by the case test itself - don’t compare list with list,
    # else this takes ~20min!
    lowercase = []
    uppercase = []
    for num, trig in trigs:
        (lowercase if trig == trig.lower() else uppercase).append((num, trig))

    # TODO: Do it more elegantly: Replace every uppercase letter by
    # "⇧"+lowercase and then turn the x-gram into multiple 3grams
    # (four[:-1], four[1:]; five… ).
    split = []
    for num, trig in uppercase:
        first, second, third = trig[0].lower(), trig[1].lower(), trig[2].lower()
        head = trig[:2].lower()
        tail = trig[1:].lower()
        shifted = (trig[0] != first, trig[1] != second, trig[2] != third)
        half = max(1, num // 2)
        quarter = max(1, num // 4)

        if shifted == (True, False, False):  # Abc
            for shift in shifts:
                split.append((half, shift + head))
            split.append((num, trig.lower()))
        elif shifted == (False, True, False):  # aBc
            for shift in shifts:
                split.append((half, shift + tail))
                split.append((half, first + shift + second))
        elif shifted == (False, False, True):  # abC
            for shift in shifts:
                split.append((half, head + shift))
                split.append((half, second + shift + third))
        elif shifted == (True, True, False):  # ABc
            for before, after in shift_pairs:
                split.append((quarter, before + first + after))
            for shift in shifts:
                split.append((half, first + shift + second))
                split.append((half, shift + tail))
        elif shifted == (False, True, True):  # aBC
            for before, after in shift_pairs:
                split.append((quarter, before + second + after))
            for shift in shifts:
                split.append((half, first + shift + second))
                split.append((half, second + shift + third))
        elif shifted == (True, False, True):  # AbC
            for shift in shifts:
                split.append((half, shift + head))
                split.append((half, head + shift))
                split.append((half, second + shift + third))
        elif shifted == (True, True, True):  # ABC
            for before, after in shift_pairs:
                split.append((quarter, before + first + after))
                split.append((quarter, before + second + after))
            for shift in shifts:
                split.append((half, first + shift + second))
                split.append((half, second + shift + third))

    lowercase.extend(split)
    return sorted(((int(num), r) for num, r in lowercase if r[1:]), reverse=True)
def trigrams_in_file(data):
    """Count the trigrams in a text and sort them by number of occurrences.

    Trigrams containing uppercase letters are split into lowercase trigrams
    with shift keys by split_uppercase_trigrams.

    @param data: a string of text.
    @return: the list of (count, trigram) tuples returned by
             split_uppercase_trigrams.

    >>> data = read_file("testfile")
    >>> trigrams_in_file(data)[:12]
    [(1, '⇧aa'), (1, '⇧aa'), (1, '⇧aa'), (1, '⇧aa'), (1, '⇗aa'), (1, '⇗aa'), (1, '⇗aa'), (1, '⇗aa'), (1, 'uia'), (1, 't⇧a'), (1, 't⇧a'), (1, 't⇗a')]
    """
    # Every run of three neighbouring characters is one trigram.
    counts = Counter(first + second + third
                     for first, second, third in zip(data, data[1:], data[2:]))
    sorted_trigs = sorted(((num, trig) for trig, num in counts.items()), reverse=True)
    return split_uppercase_trigrams(sorted_trigs)
def trigrams_in_file_precalculated(data):
    """Get the trigrams from a precalculated file.

    Each line has the form "<count> <trigram>".  Only the first space
    separates count and trigram, so the trigram itself may contain spaces.
    Lines with fewer than two whitespace separated fields and entries whose
    text is shorter than two characters are skipped.  Trigrams containing
    uppercase letters are then split by split_uppercase_trigrams.

    @param data: the content of the trigram file as a string.
    @return: the list of (count, trigram) tuples returned by
             split_uppercase_trigrams.

    >>> data = read_file("3gramme.txt")
    >>> trigrams_in_file_precalculated(data)[:6]
    [(5678513, 'en '), (4414826, 'er '), (2891228, ' de'), (2302691, 'der'), (2272020, 'ie '), (2039215, 'ich')]
    """
    fields = [line.lstrip().split(" ", 1)
              for line in data.splitlines()
              if line.split()[1:]]
    trigs = [(int(num), trig) for num, trig in fields if trig[1:]]
    return split_uppercase_trigrams(trigs)
def letters_in_file_precalculated(data):
    """Get the letters from a precalculated file.

    Each line has the form "<count> <letter>".  Lines with fewer than two
    whitespace separated fields are skipped.  The entries are returned in
    file order.

    @param data: the content of the letter file as a string.
    @return: list of (int(count), letter) tuples.

    >>> data = read_file("1gramme.txt")
    >>> letters_in_file_precalculated(data)[:2]
    [(44021504, 'e'), (26999087, 'n')]
    """
    fields = [line.lstrip().split(" ", 1)
              for line in data.splitlines()
              if line.split()[1:]]
    return [(int(num), letter) for num, letter in fields]
def get_all_data(data=None, letters=None, repeats=None, number_of_letters=None, number_of_bigrams=None, trigrams=None, number_of_trigrams=None):
    """Get letters, bigrams and trigrams.

    If a text is given, all distributions and their totals are calculated
    from it.  Otherwise every distribution that is missing - or whose total
    is missing - is loaded from the precomputed files 1gramme.txt,
    2gramme.txt and 3gramme.txt and its total is recalculated.  Note that a
    distribution passed in without its total is therefore replaced by the
    precomputed one.

    @param data: a string of text.
    @param letters: list of (count, letter) tuples.
    @param repeats: list of (count, bigram) tuples.
    @param number_of_letters: the sum of the letter counts.
    @param number_of_bigrams: the sum of the bigram counts.
    @param trigrams: list of (count, trigram) tuples.
    @param number_of_trigrams: the sum of the trigram counts.
    @return: letters, number_of_letters, bigrams, number_of_bigrams, trigrams, number_of_trigrams
    """
    # if we get a datastring, we use it for everything.
    if data is not None:
        letters = letters_in_file(data)
        repeats = repeats_in_file(data)
        trigrams = trigrams_in_file(data)
        number_of_letters = sum(num for num, _ in letters)
        number_of_bigrams = sum(num for num, _ in repeats)
        number_of_trigrams = sum(num for num, _ in trigrams)

    # otherwise we get the missing values from the predefined files.
    if letters is None or number_of_letters is None:
        letters = letters_in_file_precalculated(read_file("1gramme.txt"))
        number_of_letters = sum(num for num, _ in letters)

    if repeats is None or number_of_bigrams is None:
        bigrams = repeats_in_file_precalculated(read_file("2gramme.txt"))
        number_of_bigrams = sum(num for num, _ in bigrams)
    else:
        bigrams = repeats

    if trigrams is None or number_of_trigrams is None:
        trigrams = trigrams_in_file_precalculated(read_file("3gramme.txt"))
        number_of_trigrams = sum(num for num, _ in trigrams)

    return letters, number_of_letters, bigrams, number_of_bigrams, trigrams, number_of_trigrams