4 """Check how much a given text diverges from a 1gram, 2gram and 3gram frequency.
6 usage: ./textcheck.py <textfile to check> [--best-lines]
--best-lines: check each line and return the 10 most similar lines.
10 idea: allow selecting different 1gram, 2gram and 3gram files.
def read_file(path):
    """Get the data from a file as one string.

    The file is always decoded as UTF-8 instead of the locale's default
    encoding, because the data handled by this script contains non-ASCII
    characters such as '⇧'.

    @param path: path of the text file to read.
    @return: the complete decoded content of the file.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read()
def read_file_lines(path):
    """Get the data from a file as a list of lines.

    The file is always decoded as UTF-8 instead of the locale's default
    encoding. Every returned line keeps its trailing line break.

    @param path: path of the text file to read.
    @return: list with one string per line of the file.
    """
    with open(path, encoding="utf-8") as f:
        return f.readlines()
def letters_in_file(data):
    """Count the occurrences of every letter in the data.

    >>> letters_in_file("aab") == {'a': 2, 'b': 1}
    True

    @param data: the text to analyze.
    @return: dict with the letters as keys and their number of occurrences
        as values.
    """
    from collections import Counter

    # A plain dict is returned so callers get the same type as from the
    # *_precalculated readers.
    return dict(Counter(data))
def letters_in_file_precalculated(data):
    """Get the letter counts from the content of a precalculated file.

    Every usable line has the form "<count> <letter>". Lines which do not
    contain at least two whitespace separated fields are ignored.

    >>> letters_in_file_precalculated("44034982 e\\n27012723 n\\n") == {'e': 44034982, 'n': 27012723}
    True

    @param data: the content of the 1gram file as one string.
    @return: dict with the letters as keys and their counts (int) as values.
    """
    letters = {}
    for line in data.splitlines():
        # Skip blank lines and lines which only carry a number.
        if not line.split()[1:]:
            continue
        # Split only at the first space, so the letter keeps its own spaces.
        num, letter = line.lstrip().split(" ", 1)
        letters[letter] = int(num)
    return letters
def repeats_in_file(data):
    """Count the occurrences of every pair of neighboring letters (2gram).

    >>> repeats_in_file("aaab") == {'aa': 2, 'ab': 1}
    True

    @param data: the text to analyze.
    @return: dict with the 2grams as keys and their number of occurrences
        as values. Empty if data has fewer than two letters.
    """
    from collections import Counter

    # Pairing data with itself shifted by one yields every overlapping 2gram.
    pairs = (first + second for first, second in zip(data, data[1:]))
    return dict(Counter(pairs))
def repeats_in_file_precalculated(data):
    """Get the 2gram counts from the content of a precalculated file.

    Every usable line has the form "<count> <2gram>". Lines which do not
    contain at least two whitespace separated fields are ignored, as are
    entries shorter than two characters.

    >>> repeats_in_file_precalculated("10162743 en\\n10028050 er\\n") == {'en': 10162743, 'er': 10028050}
    True

    @param data: the content of the 2gram file as one string.
    @return: dict with the 2grams as keys and their counts (int) as values.
    """
    reps = {}
    for line in data.splitlines():
        # Skip blank lines and lines which only carry a number.
        if not line.split()[1:]:
            continue
        # Split only at the first space: the 2gram itself may contain spaces.
        num, rep = line.lstrip().split(" ", 1)
        # Ignore entries which are too short to be a 2gram.
        if rep[1:]:
            reps[rep] = int(num)
    return reps
def trigrams_in_file(data):
    """Count the occurrences of every run of three neighboring letters (3gram).

    >>> trigrams_in_file("abab") == {'aba': 1, 'bab': 1}
    True

    @param data: the text to analyze.
    @return: dict with the 3grams as keys and their number of occurrences
        as values. Empty if data has fewer than three letters.
    """
    from collections import Counter

    # Zipping data with itself shifted by one and by two yields every
    # overlapping 3gram.
    triples = (a + b + c for a, b, c in zip(data, data[1:], data[2:]))
    return dict(Counter(triples))
def trigrams_in_file_precalculated(data):
    """Get the 3gram counts from the content of a precalculated file.

    Every usable line has the form "<count> <3gram>". Lines which do not
    contain at least two whitespace separated fields are ignored, as are
    entries shorter than two characters.

    >>> trigrams_in_file_precalculated("5679632 en \\n2891983  de\\n") == {'en ': 5679632, ' de': 2891983}
    True

    @param data: the content of the 3gram file as one string.
    @return: dict with the 3grams as keys and their counts (int) as values.
    """
    trigs = {}
    for line in data.splitlines():
        # Skip blank lines and lines which only carry a number.
        if not line.split()[1:]:
            continue
        # Split only at the first space: 3grams like ' de' start with a space.
        num, trig = line.lstrip().split(" ", 1)
        # NOTE(review): this only rejects entries shorter than two
        # characters, so two-letter entries are accepted as well. Confirm
        # whether a minimum length of three was intended.
        if trig[1:]:
            trigs[trig] = int(num)
    return trigs
def normalize_occurrence_dict(d):
    """normalize a dict with keys and assorted occurrence numbers.

    ⇒ sum([d[t] for t in d]) == 1.0 (up to floating point rounding)

    The given dict is not changed. If the occurrence numbers sum up to
    zero, there is nothing to scale and every key gets the value 0.0.

    >>> normalize_occurrence_dict({'a': 1, 'b': 3}) == {'a': 0.25, 'b': 0.75}
    True

    @param d: dict with occurrence numbers as values.
    @return: new dict with the same keys and the relative frequencies as
        values.
    """
    total = sum(d.values())
    if not total:
        # Avoid dividing by zero for all-zero (or empty) input.
        return {key: 0.0 for key in d}
    return {key: count / total for key, count in d.items()}
def occurrence_dict_difference(d1, d2):
    """Get the difference between two occurrence dicts.

    A key which is missing in one of the dicts counts as 0 there.

    TODO: Evaluate which difference calculation would be best.

    >>> occurrence_dict_difference({'a': 0.5}, {'a': 0.25, 'b': 0.75}) == {'a': 0.25, 'b': 0.75}
    True

    @param d1: first occurrence dict.
    @param d2: second occurrence dict.
    @return: dict with all keys (in d1 or in d2) and the difference as value."""
    # All keys from d1: compare against d2, treating missing keys as 0.
    diff = {key: abs(value - d2.get(key, 0)) for key, value in d1.items()}
    # add all from d2 which are not in d1
    diff.update((key, abs(value)) for key, value in d2.items() if key not in d1)
    return diff
def check_dissimilarity(txt_1grams, txt_2grams, txt_3grams, ref_1grams, ref_2grams, ref_3grams):
    """check the similarity of the txt and the ref (-erence).

    Each txt dict and its ref dict are normalized and compared key by key.
    The result per n-gram size is half the sum of the absolute frequency
    differences: 0 for identical distributions, 1 for two non-empty
    distributions which share no key.

    @param txt_1grams: occurrence dict of the 1grams of the text.
    @param txt_2grams: occurrence dict of the 2grams of the text.
    @param txt_3grams: occurrence dict of the 3grams of the text.
    @param ref_1grams: occurrence dict of the 1grams of the reference.
    @param ref_2grams: occurrence dict of the 2grams of the reference.
    @param ref_3grams: occurrence dict of the 3grams of the reference.
    @return: tuple (1gram dissimilarity, 2gram dissimilarity, 3gram dissimilarity).
    """
    pairs = (
        (txt_1grams, ref_1grams),
        (txt_2grams, ref_2grams),
        (txt_3grams, ref_3grams),
    )
    dissimilarities = []
    for txt, ref in pairs:
        # normalize both dicts, so texts of different length are comparable
        diff = occurrence_dict_difference(
            normalize_occurrence_dict(txt), normalize_occurrence_dict(ref))
        dissimilarities.append(0.5 * sum(diff.values()))
    return tuple(dissimilarities)
def cost(text, diff123):
    """Cost for a text with the three differences (1gram, 2gram, 3gram).

    @param text: the text which was checked; only its length is used.
    @param diff123: the three dissimilarities (1gram, 2gram, 3gram).
    @return: the sum of the differences, scaled by a length factor.
    """
    #: prefer shorter text: the factor is 100 + 3 * log2(length), so every
    #: doubling of the length adds 3 to the factor.
    # An empty text is treated like a text of length 1, because log(0) is
    # not defined.
    length_factor = 100 + 3 * log(max(len(text), 1), 2)
    return sum(diff123) * length_factor
def shorten(text, max_len=270):
    """shorten a line, breaking at a sentence-end, if possible, and otherwise at word-end.

    The text is cut after the last sentence-end within the first max_len
    characters. If there is none, it is cut before the last space. If the
    result would be shorter than half of max_len, the text is cut hard at
    max_len instead.

    >>> shorten("abc. def ghi", max_len=8)
    'abc.'
    >>> shorten("ab cd efgh", max_len=8)
    'ab cd'

    @param text: the line to shorten.
    @param max_len: the maximum length of the result.
    @return: the shortened text, at most max_len characters long.
    """
    end = "."
    space = " "
    head = text[:max_len]

    cut = head.rfind(end)
    if cut != -1:
        # keep the sentence-end itself
        shorted = text[:cut + 1]
    else:
        cut = head.rfind(space)
        # drop the space at which the text is cut
        shorted = head if cut == -1 else text[:cut]

    # Do not throw away more than half of the allowed length.
    if len(shorted) >= max_len / 2:
        return shorted
    return head
def run(textfile, best_lines=False, max_len=270):
    """Check the textfile against the reference n-gram files and print the result.

    The reference data is read from "1gramme.txt", "2gramme.txt" and
    "3gramme.txt", looked up relative to the current working directory.

    Without best_lines the whole text is checked and its cost and the three
    dissimilarities are printed. With best_lines every line is shortened to
    max_len, checked on its own and printed. The 10 lines with the lowest
    cost are printed again at the end, lowest cost first.

    @param textfile: path of the text file to check.
    @param best_lines: check each line on its own and report the best 10.
    @param max_len: maximum length of a checked line (only with best_lines).
    @return: None. All results are printed.
    """
    # reference data: (1grams, 2grams, 3grams)
    references = (
        letters_in_file_precalculated(read_file("1gramme.txt")),
        repeats_in_file_precalculated(read_file("2gramme.txt")),
        trigrams_in_file_precalculated(read_file("3gramme.txt")),
    )

    def score(text):
        """Return (cost, dissimilarities) of the text against the references."""
        diss = check_dissimilarity(
            letters_in_file(text), repeats_in_file(text), trigrams_in_file(text),
            *references)
        return cost(text, diss), diss

    if not best_lines:
        data = read_file(textfile)
        total, diss = score(data)
        print(total, diss)
        return

    best_10 = []  # [(sum, (1, 2, 3), text), …], lowest cost first
    for raw_line in read_file_lines(textfile):
        line = shorten(raw_line, max_len=max_len)
        if not line:
            # nothing to check, and the cost of an empty text is undefined
            continue
        total, diss = score(line)
        # Take the line while the list is not full or if it beats the
        # currently worst entry.
        if len(best_10) < 10 or total < best_10[-1][0]:
            best_10.append((total, diss, line))
            best_10.sort()
            del best_10[10:]
            print("\n### new top 10:", total, diss, line, "\n")
        print(total, diss, line)

    print("\n### best 10 lines ###\n")
    for total, diss, line in best_10:
        print("### best:", total, diss, line)
239 if __name__
== "__main__":
242 from doctest
import testmod
250 if "--best-lines" in argv
:
258 run(textfile
, best_lines
=LINES
)