modified: n.fq
[GalaxyCodeBases.git] / tools / distribution / distribution.py
blobe2cbb26a94c0f5d315b0e8005a272423886b285f
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 # vim: set noexpandtab sw=4 ts=4:
4 # --
5 # A recent battle with vim and a Go program finally settled this for me.
6 # Tabs for indent, spaces for formatting. If you change your shiftwidth and
7 # tabstop to different values and your code looks ugly, say aloud: tabs
8 # for indent, spaces for formatting.
10 """
11 Generate Graphs Directly in the (ASCII- or Unicode-based) Terminal
13 If you find yourself typing:
14 [long | list | of | commands | sort | uniq -c | sort -rn]
16 Replace:
17 [| sort | uniq -c | sort -rn]
19 With:
20 [| distribution]
22 Then bask in the glory of your new-found data visualization. There are other
23 use cases as well.
24 """
26 import math,os,re,sys,time
class Histogram(object):
    """
    Renders a token-frequency dict as a text histogram.

    Takes the tokenDict built by the InputReader class and prints one
    histogram row for each of the highest-count entries, up to the
    configured height.
    """
    def __init__(self):
        pass

    def histogram_bar(self, s, histWidth, maxVal, barVal):
        """
        Return the bar string for a single histogram row.

        s         -- settings object; reads charWidth, graphChars,
                     histogramChar, unicodeMode and logarithmic
        histWidth -- number of character cells available for the bar body
        maxVal    -- the value that corresponds to a full-width bar
        barVal    -- the value to draw

        A non-positive maxVal (or, in logarithmic mode, a maxVal whose
        logarithm is not positive) no longer raises; the bar is drawn empty,
        or full when barVal reaches maxVal in logarithmic mode.
        """
        # pick the character for the body of the bar (zeroChar) and the
        # character that terminates it (oneChar)
        if s.charWidth < 1:
            # partial-width mode: body uses the last (widest) glyph, the
            # tail glyph is chosen from the remainder further down
            zeroChar = s.graphChars[-1]
            oneChar = ''
        elif len(s.histogramChar) > 1 and not s.unicodeMode:
            zeroChar = s.histogramChar[0]
            oneChar = s.histogramChar[1]
        else:
            zeroChar = s.histogramChar
            oneChar = s.histogramChar

        # scale the value to a (fractional) number of character cells
        if s.logarithmic:
            maxLog = math.log(maxVal) if maxVal > 0 else 0
            barLog = math.log(barVal) if barVal > 0 else 0
            if maxLog > 0:
                scaled = barLog / maxLog * histWidth
            elif maxVal > 0 and barVal >= maxVal:
                # log(maxVal) is unusable as a divisor (e.g. every count is
                # 1): draw entries that reach the maximum at full width
                scaled = float(histWidth)
            else:
                scaled = 0.0
        elif maxVal > 0:
            scaled = barVal * 1.0 / maxVal * histWidth
        else:
            # nothing positive to scale against
            scaled = 0.0

        intWidth = int(scaled)
        remainderWidth = scaled - intWidth

        # write the zeroeth character intWidth times...
        returnBar = zeroChar * intWidth

        # we always have at least one remaining char for histogram - if
        # we have full-width chars, then just print it, otherwise do a
        # calculation of how much remainder we need to print
        #
        # FIXME: The remainder partial char printed does not take into
        # account logarithmic scale (can humans notice?).
        if s.charWidth == 1:
            returnBar += oneChar
        elif s.charWidth < 1:
            # this is high-resolution, so figure out what remainder we
            # have to represent
            if remainderWidth > s.charWidth:
                whichChar = int(remainderWidth / s.charWidth)
                # clamp so float rounding can never index past the glyphs
                whichChar = min(whichChar, len(s.graphChars) - 1)
                returnBar += s.graphChars[whichChar]

        return returnBar

    def write_hist(self, s, tokenDict):
        """
        Print the histogram for tokenDict.

        The header line and (when s.verbose is set) the statistics go to
        stderr; the histogram rows go to stdout. At most s.height rows are
        printed, highest count first, ties broken by key. Falsy keys (such
        as the empty string) are skipped.

        Side effects on s: totalValues is coerced to int and endTime is set.
        """
        s.totalValues = int(s.totalValues)

        # given a dict, create a comparison tuple that sorts first by the
        # value of a key, then by the key itself in case of a tie. this gives
        # deterministic output when several entries share a frequency.
        def value_key_compare(counts):
            return lambda key: (counts.get(key), key)

        # collect the rows to print, already in display order
        rows = []
        maxTokenLen = 0
        maxVal = 0
        for k in sorted(tokenDict, key=value_key_compare(tokenDict), reverse=True):
            if not k:
                continue
            count = tokenDict[k]
            rows.append((k, count))
            maxTokenLen = max(maxTokenLen, len(str(k)))
            maxVal = max(maxVal, count)
            if len(rows) >= s.height:
                break

        s.endTime = int(time.time() * 1000)
        totalMillis = s.endTime - s.startTime
        if s.verbose:
            sys.stderr.write("tokens/lines examined: {:,d}".format(s.totalObjects) + "\n")
            sys.stderr.write(" tokens/lines matched: {:,d}".format(s.totalValues) + "\n")
            sys.stderr.write(" histogram keys: {:,d}".format(len(tokenDict)) + "\n")
            sys.stderr.write(" runtime: {:,.2f}ms".format(totalMillis) + "\n")

        if not rows:
            return

        def percent(count):
            # a zero total (e.g. pre-tallied input whose counts are all 0)
            # would otherwise divide by zero
            if s.totalValues == 0:
                return "(%2.2f%%)" % 0.0
            return "(%2.2f%%)" % (count * 1.0 / s.totalValues * 100)

        # the first (largest) entry determines the column widths
        topCount = rows[0][1]
        maxValueWidth = len("%s" % topCount)
        maxPctWidth = len(percent(topCount))

        # we always output a single histogram char at the end, so
        # we output one less than actual number here
        histWidth = s.width - (maxTokenLen+1) - (maxValueWidth+1) - (maxPctWidth+1) - 1

        # output a header
        sys.stderr.write("Key".rjust(maxTokenLen) + "|")
        sys.stderr.write("Ct".ljust(maxValueWidth) + " ")
        sys.stderr.write("(Pct)".ljust(maxPctWidth) + " ")
        sys.stderr.write("Histogram")

        # get ready for the output, but sorting gets hosed if we print the
        # colour code before the key, so put it on the line before
        sys.stderr.write(s.keyColour)
        sys.stderr.write("\n")

        lastIndex = len(rows) - 1
        for i, (k, count) in enumerate(rows):
            sys.stdout.write(str(k).rjust(maxTokenLen))
            sys.stdout.write(s.regularColour)
            sys.stdout.write("|")
            sys.stdout.write(s.ctColour)

            outVal = "%s" % count
            sys.stdout.write(outVal.rjust(maxValueWidth) + " ")

            sys.stdout.write(s.pctColour)
            sys.stdout.write(percent(count).rjust(maxPctWidth) + " ")

            sys.stdout.write(s.graphColour)
            sys.stdout.write(self.histogram_bar(s, histWidth, maxVal, count))

            if i == lastIndex:
                # put the terminal back into a normal-colour mode on last entry
                sys.stdout.write(s.regularColour)
            else:
                # we do these antics of printing $keyColour on the line before
                # the key so that piping output to sort will work
                sys.stdout.write(s.keyColour)
            sys.stdout.write("\n")
class InputReader(object):
    """
    Reads stdin and parses it into a dictionary whose keys are tokens and
    whose values are the number of appearances of that key in the input.
    The token frequency dict is also pruned after a certain number of
    insertions to prevent OOME on large datasets.
    """
    def __init__(self):
        self.tokenDict = {}

    def prune_keys(self, s):
        """
        Shrink self.tokenDict to its s.maxKeys highest-count keys.

        Falsy keys (such as the empty string) are dropped. Increments
        s.numPrunes.
        """
        newDict = {}
        for k in sorted(self.tokenDict, key=self.tokenDict.get, reverse=True):
            if not k:
                continue
            # stop once exactly maxKeys keys have been kept
            if len(newDict) >= s.maxKeys:
                break
            newDict[k] = self.tokenDict[k]
        self.tokenDict = newDict
        s.numPrunes += 1

    def tokenize_input(self, s):
        """
        Tally tokens (or whole lines) read from stdin into self.tokenDict.

        Reads s.tokenize (split regexp; empty means one token per line) and
        s.matchRegexp (filter regexp), expanding their shorthand names in
        place on s. Updates s.totalObjects for every token examined and
        s.totalValues for every token that passes the filter.
        """
        # how to split the input... typically we split on whitespace or
        # word boundaries, but the user can specify any regexp
        if s.tokenize == 'white': s.tokenize = r'\s+'
        elif s.tokenize == 'word': s.tokenize = r'\W'

        # how to match (filter) the input... typically we want either
        # all-alpha or all-numeric, but again, user can specify
        # NOTE(review): the 'word' class also admits a literal comma -
        # confirm whether that is intended before tightening it.
        if s.matchRegexp == 'word': s.matchRegexp = r'^[A-Z,a-z]+$'
        elif s.matchRegexp == 'num': s.matchRegexp = r'^\d+$'
        elif s.matchRegexp == 'number': s.matchRegexp = r'^\d+$'

        # docs say these are cached, but i got about 2x speed boost
        # from doing the compile
        pt = re.compile(s.tokenize)
        pm = re.compile(s.matchRegexp)

        nextStat = time.time() + s.statInterval

        pruneObjects = 0
        for line in sys.stdin:
            line = line.rstrip('\n')
            # either break the line into tokens, or treat the whole line
            # as a single token
            tokens = pt.split(line) if s.tokenize else [line]
            for token in tokens:
                s.totalObjects += 1
                if pm.match(token):
                    s.totalValues += 1
                    pruneObjects += 1
                    self.tokenDict[token] = self.tokenDict.get(token, 0) + 1

            # prune the hash if it gets too large
            if pruneObjects >= s.keyPruneInterval:
                self.prune_keys(s)
                pruneObjects = 0

            if s.verbose and time.time() > nextStat:
                sys.stderr.write("tokens/lines examined: {:,d} ; hash prunes: {:,d}...".format(s.totalObjects, s.numPrunes) + chr(13))
                nextStat = time.time() + s.statInterval

    def read_pretallied_tokens(self, s):
        """
        Read "count key" (vk) or "key count" (kv) lines from stdin.

        The input is already a series of keys with precomputed frequencies,
        as in "du -sb". s.graphValues selects the column order; any other
        value reads nothing. Lines that do not fit the pattern are reported
        on stderr and discarded. Counts for a key that appears more than
        once are added together, so the totals and percentages agree.
        """
        if s.graphValues == 'vk':
            pattern = re.compile(r'^\s*(\d+)\s+(.+)$')
            keyGroup, valueGroup = 2, 1
            errorFormat = " E Input malformed+discarded (perhaps pass -g=kv?): %s\n"
        elif s.graphValues == 'kv':
            pattern = re.compile(r'^(.+?)\s+(\d+)$')
            keyGroup, valueGroup = 1, 2
            errorFormat = " E Input malformed+discarded (perhaps pass -g=vk?): %s\n"
        else:
            return

        for line in sys.stdin:
            m = pattern.match(line)
            if m is None:
                sys.stderr.write(errorFormat % line)
                continue
            key = m.group(keyGroup)
            value = int(m.group(valueGroup))
            self.tokenDict[key] = self.tokenDict.get(key, 0) + value
            s.totalValues += value
            s.totalObjects += 1

    def read_numerics(self, s, h):
        """
        Read one number per line from stdin and print a simple graph.

        In this special mode the histogram is printed here instead of
        later, because it is a far simpler histogram without the totals of
        the real one: just a graph of a series of numbers. With s.numOnly
        set to 'mon' the differences between consecutive values are graphed
        instead of the values themselves. Lines that are not numbers reuse
        the last value seen in 'mon' mode, and count as 0 otherwise.

        h supplies histogram_bar() for drawing; s.totalObjects is
        incremented per line read.
        """
        lastVal = 0
        maxVal = 0
        sumVal = 0
        outList = []
        for line in sys.stdin:
            try:
                value = float(line.rstrip())
            except ValueError:
                value = lastVal

            if s.numOnly == 'mon':
                # the first value has nothing to be compared against
                graphVal = value - lastVal if s.totalObjects > 0 else 0
                lastVal = value
            else:
                graphVal = value

            if graphVal > maxVal:
                maxVal = graphVal

            sumVal += int(graphVal)

            # NOTE(review): the first line is never graphed, in either
            # mode, although it still feeds maxVal and sumVal - preserved
            # as-is; confirm whether that is wanted outside 'mon' mode.
            if s.totalObjects > 0:
                outList.append(graphVal)
            s.totalObjects += 1

        # size the value column from what is actually printed (integers),
        # not from the float representation of the largest value
        maxWidth = 0
        for k in outList:
            maxWidth = max(maxWidth, len(str(int(k))))

        # simple graphical output
        for k in outList:
            sys.stdout.write(s.keyColour)
            sys.stdout.write(str(int(k)).rjust(maxWidth))
            # a zero sum would otherwise divide by zero
            if sumVal:
                pct = "(%2.2f%%)" % (float(k) / float(sumVal) * 100)
            else:
                pct = "(%2.2f%%)" % 0.0
            sys.stdout.write(s.pctColour)
            sys.stdout.write(pct.rjust(9) + " ")
            sys.stdout.write(s.graphColour)
            sys.stdout.write(h.histogram_bar(s, s.width - 11 - maxWidth, maxVal, k) + "\n")
            sys.stdout.write(s.regularColour)
class Settings(object):
    """
    Run-time configuration, statistics counters and colour codes.

    Constructing an instance sets the defaults, then reads options from an
    rcfile (if one exists) and from sys.argv, and finally derives the
    dependent values (dimensions, colour escape codes, graph characters).
    Malformed options are reported on stderr and ignored instead of
    aborting with a traceback.
    """

    def __init__(self):
        self.totalMillis = 0
        self.startTime = int(time.time() * 1000)
        self.endTime = 0
        self.widthArg = 0
        self.heightArg = 0
        self.width = 80
        self.height = 15
        self.histogramChar = '-'
        self.colourisedOutput = False
        self.logarithmic = False
        # 'XXX' means "--numonly was not given"
        self.numOnly = 'XXX'
        self.verbose = False
        self.graphValues = ''
        self.size = ''
        self.tokenize = ''
        # by default, everything matches (nothing is stripped out)
        self.matchRegexp = '.'
        # how often to give status if verbose
        self.statInterval = 1.0
        self.numPrunes = 0
        # for colourised output
        self.colourPalette = '0,0,32,35,34'
        self.regularColour = ""
        self.keyColour = ""
        self.ctColour = ""
        self.pctColour = ""
        self.graphColour = ""
        # for stats
        self.totalObjects = 0
        self.totalValues = 0
        # every keyPruneInterval keys, prune the hash to maxKeys top keys
        self.keyPruneInterval = 1500000
        self.maxKeys = 5000
        # for advanced graphing
        self.unicodeMode = False
        self.charWidth = 1
        self.graphChars = []
        self.partialBlocks = ["▏", "▎", "▍", "▌", "▋", "▊", "▉", "█"] # char=pb
        self.partialLines = ["╸", "╾", "━"] # char=hl

        self._load_rcfile_options()
        self._parse_arguments()
        self._resolve_dimensions()
        self._normalize_numonly()
        self._resolve_colours()
        self._resolve_histogram_char()

    def _load_rcfile_options(self):
        # Insert the options found in the rcfile at the front of sys.argv.
        rcFile = os.path.join(os.path.expanduser('~'), '.distributionrc')
        if len(sys.argv) > 1:
            name, sep, value = sys.argv[1].partition('=')
            if name == '--rcfile' and value:
                rcFile = os.path.expanduser(value)

        try:
            with open(rcFile) as rcHandle:
                rcfileOptList = rcHandle.readlines()
        except (IOError, OSError, UnicodeError):
            # don't die or in fact do anything if rcfile can't be read
            return

        for rcOpt in rcfileOptList:
            # drop comments, then surrounding whitespace
            rcOpt = rcOpt.split('#')[0].strip()
            if rcOpt != '':
                # inserted at the front so the options typed on the command
                # line are parsed later and win
                sys.argv.insert(0, rcOpt)

    def _parse_arguments(self):
        # Apply every recognised option in sys.argv; ignore everything else.
        valuedOptions = ("-w", "--width", "--height", "-k", "--keys",
                         "--char", "-p", "--palette", "-s", "--size",
                         "-t", "--tokenize", "-m", "--match")
        # manual argument parsing easier than getopts IMO
        for arg in sys.argv:
            if arg in ('-h', '--help'):
                doUsage(self)
                sys.exit(0)
            elif arg in ("-c", "--color", "--colour"):
                self.colourisedOutput = True
            elif arg in ("-g", "--graph"):
                # can pass --graph without option, will default to value/key ordering
                # since Unix prefers that for piping-to-sort reasons
                self.graphValues = 'vk'
            elif arg in ("-l", "--logarithmic"):
                self.logarithmic = True
            elif arg in ("-n", "--numonly"):
                self.numOnly = 'abs'
            elif arg in ("-v", "--verbose"):
                self.verbose = True
            else:
                name, sep, value = arg.partition('=')
                if not sep:
                    if name in valuedOptions:
                        sys.stderr.write(" E Option %s needs a value (use %s=<value>); ignored\n" % (name, name))
                    continue
                try:
                    self._set_valued_option(name, value)
                except ValueError:
                    sys.stderr.write(" E Option %s expects an integer, got '%s'; ignored\n" % (name, value))

    def _set_valued_option(self, name, value):
        # Store one name=value option; raises ValueError on a bad integer.
        if name in ("-w", "--width"):
            self.widthArg = int(value)
        elif name in ("-h", "--height"):
            self.heightArg = int(value)
        elif name in ("-k", "--keys"):
            self.maxKeys = int(value)
        elif name in ("-c", "--char"):
            self.histogramChar = value
        elif name in ("-g", "--graph"):
            self.graphValues = value
        elif name in ("-n", "--numonly"):
            self.numOnly = value
        elif name in ("-p", "--palette"):
            self.colourPalette = value
            self.colourisedOutput = True
        elif name in ("-s", "--size"):
            self.size = value
        elif name in ("-t", "--tokenize"):
            self.tokenize = value
        elif name in ("-m", "--match"):
            self.matchRegexp = value

    def _resolve_dimensions(self):
        # Work out width, height and maxKeys from --size/--width/--height.
        # first, size, which might be further overridden by width/height later
        if self.size in ("full", "fl", "f"):
            # tput will tell us the term width/height even if input is stdin
            try:
                cols, lines = os.popen('echo "`tput cols` `tput lines`"', 'r').read().split()
                # convert to numerics from string
                self.width = int(cols)
                self.height = int(lines) - 3
            except ValueError:
                # tput printed nothing usable; the minimums below apply
                self.width = 0
                self.height = 0
            # need room for the verbosity output
            if self.verbose: self.height -= 4
            # in case tput went all bad, ensure some minimum size
            if self.width < 40: self.width = 40
            if self.height < 10: self.height = 10
        elif self.size in ("small", "sm", "s"):
            self.width = 60
            self.height = 10
        elif self.size in ("medium", "med", "m"):
            self.width = 100
            self.height = 20
        elif self.size in ("large", "lg", "l"):
            self.width = 140
            self.height = 35

        # override variables if they were explicitly given
        if self.widthArg != 0: self.width = self.widthArg
        if self.heightArg != 0: self.height = self.heightArg

        # maxKeys should be at least a few thousand greater than height to reduce odds
        # of throwing away high-count values that appear sparingly in the data
        if self.maxKeys < self.height + 3000:
            self.maxKeys = self.height + 3000
            if self.verbose: sys.stderr.write("Updated maxKeys to %d (height + 3000)\n" % self.maxKeys)

    def _normalize_numonly(self):
        # Collapse the --numonly synonyms to 'mon' or 'abs'.
        firstLetter = self.numOnly[:1]
        # synonyms "monotonically-increasing": derivative, difference, delta, increasing
        # so all "d" "i" and "m" words will be graphing those differences
        if firstLetter in ('d', 'i', 'm'):
            self.numOnly = 'mon'
        # synonyms "actual values": absolute, actual, number, normal, noop,
        # so all "a" and "n" words will graph straight up numbers
        elif firstLetter in ('a', 'n'):
            self.numOnly = 'abs'

    def _resolve_colours(self):
        # Turn the palette into the five ANSI escape sequences.
        if not self.colourisedOutput:
            return
        cl = self.colourPalette.split(',')
        if len(cl) != 5:
            sys.stderr.write(" E Palette needs 5 comma-separated values, got '%s'; using default\n" % self.colourPalette)
            cl = '0,0,32,35,34'.split(',')
        # ANSI color code is ESC+[+NN+m where ESC=chr(27), [ and m are
        # the literal characters, and NN is a two-digit number, typically
        # from 31 to 37 - why is this knowledge still useful in 2014?
        cl = [chr(27) + '[' + e + 'm' for e in cl]
        (self.regularColour, self.keyColour, self.ctColour, self.pctColour, self.graphColour) = cl

    def _resolve_histogram_char(self):
        # Expand --char shorthands and set up the partial-width modes.
        # some useful ASCII-->utf-8 substitutions
        substitutions = {
            "ba": "▬",
            "bl": "Ξ",
            "em": "—",
            "me": "⋯",
            "di": "♦",
            "dt": "•",
            "sq": "□",
        }
        if self.histogramChar in substitutions:
            self.unicodeMode = True
            self.histogramChar = substitutions[self.histogramChar]

        # sub-full character width graphing systems
        if self.histogramChar == "pb":
            self.charWidth = 0.125
            self.graphChars = self.partialBlocks
        elif self.histogramChar == "pl":
            self.charWidth = 0.3334
            self.graphChars = self.partialLines

        # detect whether the user has passed a multibyte unicode character directly as the histogram char
        if self.histogramChar and ord(self.histogramChar[0]) >= 128:
            self.unicodeMode = True
def doUsage(s):
    """
    Print the usage/help text to stdout.

    s -- settings object; only s.keyPruneInterval is read, to report how
         often the key hash is pruned.

    The text names the script using the module-level scriptName.
    """
    usageLines = [
        "",
        "usage: <commandWithOutput> | %s" % (scriptName),
        " [--rcfile=<rcFile>]",
        " [--size={sm|med|lg|full} | --width=<width> --height=<height>]",
        " [--color] [--palette=r,k,c,p,g]",
        " [--tokenize=<tokenChar>]",
        " [--graph[=[kv|vk]] [--numonly[=derivative,diff|abs,absolute,actual]]",
        " [--char=<barChars>|<substitutionString>]",
        " [--help] [--verbose]",
        " --keys=K every %d values added, prune hash to K keys (default 5000)" % (s.keyPruneInterval),
        " --char=C character(s) to use for histogram character, some substitutions follow:",
        " pl Use 1/3-width unicode partial lines to simulate 3x actual terminal width",
        " pb Use 1/8-width unicode partial blocks to simulate 8x actual terminal width",
        " ba (▬) Bar",
        " bl (Ξ) Building",
        " em (—) Emdash",
        " me (⋯) Mid-Elipses",
        " di (♦) Diamond",
        " dt (•) Dot",
        " sq (□) Square",
        " --color colourise the output",
        " --graph[=G] input is already key/value pairs. vk is default:",
        " kv input is ordered key then value",
        " vk input is ordered value then key",
        " --height=N height of histogram, headers non-inclusive, overrides --size",
        " --help get help",
        " --logarithmic logarithmic graph",
        " --match=RE only match lines (or tokens) that match this regexp, some substitutions follow:",
        " word ^[A-Z,a-z]+$ - tokens/lines must be entirely alphabetic",
        " num ^\\d+$ - tokens/lines must be entirely numeric",
        " --numonly[=N] input is numerics, simply graph values without labels",
        " actual input is just values (default - abs, absolute are synonymous to actual)",
        " diff input monotonically-increasing, graph differences (of 2nd and later values)",
        " --palette=P comma-separated list of ANSI colour values for portions of the output",
        " in this order: regular, key, count, percent, graph. implies --color.",
        " --rcfile=F use this rcfile instead of ~/.distributionrc - must be first argument!",
        " --size=S size of histogram, can abbreviate to single character, overridden by --width/--height",
        # these dimensions mirror the presets applied by the Settings class
        " small 60x10",
        " medium 100x20",
        " large 140x35",
        " full terminal width x terminal height (approximately)",
        " --tokenize=RE split input on regexp RE and make histogram of all resulting tokens",
        " word [^\\w] - split on non-word characters like colons, brackets, commas, etc",
        " white \\s - split on whitespace",
        " --width=N width of the histogram report, N characters, overrides --size",
        " --verbose be verbose",
        "",
        "You can use single-characters options, like so: -h=25 -w=20 -v. You must still include the =",
        "",
        "Samples:",
        " du -sb /etc/* | %s --palette=0,37,34,33,32 --graph" % (scriptName),
        " du -sk /etc/* | awk '{print $2\" \"$1}' | %s --graph=kv" % (scriptName),
        " zcat /var/log/syslog*gz | %s --char=o --tokenize=white" % (scriptName),
        " zcat /var/log/syslog*gz | awk '{print $5}' | %s -t=word -m=word -h=15 -c=/" % (scriptName),
        " zcat /var/log/syslog*gz | cut -c 1-9 | %s --width=60 --height=10 --char=em" % (scriptName),
        " find /etc -type f | cut -c 6- | %s --tokenize=/ -w=90 -h=35 -c=dt" % (scriptName),
        " cat /usr/share/dict/words | awk '{print length($1)}' | %s -c=* -w=50 -h=10 | sort -n" % (scriptName),
        "",
    ]
    print("\n".join(usageLines))
# build the settings/reader/renderer objects and run the selected input mode
def main(argv):
    """
    Entry point: read stdin in the mode the settings select, then graph it.

    argv is accepted for the caller's convenience but is not consulted
    here; the Settings constructor reads sys.argv itself.
    """
    settings = Settings()
    reader = InputReader()
    renderer = Histogram()

    if not settings.graphValues and settings.numOnly != 'XXX':
        # --numonly was given: read_numerics prints its own graph, so
        # there is nothing left to do afterwards
        reader.read_numerics(settings, renderer)
        sys.exit(0)

    if settings.graphValues:
        # user passed g=vk or g=kv: counts are already tallied
        reader.read_pretallied_tokens(settings)
    else:
        # the original behaviour of distribution: tally lines/tokens
        reader.tokenize_input(settings)

    renderer.write_hist(settings, reader.tokenDict)
# remember the name the script was invoked as, for the usage text printed
# by doUsage(). this is captured at import time, before Settings() runs,
# because rcfile parsing inserts options at the front of sys.argv.
scriptName = sys.argv[0]
# only run when executed as a script, not when imported as a module
if __name__ == "__main__":
    main(sys.argv[1:])