2 # -*- coding: utf-8 -*-
3 # vim: set noexpandtab sw=4 ts=4:
5 # A recent battle with vim and a Go program finally settled this for me.
6 # Tabs for indent, spaces for formatting. If you change your shiftwidth and
7 # tabstop to different values and your code looks ugly, say aloud: tabs
8 # for indent, spaces for formatting.
11 Generate Graphs Directly in the (ASCII- or Unicode-based) Terminal
13 If you find yourself typing:
14 [long | list | of | commands | sort | uniq -c | sort -rn]
17 [| sort | uniq -c | sort -rn]
22 Then bask in the glory of your new-found data visualization. There are other
26 import math
,os
,re
,sys
,time
28 class Histogram(object):
30 Takes the tokenDict built in the InputReader class and goes through it,
31 printing a histogram for each of the highest height entries
def histogram_bar(self, s, histWidth, maxVal, barVal):
	"""Return the bar string for a single histogram row.

	Parameters:
		s          settings object; reads charWidth, graphChars,
		           histogramChar, unicodeMode and logarithmic
		histWidth  number of full-width cells available for the bar body
		maxVal     largest value in the histogram (scales to histWidth)
		barVal     value of the row being drawn

	Returns a string of int(scaled width) body characters followed by one
	trailing character: the "end" character for full-width bars, or a
	partial-width glyph from s.graphChars for high-resolution bars.

	A maxVal of 0 (or 1 in logarithmic mode, whose log is 0) and a barVal
	of 0 in logarithmic mode yield a zero-length body instead of raising
	ZeroDivisionError / ValueError.
	"""
	# NOTE(review): several branch headers of this method were missing from
	# the reviewed text; the conditions on charWidth / logarithmic below are
	# inferred from the surviving statements and comments - confirm.
	partialWidth = s.charWidth < 1

	if partialWidth:
		# first case is partial-width chars: the body is built from the
		# last (full) glyph, the tail glyph is chosen further down
		zeroChar = s.graphChars[-1]
		oneChar = ''
	elif len(s.histogramChar) > 1 and not s.unicodeMode:
		# two or more plain characters: first is the body, second the end
		zeroChar = s.histogramChar[0]
		oneChar = s.histogramChar[1]
	else:
		zeroChar = s.histogramChar
		oneChar = s.histogramChar

	# fraction of the available width this bar occupies
	if s.logarithmic:
		maxLog = math.log(maxVal) if maxVal > 0 else 0.0
		barLog = math.log(barVal) if barVal > 0 else 0.0
		fraction = barLog / maxLog if maxLog > 0 else 0.0
	else:
		fraction = barVal * 1.0 / maxVal if maxVal else 0.0

	# write out the full-width integer portion of the histogram
	scaledWidth = fraction * histWidth
	intWidth = int(scaledWidth)
	remainderWidth = scaledWidth - intWidth

	# write the zeroth character intWidth times...
	returnBar = zeroChar * intWidth

	# we always have at least one remaining char for histogram - if
	# we have full-width chars, then just print it, otherwise do a
	# calculation of how much remainder we need to print
	#
	# FIXME: The remainder partial char printed does not take into
	# account logarithmic scale (can humans notice?).
	if not partialWidth:
		returnBar += oneChar
	elif remainderWidth > s.charWidth:
		# this is high-resolution, so figure out which partial glyph
		# represents the remainder
		whichChar = int(remainderWidth / s.charWidth)
		returnBar += s.graphChars[whichChar]

	return returnBar
def write_hist(self, s, tokenDict):
	"""Print the histogram for the s.height most frequent keys of tokenDict.

	Parameters:
		s          settings object; reads height, width, verbose, startTime,
		           totalObjects, totalValues and the *Colour strings, and
		           writes totalValues (coerced to int) and endTime
		tokenDict  mapping of key -> count

	The header line and the verbose statistics go to stderr; one row per
	key (key, count, percentage of s.totalValues, bar) goes to stdout.
	Falsy keys (e.g. the empty string) are not displayed. When
	s.totalValues is 0 the percentage is reported as 0.00% instead of
	raising ZeroDivisionError.
	"""
	# NOTE(review): the initialisers, the "if k:" guards and the loop exits
	# were missing from the reviewed text and are inferred from the
	# surviving comments and statements - confirm against the original.
	s.totalValues = int(s.totalValues)

	# given a dict, create a comparison tuple that sorts first by the value
	# of a key, then by the key itself in case of a tie. this allows us to
	# create deterministic sorts when we have multiple entries in our
	# histogram with the same frequency.
	def value_key_compare(counts):
		return lambda key: (counts.get(key), key)

	def percent(count):
		share = count * 1.0 / s.totalValues * 100 if s.totalValues else 0.0
		return "(%2.2f%%)" % share

	# pick the highest-count keys, tracking the widest key and largest value
	outputDict = {}
	maxTokenLen = 0
	maxVal = 0
	for k in sorted(tokenDict, key=value_key_compare(tokenDict), reverse=True):
		if not k:
			continue
		outputDict[k] = tokenDict[k]
		maxTokenLen = max(maxTokenLen, len(str(k)))
		maxVal = max(maxVal, outputDict[k])
		if len(outputDict) >= s.height:
			break

	s.endTime = int(time.time() * 1000)
	totalMillis = s.endTime - s.startTime
	if s.verbose:
		sys.stderr.write("tokens/lines examined: {:,d}".format(s.totalObjects) + "\n")
		sys.stderr.write(" tokens/lines matched: {:,d}".format(s.totalValues) + "\n")
		sys.stderr.write(" histogram keys: {:,d}".format(len(tokenDict)) + "\n")
		sys.stderr.write(" runtime: {:,.2f}ms".format(totalMillis) + "\n")

	sortedOutput = sorted(outputDict, key=value_key_compare(outputDict), reverse=True)
	if not sortedOutput:
		return

	# the first (largest) entry determines the column widths
	largest = outputDict[sortedOutput[0]]
	maxValueWidth = len("%s" % largest)
	maxPctWidth = len(percent(largest))

	# we always output a single histogram char at the end, so
	# we output one less than actual number here
	histWidth = s.width - (maxTokenLen+1) - (maxValueWidth+1) - (maxPctWidth+1) - 1

	# header goes to stderr so that it stays out of any downstream pipe
	sys.stderr.write("Key".rjust(maxTokenLen) + "|")
	sys.stderr.write("Ct".ljust(maxValueWidth) + " ")
	sys.stderr.write("(Pct)".ljust(maxPctWidth) + " ")
	sys.stderr.write("Histogram")

	# get ready for the output, but sorting gets hosed if we print the
	# colour code before the key, so put it on the line before
	sys.stderr.write(s.keyColour)
	sys.stderr.write("\n")

	lastIndex = len(sortedOutput) - 1
	for i, k in enumerate(sortedOutput):
		sys.stdout.write(str(k).rjust(maxTokenLen))
		sys.stdout.write(s.regularColour)
		sys.stdout.write("|")
		sys.stdout.write(s.ctColour)

		outVal = "%s" % outputDict[k]
		sys.stdout.write(outVal.rjust(maxValueWidth) + " ")

		sys.stdout.write(s.pctColour)
		sys.stdout.write(percent(outputDict[k]).rjust(maxPctWidth) + " ")

		sys.stdout.write(s.graphColour)
		sys.stdout.write(self.histogram_bar(s, histWidth, maxVal, outputDict[k]))

		if i == lastIndex:
			# put the terminal back into a normal-colour mode on last entry
			sys.stdout.write(s.regularColour)
		else:
			# we do these antics of printing $keyColour on the line before
			# the key so that piping output to sort will work
			sys.stdout.write(s.keyColour)
		sys.stdout.write("\n")
171 class InputReader(object):
Reads stdin and parses it into a dictionary in which each key maps to the
number of times that key appears in the input. The token frequency dict is
also pruned after a certain number of insertions to prevent OOME on large
datasets
def prune_keys(self, s):
	"""Shrink self.tokenDict to its s.maxKeys highest-count keys.

	Parameters:
		s  settings object; reads maxKeys and increments numPrunes

	Falsy keys (e.g. the empty string) are dropped. Ties are resolved in
	the dict's existing iteration order, exactly as a stable descending
	sort would. The previous loop only stopped once the counter exceeded
	maxKeys, so it kept maxKeys + 1 entries; this keeps exactly maxKeys.
	"""
	# function-scope import: heapq is not among the file's top-level imports
	import heapq

	# NOTE(review): skipping falsy keys mirrors the "if k:" guard used when
	# the histogram is written; the guard line itself was missing from the
	# reviewed text - confirm.
	candidates = (k for k in self.tokenDict if k)
	survivors = heapq.nlargest(s.maxKeys, candidates, key=self.tokenDict.get)
	self.tokenDict = {k: self.tokenDict[k] for k in survivors}

	# NOTE(review): presumably this method owns the prune counter; if the
	# caller also increments s.numPrunes, drop one of the two - confirm.
	s.numPrunes += 1
def tokenize_input(self, s):
	"""Read stdin and tally tokens (or whole lines) into self.tokenDict.

	Parameters:
		s  settings object; reads tokenize, matchRegexp, statInterval,
		   keyPruneInterval and verbose; the preset names in tokenize /
		   matchRegexp are replaced in place by their regexps; updates
		   totalObjects and totalValues

	Every token (or line, when no tokenizer is configured) bumps
	s.totalObjects; those accepted by the match regexp also bump
	s.totalValues and their count in self.tokenDict. Once
	s.keyPruneInterval accepted items have accumulated, self.prune_keys(s)
	is called. In verbose mode a progress line is written to stderr at
	most every s.statInterval seconds.
	"""
	# NOTE(review): the mode switch, the counters and the match filter were
	# missing from the reviewed text; they are inferred from the surviving
	# comments ("user desires to break line into tokens" / "user just wants
	# every line to be a token") and from how the counters are reported
	# elsewhere in the file - confirm, in particular match() vs search().

	# how to split the input... typically we split on whitespace or
	# word boundaries, but the user can specify any regexp
	splitPresets = {'white': r'\s+', 'word': r'\W'}
	s.tokenize = splitPresets.get(s.tokenize, s.tokenize)

	# how to match (filter) the input... typically we want either
	# all-alpha or all-numeric, but again, user can specify
	# NOTE(review): the 'word' class also admits a literal comma - looks
	# unintended, but the usage text advertises this exact pattern.
	matchPresets = {'word': r'^[A-Z,a-z]+$', 'num': r'^\d+$', 'number': r'^\d+$'}
	s.matchRegexp = matchPresets.get(s.matchRegexp, s.matchRegexp)

	# docs say these are cached, but i got about 2x speed boost
	# from doing the compile
	pt = re.compile(s.tokenize) if s.tokenize else None
	pm = re.compile(s.matchRegexp)

	nextStat = time.time() + s.statInterval
	pruneObjects = 0

	for line in sys.stdin:
		line = line.rstrip('\n')

		# either break the line into tokens or treat the line as one token
		candidates = pt.split(line) if pt is not None else (line,)
		for token in candidates:
			s.totalObjects += 1
			if not pm.match(token):
				continue
			s.totalValues += 1
			pruneObjects += 1
			# self.tokenDict is looked up afresh each time because
			# prune_keys() rebinds it to a new dict
			self.tokenDict[token] = self.tokenDict.get(token, 0) + 1

		# prune the hash if it gets too large
		# NOTE(review): s.numPrunes is presumably maintained by
		# prune_keys(); it is not touched here - confirm.
		if pruneObjects >= s.keyPruneInterval:
			self.prune_keys(s)
			pruneObjects = 0

		if s.verbose and time.time() > nextStat:
			sys.stderr.write("tokens/lines examined: {:,d} ; hash prunes: {:,d}...".format(s.totalObjects, s.numPrunes) + chr(13))
			nextStat = time.time() + s.statInterval
def read_pretallied_tokens(self, s):
	"""Read already-counted key/value lines from stdin into self.tokenDict.

	The input is a series of keys with the frequency of each key
	precomputed, as in "du -sb". s.graphValues selects the layout:
		'vk'  the number comes first, the key second
		'kv'  the key comes first, the number second
	Any other value reads nothing.

	Parameters:
		s  settings object; reads graphValues, adds each count to
		   totalValues and counts accepted lines in totalObjects

	Lines that do not fit the selected layout are reported on stderr and
	discarded. A key that appears more than once keeps its last count.
	"""
	vk = re.compile(r'^\s*(\d+)\s+(.+)$')
	kv = re.compile(r'^(.+?)\s+(\d+)$')

	# (pattern, group holding the key, group holding the count, complaint)
	if s.graphValues == 'vk':
		layout = (vk, 2, 1, " E Input malformed+discarded (perhaps pass -g=kv?): %s\n")
	elif s.graphValues == 'kv':
		layout = (kv, 1, 2, " E Input malformed+discarded (perhaps pass -g=vk?): %s\n")
	else:
		return
	pattern, keyGroup, countGroup, complaint = layout

	for line in sys.stdin:
		m = pattern.match(line)
		if m is None:
			# strip the newline so the diagnostic is not followed by a
			# blank line
			sys.stderr.write(complaint % line.rstrip('\n'))
			continue
		count = int(m.group(countGroup))
		self.tokenDict[m.group(keyGroup)] = count
		s.totalValues += count
		# NOTE(review): the line(s) between the totalValues update and the
		# error handler were missing from the reviewed text; counting the
		# accepted line in totalObjects is presumed - confirm.
		s.totalObjects += 1
271 def read_numerics(self
, s
, h
):
272 # in this special mode, we print out the histogram here instead
273 # of later - because it's a far simpler histogram without all the
274 # totals, percentages, etc of the real histogram. we're just
275 # showing a graph of a series of numbers
281 for line
in sys
.stdin
:
283 line
= float(line
.rstrip())
288 if s
.numOnly
== 'mon':
289 if s
.totalObjects
> 0:
290 graphVal
= line
- lastVal
295 if graphVal
> maxVal
:
297 maxWidth
= len(str(graphVal
))
299 sumVal
+= int(graphVal
)
301 if s
.totalObjects
> 0:
302 outList
.append(graphVal
)
305 # simple graphical output
307 sys
.stdout
.write(s
.keyColour
)
308 sys
.stdout
.write(str(int(k
)).rjust(maxWidth
))
309 pct
= "(%2.2f%%)" % (float(k
) / float(sumVal
) * 100)
310 sys
.stdout
.write(s
.pctColour
)
311 sys
.stdout
.write(pct
.rjust(9) + " ")
312 sys
.stdout
.write(s
.graphColour
)
313 sys
.stdout
.write(h
.histogram_bar(s
, s
.width
- 11 - maxWidth
, maxVal
, k
) + "\n")
314 sys
.stdout
.write(s
.regularColour
)
317 class Settings(object):
320 self
.startTime
= int(time
.time() * 1000)
326 self
.histogramChar
= '-'
327 self
.colourisedOutput
= False
328 self
.logarithmic
= False
331 self
.graphValues
= ''
334 # by default, everything matches (nothing is stripped out)
335 self
.matchRegexp
= '.'
336 # how often to give status if verbose
337 self
.statInterval
= 1.0
339 # for colourised output
340 self
.colourPalette
= '0,0,32,35,34'
341 self
.regularColour
= ""
345 self
.graphColour
= ""
347 self
.totalObjects
= 0
349 # every keyPruneInterval keys, prune the hash to maxKeys top keys
350 self
.keyPruneInterval
= 1500000
352 # for advanced graphing
353 self
.unicodeMode
= False
356 self
.partialBlocks
= ["▏", "▎", "▍", "▌", "▋", "▊", "▉", "█"] # char=pb
357 self
.partialLines
= ["╸", "╾", "━"] # char=hl
359 # rcfile grabbing/parsing if specified
360 if len(sys
.argv
) > 1 and '--rcfile' in sys
.argv
[1]:
361 rcFile
= sys
.argv
[1].split('=')[1]
362 rcFile
= os
.path
.expanduser(rcFile
)
364 rcFile
= os
.environ
.get('HOME') + '/.distributionrc'
366 # parse opts from the rcFile if it exists
368 rcfileOptList
= open(rcFile
).readlines()
369 for rcOpt
in rcfileOptList
:
370 rcOpt
= rcOpt
.rstrip()
371 rcOpt
= rcOpt
.split('#')[0]
373 sys
.argv
.insert(0, rcOpt
)
375 # don't die or in fact do anything if rcfile doesn't exist
378 # manual argument parsing easier than getopts IMO
380 if arg
in ('-h', '--help'):
383 elif arg
in ("-c", "--color", "--colour"):
384 self
.colourisedOutput
= True
385 elif arg
in ("-g", "--graph"):
386 # can pass --graph without option, will default to value/key ordering
387 # since Unix prefers that for piping-to-sort reasons
388 self
.graphValues
= 'vk'
389 elif arg
in ("-l", "--logarithmic"):
390 self
.logarithmic
= True
391 elif arg
in ("-n", "--numonly"):
393 elif arg
in ("-v", "--verbose"):
396 argList
= arg
.split('=', 1)
397 if argList
[0] in ("-w", "--width"):
398 self
.widthArg
= int(argList
[1])
399 elif argList
[0] in ("-h", "--height"):
400 self
.heightArg
= int(argList
[1])
401 elif argList
[0] in ("-k", "--keys"):
402 self
.maxKeys
= int(argList
[1])
403 elif argList
[0] in ("-c", "--char"):
404 self
.histogramChar
= argList
[1]
405 elif argList
[0] in ("-g", "--graph"):
406 self
.graphValues
= argList
[1]
407 elif argList
[0] in ("-n", "--numonly"):
408 self
.numOnly
= argList
[1]
409 elif argList
[0] in ("-p", "--palette"):
410 self
.colourPalette
= argList
[1]
411 self
.colourisedOutput
= True
412 elif argList
[0] in ("-s", "--size"):
413 self
.size
= argList
[1]
414 elif argList
[0] in ("-t", "--tokenize"):
415 self
.tokenize
= argList
[1]
416 elif argList
[0] in ("-m", "--match"):
417 self
.matchRegexp
= argList
[1]
419 # first, size, which might be further overridden by width/height later
420 if self
.size
in ("full", "fl", "f"):
421 # tput will tell us the term width/height even if input is stdin
422 self
.width
, self
.height
= os
.popen('echo "`tput cols` `tput lines`"', 'r').read().split()
423 # convert to numerics from string
424 self
.width
= int(self
.width
)
425 self
.height
= int(self
.height
) - 3
426 # need room for the verbosity output
427 if self
.verbose
== True: self
.height
-= 4
428 # in case tput went all bad, ensure some minimum size
429 if self
.width
< 40: self
.width
= 40
430 if self
.height
< 10: self
.height
= 10
431 elif self
.size
in ("small", "sm", "s"):
434 elif self
.size
in ("medium", "med", "m"):
437 elif self
.size
in ("large", "lg", "l"):
441 # synonyms "monotonically-increasing": derivative, difference, delta, increasing
442 # so all "d" "i" and "m" words will be graphing those differences
443 if self
.numOnly
[0] in ('d', 'i', 'm'): self
.numOnly
= 'mon'
444 # synonyms "actual values": absolute, actual, number, normal, noop,
445 # so all "a" and "n" words will graph straight up numbers
446 if self
.numOnly
[0] in ('a', 'n'): self
.numOnly
= 'abs'
448 # override variables if they were explicitly given
449 if self
.widthArg
!= 0: self
.width
= self
.widthArg
450 if self
.heightArg
!= 0: self
.height
= self
.heightArg
452 # maxKeys should be at least a few thousand greater than height to reduce odds
453 # of throwing away high-count values that appear sparingly in the data
454 if self
.maxKeys
< self
.height
+ 3000:
455 self
.maxKeys
= self
.height
+ 3000
456 if self
.verbose
: sys
.stderr
.write("Updated maxKeys to %d (height + 3000)\n" % self
.maxKeys
)
459 if self
.colourisedOutput
== True:
460 cl
= self
.colourPalette
.split(',')
461 # ANSI color code is ESC+[+NN+m where ESC=chr(27), [ and m are
462 # the literal characters, and NN is a two-digit number, typically
463 # from 31 to 37 - why is this knowledge still useful in 2014?
464 cl
= [chr(27) + '[' + e
+ 'm' for e
in cl
]
465 (self
.regularColour
, self
.keyColour
, self
.ctColour
, self
.pctColour
, self
.graphColour
) = cl
467 # some useful ASCII-->utf-8 substitutions
468 if self
.histogramChar
== "ba": self
.unicodeMode
= True; self
.histogramChar
= "▬"
469 elif self
.histogramChar
== "bl": self
.unicodeMode
= True; self
.histogramChar
= "Ξ"
470 elif self
.histogramChar
== "em": self
.unicodeMode
= True; self
.histogramChar
= "—"
471 elif self
.histogramChar
== "me": self
.unicodeMode
= True; self
.histogramChar
= "⋯"
472 elif self
.histogramChar
== "di": self
.unicodeMode
= True; self
.histogramChar
= "♦"
473 elif self
.histogramChar
== "dt": self
.unicodeMode
= True; self
.histogramChar
= "•"
474 elif self
.histogramChar
== "sq": self
.unicodeMode
= True; self
.histogramChar
= "□"
476 # sub-full character width graphing systems
477 if self
.histogramChar
== "pb":
478 self
.charWidth
= 0.125;
479 self
.graphChars
= self
.partialBlocks
480 elif self
.histogramChar
== "pl":
481 self
.charWidth
= 0.3334;
482 self
.graphChars
= self
.partialLines
484 # detect whether the user has passed a multibyte unicode character directly as the histogram char
485 if ord(self
.histogramChar
[0]) >= 128:
486 self
.unicodeMode
= True
490 print("usage: <commandWithOutput> | %s" % (scriptName
))
491 print(" [--rcfile=<rcFile>]")
492 print(" [--size={sm|med|lg|full} | --width=<width> --height=<height>]")
493 print(" [--color] [--palette=r,k,c,p,g]")
494 print(" [--tokenize=<tokenChar>]")
495 print(" [--graph[=[kv|vk]] [--numonly[=derivative,diff|abs,absolute,actual]]")
496 print(" [--char=<barChars>|<substitutionString>]")
497 print(" [--help] [--verbose]")
498 print(" --keys=K every %d values added, prune hash to K keys (default 5000)" % (s
.keyPruneInterval
))
499 print(" --char=C character(s) to use for histogram character, some substitutions follow:")
500 print(" pl Use 1/3-width unicode partial lines to simulate 3x actual terminal width")
501 print(" pb Use 1/8-width unicode partial blocks to simulate 8x actual terminal width")
503 print(" bl (Ξ) Building")
504 print(" em (—) Emdash")
505 print(" me (⋯) Mid-Elipses")
506 print(" di (♦) Diamond")
508 print(" sq (□) Square")
509 print(" --color colourise the output")
510 print(" --graph[=G] input is already key/value pairs. vk is default:")
511 print(" kv input is ordered key then value")
512 print(" vk input is ordered value then key")
513 print(" --height=N height of histogram, headers non-inclusive, overrides --size")
514 print(" --help get help")
515 print(" --logarithmic logarithmic graph")
516 print(" --match=RE only match lines (or tokens) that match this regexp, some substitutions follow:")
517 print(" word ^[A-Z,a-z]+\$ - tokens/lines must be entirely alphabetic")
518 print(" num ^\\d+\$ - tokens/lines must be entirely numeric")
519 print(" --numonly[=N] input is numerics, simply graph values without labels")
520 print(" actual input is just values (default - abs, absolute are synonymous to actual)")
521 print(" diff input monotonically-increasing, graph differences (of 2nd and later values)")
522 print(" --palette=P comma-separated list of ANSI colour values for portions of the output")
523 print(" in this order: regular, key, count, percent, graph. implies --color.")
524 print(" --rcfile=F use this rcfile instead of ~/.distributionrc - must be first argument!")
525 print(" --size=S size of histogram, can abbreviate to single character, overridden by --width/--height")
526 print(" small 40x10")
527 print(" medium 80x20")
528 print(" large 120x30")
529 print(" full terminal width x terminal height (approximately)")
530 print(" --tokenize=RE split input on regexp RE and make histogram of all resulting tokens")
531 print(" word [^\\w] - split on non-word characters like colons, brackets, commas, etc")
532 print(" white \\s - split on whitespace")
533 print(" --width=N width of the histogram report, N characters, overrides --size")
534 print(" --verbose be verbose")
536 print("You can use single-characters options, like so: -h=25 -w=20 -v. You must still include the =")
539 print(" du -sb /etc/* | %s --palette=0,37,34,33,32 --graph" % (scriptName
))
540 print(" du -sk /etc/* | awk '{print $2\" \"$1}' | %s --graph=kv" % (scriptName
))
541 print(" zcat /var/log/syslog*gz | %s --char=o --tokenize=white" % (scriptName
))
542 print(" zcat /var/log/syslog*gz | awk '{print \$5}' | %s -t=word -m-word -h=15 -c=/" % (scriptName
))
543 print(" zcat /var/log/syslog*gz | cut -c 1-9 | %s -width=60 -height=10 -char=em" % (scriptName
))
544 print(" find /etc -type f | cut -c 6- | %s -tokenize=/ -w=90 -h=35 -c=dt" % (scriptName
))
545 print(" cat /usr/share/dict/words | awk '{print length(\$1)}' | %s -c=* -w=50 -h=10 | sort -n" % (scriptName
))
548 # simple argument parsing and call top-level routines
550 # instantiate our classes
556 # user passed g=vk or g=kv
557 i
.read_pretallied_tokens(s
)
558 elif s
.numOnly
!= 'XXX':
559 # s.numOnly was specified by the user
560 i
.read_numerics(s
, h
)
561 # read_numerics will have output a graph already, so exit
564 # this is the original behaviour of distribution
567 h
.write_hist(s
, i
.tokenDict
)
569 # what is this magic?
570 scriptName
= sys
.argv
[0]
571 if __name__
== "__main__":