## echo | compresslog /path/to/my/log
import sys, os, errno, fcntl, signal, select, subprocess
import gzip, zlib, syslog, time

from optparse import OptionParser
from datetime import datetime
class logging:
    # minimal syslog-backed stand-in for the stdlib logging module;
    # the level values are an assumption, mirroring the stdlib ones
    DEBUG = 10
    INFO = 20
    WARN = 30
    ERROR = 40
    CRITICAL = 50

    def __init__(self, tagname, use_syslog=True):
        self.loglevel = self.WARN
        self.syslog = use_syslog
        if self.syslog:
            syslog.openlog(tagname)

    def setLevel(self, loglevel):
        self.loglevel = loglevel

    def warn(self, msg):
        if self.WARN >= self.loglevel:
            self._logit(msg)

    def error(self, msg):
        if self.ERROR >= self.loglevel:
            self._logit(msg)

    def info(self, msg):
        if self.INFO >= self.loglevel:
            self._logit(msg)

    def debug(self, msg):
        if self.DEBUG >= self.loglevel:
            self._logit(msg)

    def critical(self, msg):
        if self.CRITICAL >= self.loglevel:
            self._logit(msg)

    def _logit(self, msg):
        # when not using syslog (debug mode) everything goes to stderr
        if self.syslog:
            syslog.syslog(msg)
        else:
            sys.stderr.write(msg + '\n')

class parse_stats:
    def __init__(self, logfmt):
        self._parse_line = apache_log_parser.make_parser(logfmt)

        self.ajax_mimetypes = set(['application/ajax', 'text/xml'])

        # per-period counters and accumulators
        self.ct_hits = 0
        self.ct_ajax_hits = 0
        self.ct_pviews = 0
        self.ls_users = set()
        self.pf_pviews_time = list()
    def parse_line(self, logline):
        try:
            log_parsed = self._parse_line(logline)
        except Exception:
            logger.error("could not parse line: %s" % logline)
            return

        # only successful requests count
        if log_parsed['status'] != '200':
            return

        self.ct_hits = self.ct_hits + 1
        if log_parsed['request_method'] == 'POST':
            if log_parsed['response_header_content_type'] in self.ajax_mimetypes:
                self.ct_ajax_hits = self.ct_ajax_hits + 1
            if log_parsed['response_header_content_type'] == 'text/plain':
                if log_parsed['request_url'].find('ajax') > -1:
                    self.ct_ajax_hits = self.ct_ajax_hits + 1

        # only full HTML pages count as pageviews
        if log_parsed['response_header_content_type'] != 'text/html':
            return

        self.ct_pviews = self.ct_pviews + 1
        self.pf_pviews_time.append(int(log_parsed['time_us']))
        user = log_parsed['note_moodleuser']
        if user != '-':  # skip unauthenticated hits (reconstructed guard)
            self.ls_users.add(user)
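
    # For orientation (illustrative sample, not from the original source):
    # with the log format configured below, apache_log_parser yields
    # dicts along the lines of
    #   {'status': '200', 'request_method': 'GET', 'time_us': '123456',
    #    'response_header_content_type': 'text/html',
    #    'request_url': '/course/view.php', 'note_moodleuser': 'jdoe', ...}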
    def percentile(self, N, P):
        """
        http://stackoverflow.com/questions/2374640/how-do-i-calculate-percentiles-with-python-numpy
        Find the percentile of a list of values

        @parameter N - A list of values. N must be sorted.
        @parameter P - A float value from 0.0 to 1.0

        @return - The percentile of the values.
        """
        n = int(round(P * len(N) + 0.5))
        return N[n - 1]
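
        # Worked example (illustrative, not from the original): for
        # N = [10, 20, 30, 40] and P = 0.5,
        #   n = int(round(0.5 * 4 + 0.5)) = int(round(2.5)) = 3
        # since Python 2 rounds halves away from zero, so N[n - 1] = N[2] = 30.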
    def log_stats(self, boundary):
        stats = {'boundary_epoch': boundary}

        stats['hits'] = self.ct_hits
        stats['ajax_hits'] = self.ct_ajax_hits

        stats['pviews'] = self.ct_pviews
        if self.pf_pviews_time:  # an idle period would otherwise divide by zero
            self.pf_pviews_time.sort()
            stats['pview_avg_us'] = sum(self.pf_pviews_time) / float(len(self.pf_pviews_time))
            stats['pview_50th_us'] = self.percentile(self.pf_pviews_time, 0.5)
            stats['pview_80th_us'] = self.percentile(self.pf_pviews_time, 0.8)
        stats['unique_users'] = len(self.ls_users)
        # passing separators explicitly gives us more compact encoding
        logger.info("stats %s" % json.dumps(stats, separators=(',', ':')))
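
        # The emitted syslog line looks roughly like this (values invented
        # for illustration; key order is not guaranteed):
        #   stats {"boundary_epoch":1420070400,"hits":1234,"ajax_hits":56,"pviews":789,
        #          "pview_avg_us":231000.5,"pview_50th_us":180000,"pview_80th_us":420000,"unique_users":42}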

parser = OptionParser()
parser.add_option('-d', '--debug', dest='debug', default=0, action='count',
                  help='Print debugging info to stderr, no output to syslog')
parser.add_option('--stats', dest='stats', default=False, action='store_true',
                  help='Parse and process stats, assuming input is a specially-formatted apache access log')
(opts, args) = parser.parse_args()

## Setup logging to syslog bright and early
logger = logging('www_stats', (opts.debug == 0))
logger.setLevel(logging.INFO)
if opts.debug:
    logger.setLevel(logging.DEBUG)

if opts.stats:
    # these libs add nontrivial memory footprint
    # import them only if we are going to use them...
    import apache_log_parser
    import json

    # TODO: Find a reasonable way to make this configurable...
    pstats = parse_stats('%h %l \"%{MOODLEUSER}n\" %t %D \"%{Content-Type}o\" \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"')
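
    ## The matching Apache side would look roughly like this (a sketch;
    ## the LogFormat nickname and paths are assumptions, not from the original):
    ##   LogFormat "%h %l \"%{MOODLEUSER}n\" %t %D \"%{Content-Type}o\" \"%r\" %>s %b \"%{Referer}i\" \"%{User-Agent}i\"" moodle_stats
    ##   CustomLog "|/usr/local/bin/compresslog --stats /var/log/apache2/access_log" moodle_stats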

fname = args[0]  # destination log path from the command line (reconstructed)

def errorprint(msg):
    sys.stderr.write(msg + '\n')

def alt_name(fname):
    # using microsecond (%f) to sidestep races
    datestr = datetime.now().strftime('_%Y%m%d-%H%M%S-%f')
    if fname.endswith('_log'):
        return fname[0:-4] + datestr + '_log'
    return fname + datestr
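
# Illustrative (not from the original): alt_name('access_log') returns
# something like 'access_20150101-120000-123456_log', while a name without
# the '_log' suffix simply gets the timestamp appended.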

## Work around several issues:
##
## - gzip files, due to their nature, cannot sustain two simultaneous
##   writers. Pure text log files can, thanks to promises from POSIX,
##   implemented in the kernel, about the integrity of writes of
##   newline-delimited data.
##
## - Apache 2.2 will sometimes open multiple piped loggers to the same
##   file. Apache 2.4 seems to only do it if the piped-logger config
##   string is different; v2.2 is less predictable on this.
##   This means several compresslog processes are started at the same
##   time and race to create the file.
while True:
    ## Try to open the file exclusively
    f = gzip.open(fname, 'a')
    try:
        fcntl.flock(f, fcntl.LOCK_EX | fcntl.LOCK_NB)
    except IOError as e:
        if e.errno == errno.EACCES or e.errno == errno.EAGAIN:
            # another compresslog instance holds the lock;
            # pick an alternate name and try again
            logger.warn("Cannot flock %s" % fname)
            fname = alt_name(fname)
            logger.warn("Retrying with %s" % fname)
            continue
        else:
            errorprint("Unexpected error in flock(): %u" % e.errno)
            sys.exit(1)
    # make sure we are appending to an empty or a gzip file
    # to avoid appending to a plaintext log
    if os.stat(fname)[6] != 0:  # stat field 6 is st_size
        mimetype = subprocess.Popen(['/usr/bin/file', '--mime-type', '-b', fname],
                                    stdout=subprocess.PIPE).communicate()[0]
        mimetype = mimetype.rstrip()
        if mimetype != 'application/x-gzip':
            mvfname = alt_name(fname)
            errorprint("Destination file exists and is not compressed (%s), renaming old file to %s"
                       % (mimetype, mvfname))
            os.rename(fname, mvfname)
            # release the lock, restart the process
            # unfortunately this will append a gzip header
            f.close()
            continue

    # success - out of the error handling loop
    break

sig_wants_flush = False
sig_wants_flushexit = False

def sig_flush(signum, frame):
    logger.debug('sig_flush')
    global sig_wants_flush
    sig_wants_flush = True

def sig_flushexit(signum, frame):
    logger.debug('sig_flushexit')
    global sig_wants_flushexit
    sig_wants_flushexit = True

## ensure we flush on various signals
# USR1/USR2: user asked to flush out
# HUP: apache is no longer writing to us (hung up)
signal.signal(signal.SIGUSR1, sig_flush)
signal.signal(signal.SIGUSR2, sig_flush)
signal.signal(signal.SIGHUP, sig_flushexit)
signal.signal(signal.SIGTERM, sig_flushexit)
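
# From outside, e.g. a logrotate postrotate script, a flush can be requested
# with something like (illustrative; matching by command line is an assumption):
#   pkill -USR1 -f compresslog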

# while True / select / read(int) / O_NONBLOCK put
# input buffering explicitly in our hands
fcntl.fcntl(sys.stdin, fcntl.F_SETFL, os.O_NONBLOCK)

# main-loop state (reconstructed; the PERIOD value is an assumption,
# the original defines its stats period elsewhere)
PERIOD = 300
statsboundary = False
buftail = ''
linecount = 0
at_eof = False

while True:
    try:
        # calculate next period boundary
        # and use that as timeout
        t = time.time()
        if (statsboundary == False) or (statsboundary < t):
            statsboundary = t - (t % PERIOD) + PERIOD
        timeout = statsboundary - t
        logger.debug('timeout in: %u s' % timeout)

        rfds = select.select([sys.stdin], [], [], timeout)[0]
        logger.debug('select: %u' % len(rfds))
        if len(rfds):  # only read() when select says
            logger.debug('read()')
            buf = sys.stdin.read(4096)
            logger.debug('read %u' % len(buf))
            # If we do get EINTR during write, in Python < 2.7.2
            # the write may be screwed/incomplete, but we
            # have no way to know & retry.
            # http://bugs.python.org/issue10956
            f.write(buf)

            if opts.stats:
                logger.debug('line tail length %u' % len(buftail))
                # stitch the tail of the previous read onto this chunk
                loglines = (buftail + buf).split('\n')
                # save the tail for later
                buftail = loglines.pop()
                for logline in loglines:
                    linecount = linecount + 1
                    pstats.parse_line(logline)
            if len(buf) == 0:  # empty read on a readable fd means EOF
                at_eof = True
                logger.debug('at_eof')

        if opts.stats:
            t = time.time()
            if t > statsboundary:
                pstats.log_stats(statsboundary)
    except select.error as e:
        if e[0] == errno.EINTR:
            logger.debug('E in select: %u' % e[0])
            continue  # on signal, restart at the top
        raise  # anything else is fatal
    except IOError as e:
        logger.debug('E in read() or write(): %u' % e[0])
        if e[0] == errno.EINTR:
            continue  # on signal, restart at the top
        raise
    if at_eof or sig_wants_flushexit:
        if opts.stats:
            # emit stats for the final, partial period
            pstats.log_stats(statsboundary)
        f.close()
        logger.debug('fh closed, exiting')
        break
    if sig_wants_flush:
        sig_wants_flush = False
        try:
            # a full flush makes the archive readable up to this point
            f.flush(zlib.Z_FULL_FLUSH)
            logger.debug('flush')
        except Exception:
            logger.debug('E in flush')
            # ignore exceptions, try to keep rolling
            pass