3 import os
, sys
, hashlib
4 import sqlite3
, getopt
, mmap
7 #from os.path import join, getsize
8 from datetime
import datetime
# Unix epoch as a naive datetime in UTC.  Built directly rather than with
# datetime.utcfromtimestamp(0), which is deprecated since Python 3.12; the
# resulting value is identical.
epoch = datetime(1970, 1, 1)


def epoch_seconds(dt):
    """Return the number of seconds between the Unix epoch and *dt*.

    *dt* must be a naive datetime: ``epoch`` is naive, and subtracting a
    naive datetime from a timezone-aware one raises ``TypeError``.
    The result is a float and is negative for datetimes before the epoch.
    """
    return (dt - epoch).total_seconds()
BUF_SIZE = 1024 * 1024  # read files in 1 MiB (1048576-byte) chunks
16 def sha1file(fname
=None, blocksize
=BUF_SIZE
):
20 with
open(fname
, 'rb') as f
:
21 with mmap
.mmap(f
.fileno(), 0, prot
=mmap
.PROT_READ
) as mm
:
22 for block
in iter(lambda: mm
.read(blocksize
), b
""):
24 return sha1
.hexdigest()
26 class Config
: # https://stackoverflow.com/a/47016739/159695
27 def __init__(self
, **kwds
):
28 self
.verbose
=0 # -1=quiet 0=norm 1=noisy
29 self
.mode
=0 # 0:Test, 1:Create
33 self
.startpoint
= ''.join(['.',os
.sep
])
34 self
.__dict
__.update(kwds
) # Must be last to accept assigned member variable.
36 args
= ['%s=%s' % (k
, repr(v
)) for (k
,v
) in vars(self
).items()]
37 return '%s(%s)' % ( self
.__class
__.__qualname
__, ', '.join(args
) )
39 def pverbose(s
,nl
='\n'):
41 sys
.stdout
.write(s
+nl
)
43 if config
.verbose
>=0 or config
.verbose
==-3:
44 sys
.stdout
.write(s
+nl
)
def perror(s, nl='\n'):
    """Write *s* followed by *nl* to stderr unless errors are silenced.

    Nothing is written (and stdout is not flushed) when ``config.verbose``
    is below -1.
    """
    if config.verbose < -1:
        return
    # Flush stdout first to avoid an inconsistent screen state when stdout
    # still holds unflushed data.
    sys.stdout.flush()
    sys.stderr.write(s + nl)
def printusage(err=0):
    """Print the command-line usage summary.

    The text is emitted through ``perror`` when *err* is truthy and through
    ``pinfo`` otherwise.
    """
    phelp = perror if err else pinfo
    phelp('Usage: gethashes [opts] [-p dir] [-T|-C] [-f file] [files...]')
    #phelp(' -T Test mode (default)')
    #phelp(' -C Create mode')
    #phelp(' -t <t> set type to <t> (%s, or auto(default))'%', '.join(sorted(hashlib.algorithms_available)))
    phelp(' -p <d> change to directory <d> before doing anything')
    phelp(' -f <f> use <f> as gzipped hash file (<d>.hash.gz)')
    #phelp('Options in Create mode:')
    phelp(' -s [s][k,m] load .sha1 files in subdirectories and skip older recorded files larger than [s] [*1024, *1048576] (default=1m)')
    phelp(' -a Always skip recorded files even if loaded .sha1 file is older')
    #phelp(' -1 Also create <f>.sha1 file')
    #phelp('Options in Test mode:')
    #phelp(' -b <l> Output list of bad files to file <l>')
    #phelp('Other Options:')
    phelp(' -v/-q verbose/quiet, change verbosity [-1,2]')
    phelp(' --help/-h show help')
    phelp(' --version show gethashes and module versions')
71 # https://stackoverflow.com/questions/635483/what-is-the-best-way-to-implement-nested-dictionaries/19829714#19829714
def __missing__(self, key):
    """Create, store and return a new empty container for an absent *key*.

    The new container is an instance of ``type(self)``, so a chained
    lookup such as ``d[a][b]`` creates each missing level on demand.
    """
    child = type(self)()
    self[key] = child
    # Return the local reference: faster than looking the key up again.
    return child
77 OldHashes
= Vividict()
# Line pattern adapted from `_foosum_rem` in `cfv` (around its line 1116):
# group 1 is a 40-digit hexadecimal digest, group 2 is the flag character
# (a space or '*') that follows one separating space, and group 3 is the
# file name up to the line ending.  No leading '^' is needed because the
# pattern is applied with `match()`, which only matches at the beginning
# of the string.
sha1rem = re.compile(r'([0-9a-fA-F]{40}) ([ *])([^\r\n]+)[\r\n]*$')
82 def loadsha1(root
,afile
):
84 rname
= os
.path
.join(root
,afile
)
85 mtime
= os
.path
.getmtime(rname
)
88 for line
in open(rname
, encoding
='utf-8', errors
='surrogateescape'):
89 x
= sha1rem
.match(line
)
93 pinfo('[!] Textmode in "%s".'%(rname))
96 iname
= os
.path
.join(root
,x
.group(3))
98 istat
= os
.stat(iname
)
99 except FileNotFoundError
:
101 pinfo('[!] Missing file from "%s".'%(rname))
104 if istat
.st_size
< config
.ssize
:
106 if not config
.skipNewer
:
107 itime
= os
.path
.getmtime(iname
)
108 #isize = os.path.getsize(iname)
109 #pprint.pprint(['t:',iname,mtime,itime])
112 OldHashes
[istat
.st_dev
][istat
.st_ino
] = x
.group(1)
114 pinfo('[!] Textmode %d times in "%s" !'%(itextmode
,rname
))
116 pinfo('[!] Missing %d times in "%s" !'%(imissing
,rname
))
124 argv
[0] = ''.join([argv
[0].rstrip(os
.sep
),os
.sep
])
125 #pprint.pprint(argv) # <-- DEBUG
128 opts
, args
= getopt
.gnu_getopt(argv
, "CTf:p:s:a1b:vqh?", ['help','version'])
129 except getopt
.GetoptError
as err
:
130 # print help information and exit:
131 print(err
) # will print something like "option -a not recognized"
138 config
.startpoint
= ''.join([a
.rstrip(os
.sep
),os
.sep
])
139 os
.chdir(a
) # also checks PermissionError and FileNotFoundError for me
143 config
.ssize
= human2bytes(a
)
149 if config
.verbose
>=0: config
.verbose
+=1
151 if config
.verbose
>=0:
155 elif o
in ("-h", "--help", '-?'):
157 elif o
=='-V' or o
=='--version':
158 print('gethashes %s'%version
)
159 print('python %08x-%s'%(sys
.hexversion
,sys
.platform
))
162 assert False, "unhandled option"
164 except RuntimeError as e
:
167 if not hasattr(config
, 'hashfile'):
168 dirName
= os
.path
.basename(os
.path
.abspath(config
.startpoint
))
169 #dirName = os.path.basename(os.getcwd())
170 config
.hashfile
= ''.join([config
.startpoint
, dirName
, '.hash.gz'])
171 pprint
.pprint(config
) # <-- DEBUG
175 f_out
= gzip
.open(config
.hashfile
, 'wt', encoding
='utf-8', errors
='surrogateescape')
177 for root
, dirs
, files
in os
.walk(config
.startpoint
): # os.walk(top, topdown=True, onerror=None, followlinks=False)
179 dirs
.remove('@eaDir') # don't visit "@eaDir" directories
180 dirs
.sort(reverse
=True)
181 files
.sort(reverse
=True)
182 relroot
= root
.rpartition(config
.startpoint
)[2]
183 #if not relroot: relroot = '.'
187 if afile
.endswith(".sha1"):
188 rname
= os
.path
.join(root
,afile
)
189 pinfo('[!]Loading SHA1 hash from "%s"'%rname
)
192 rname
= os
.path
.join(root
,afile
)
194 if os
.path
.samefile(rname
,config
.hashfile
):
196 except FileNotFoundError
:
197 pinfo('[!] FileNotFound:"%s".'%rname
)
199 #fname = os.sep.join(filter(None,[relroot,afile]))
200 fname
= os
.path
.join(relroot
,afile
)
201 istat
= os
.stat(rname
)
202 if istat
.st_size
== 0:
204 elif (istat
.st_dev
in OldHashes
) and (istat
.st_ino
in OldHashes
[istat
.st_dev
]):
205 ihash
= OldHashes
[istat
.st_dev
][istat
.st_ino
]
208 ihash
= sha1file(rname
)
209 #pprint.pprint(('O:',fname,rname,ihash,HitHashes))
210 f_out
.write('%s *%s\n'%(ihash
,fname
))
211 #mtime = os.path.getmtime(rname)
212 #stime = datetime.utcfromtimestamp(mtime).strftime('%Y%m%du%H%M%S')
213 #rtime = datetime.strptime(''.join([stime,'UTC']),'%Y%m%du%H%M%S%Z')
214 #print(rname,fname,stime,mtime,epoch_seconds(rtime))
215 #fsize = os.path.getsize(rname)
216 #hsize = bytes2human(fsize)
217 #print(fname,hsize,stime,sha1file(rname),sep='\t')
220 pinfo('[!]Skipped hashing of %d recorded file(s).'%HitHashes
)
221 pinfo('\n[!]Done. Test with `cfv -p %s -f %s`.'%(config
.startpoint
,config
.hashfile
))
223 # https://github.com/giampaolo/pyftpdlib/blob/0430c92e9d852a6d175b489c0ebf17fbc0190914/scripts/ftpbench#L139
224 def bytes2human(n
, format
="%(value).1f%(symbol)s", intfmt
="%(value).0f %(symbol)s"):
226 >>> bytes2human(10000)
228 >>> bytes2human(100001221)
231 symbols
= ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
233 for i
, s
in enumerate(symbols
[1:]):
234 prefix
[s
] = 1 << (i
+ 1) * 10
235 for symbol
in reversed(symbols
[1:]):
236 if n
>= prefix
[symbol
]:
237 value
= float(n
) / prefix
[symbol
]
238 return format
% locals()
240 #intfmt=re.sub(r'\(value\)\.(\d+)',r'(value).0',format)
242 return intfmt
% dict(symbol
=symbols
[0], value
=n
)
244 # http://goo.gl/zeJZl
247 >>> human2bytes('1M')
249 >>> human2bytes('1G')
252 symbols
= ('B', 'K', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y')
253 letter
= s
[-1:].strip().upper()
255 assert num
.isdigit() and letter
in symbols
, s
257 prefix
= {symbols
[0]: 1}
258 for i
, s
in enumerate(symbols
[1:]):
259 prefix
[s
] = 1 << (i
+ 1) * 10
260 return int(num
* prefix
[letter
])
263 if __name__
== '__main__':
def isBlank(myString):
    """Return True when *myString* is falsy or contains only whitespace."""
    if not myString:
        # None and the empty string both count as blank.
        return True
    return not myString.strip()
def isNotBlank(myString):
    """Return True when *myString* has at least one non-whitespace character.

    Falsy values such as ``None`` and the empty string yield False.
    """
    if not myString:
        return False
    return bool(myString.strip())