2 %w(md5 zcompress find htmlshrinker zcompress).each {|x| require x}
# Body of the index-entry decoder. Its `def` line is not visible in this
# excerpt; presumably `unpack(string)`, which the index writer calls on each
# bucket string. Applies the 'H32V4' template once per 32 bytes of input:
#   H32 -> 32 hex digits (16 raw bytes)
#   V4  -> four 32-bit little-endian unsigned integers (16 bytes)
# Returns one flat array holding five values per entry. Because the repeat
# count uses integer division, trailing bytes that do not fill a whole
# 32-byte entry are not decoded.
5 return string.unpack('H32V4' * (string.size/32))
# Encodes one index entry as a 32-byte binary string using the same 'H32V4'
# layout that the decoder reads back.
#   md5    - hex digest string; its first 32 hex digits become 16 raw bytes
#   bstart, bsize, start, size - integers, each written as a 32-bit
#            little-endian unsigned value ('V'); values that need more than
#            32 bits are not representable in this format
# Returns the packed String.
8 def pack(md5, bstart, bsize, start, size)
9 return [md5, bstart, bsize, start, size].pack('H32V4')
# Body of a helper whose `def` line is not visible here (presumably
# `md5subset(four)`, which the index writer uses to choose a bucket).
# Prefixes the first four characters of `four` with "0x" and lets sprintf's
# %d parse that string as hexadecimal, yielding an Integer in 0..0xffff for
# valid hex input.
13 sprintf("%d", "0x" + four[0..3]).to_i
# Shared HTMLShrinker instance; the page class calls its `compress` method on
# each file's contents. It receives the raw second command-line argument,
# which is nil when that argument is omitted (the "default" fallback used for
# the output file names is not applied here).
# NOTE(review): what HTMLShrinker does with this argument is defined in the
# required 'htmlshrinker' file, not here -- confirm that nil is acceptable.
16 HTMLSHRINKER = HTMLShrinker.new(ARGV[1])
# Fragment of the page class (instantiated by the indexing loop as
# Webpage.new(newfile, cur_block, buflocation)). The class header, the `end`
# keywords and several body lines are not visible in this excerpt.

# Read-only accessors for the page's state.
19 attr_reader :text, :compressed, :size, :compressed_size, :filename, :index_content, :block, :buflocation
# filename    - path of the file to read
# block       - number of the compressed block this page is assigned to
# buflocation - position value supplied by the caller, which advances it by
#               text.size after each page (presumably the page's offset
#               inside the uncompressed block buffer -- confirm)
21 def initialize(filename, block, buflocation)
# The stored text is the file's contents after HTMLSHRINKER.compress.
24 @text = HTMLSHRINKER.compress(File.read(filename))
26 # @index_content = index_content
27 @buflocation = buflocation
# NOTE(review): the dots in this pattern are unescaped, so it matches ANY
# character followed by "htm"/"html" at end of line (e.g. "fooXhtm");
# /\.html?$/i is probably what was intended. The `case` subject is not
# visible here (presumably the file name).
41 when /.htm$|.html$/i # get the file, strip all <> tags
# Removes the whole <head>...</head> section (case-insensitive, spanning
# newlines, non-greedy), then replaces every remaining <...> tag with a
# single space.
42 content = @text.gsub(/\<head>.*?\<\/head>/im,"").gsub(/\<.*?\>/m, " ")
# Fragment of the Block class (class header and method body are not visible
# in this excerpt). The indexing loop constructs it as
# Block.new(cur_block, location, bf_compr.size), i.e.:
#   number - sequence number of the compressed block
#   start  - value of `location` when the block was recorded (the running
#            offset the script advances by each compressed block's size)
#   size   - size of the compressed block
49 attr_reader :number, :start, :size
50 def initialize(number, start, size)
# Main indexing pass (top-level script; several lines, including the block's
# `end` and the code that appends to `buffer`, are not visible in this
# excerpt). Walks the directory given as ARGV[0], wraps each file in a
# Webpage, accumulates page text in `buffer`, and compresses the buffer into
# numbered blocks once it reaches 900000 in size.

# Running state: current block number, files seen, offset inside the current
# buffer, a size counter, and the uncompressed buffer itself.
59 cur_block, counter, buflocation, size, buffer = 0, 0, 0, 0, ""
# `location` tracks the offset of the next compressed block. It starts at 4
# because the index-start offset is later written as one 'V' value (4 bytes).
60 location = 4 # (to hold start of index)
# Base name for the output files; falls back to "default" without ARGV[1].
62 name = (ARGV[1] ? ARGV[1] : "default")
# NOTE(review): this message names "#{name}.zindex" and "#{name}.zferret",
# but the visible code opens "#{name}.zdump" and "#{name}.zlog" -- the
# message looks stale.
65 puts "Indexing files in #{ARGV[0]}/ and writing the file #{name}.zindex and directory #{name}.zferret."
# NOTE(review): opened in text mode ("w") although binary data is written;
# on platforms that translate newlines this would need "wb" -- confirm.
66 zdump = File.open("#{name}.zdump", "w")
# Paths matching this pattern are skipped; ARGV[2] overrides the default.
69 ignore = ARGV[2] ? Regexp.new(ARGV[2]) : /^(Bilde~|Bruker|Pembicaraan_Pengguna~)/
71 Find.find(ARGV[0]) do |newfile|
# Skip directories and unreadable entries.
72 next if File.directory?(newfile) || !File.readable?(newfile)
# NOTE(review): Find.find yields paths that begin with ARGV[0], while the
# default pattern is anchored with ^ on what look like file-name prefixes;
# as written it can only match when the path itself starts with one of them.
# Matching against File.basename(newfile) may have been intended -- confirm.
73 next if newfile =~ ignore
74 wf = Webpage.new(newfile, cur_block, buflocation)
# Progress message every 100 files: the float quotient equals the integer
# quotient only when counter is a multiple of 100.
75 puts "#{counter} files indexed." if counter.to_i / 100.0 == counter / 100
# Advance the in-buffer offset by the size of this page's text.
# NOTE(review): String#size counts characters on Ruby >= 1.9; if these
# offsets are meant to be byte positions, bytesize would be needed -- confirm.
78 buflocation += wf.text.size
# Keep accumulating pages until the buffer reaches the block threshold.
82 next if buffer.size < 900000
# Threshold reached: compress the buffer and record where the block lives.
84 bf_compr = ZCompress::compress(buffer)
86 block_ary[cur_block] = Block.new(cur_block, location, bf_compr.size)
90 location += bf_compr.size
91 puts "Writing block no #{cur_block}"
93 # ZFERRET << {:filename => wf.filename, :content => wf.index_content, :offset => location, :size => wf.compressed_size }
94 # location += wf.compressed_size
# After the walk: flush whatever is left in the buffer as a final block.
98 # to ensure last part of buffer is written
99 bf_compr = ZCompress::compress(buffer)
100 zdump.write(bf_compr)
101 block_ary[cur_block] = Block.new(cur_block, location, bf_compr.size)
102 location += bf_compr.size
# Index-writing pass (top-level script; the definitions of `t`, `pages`,
# `subindex`, `indexloc`, the loop over pages and all `end` keywords are not
# visible in this excerpt). Builds one packed entry per page, groups the
# entries into buckets keyed by the first four hex digits of the MD5 of the
# page's file name, then writes a table of (offset, size) slots followed by
# the bucket data.
104 # writing start of index
# Writes `location` (the offset just past the last compressed block) as one
# 32-bit little-endian value; presumably preceded by a seek to the start of
# the file, which is not visible here.
106 zdump.write([location].pack('V'))
107 puts "location #{location}"
108 puts "Finished, writing index. #{Time.now - t}"
# Per-page record: start and size of the block holding the page, plus the
# page's own position (the remaining keys are not visible in this excerpt).
112 pages[file.filename] = {:block_start => block_ary[file.block].start,
113 :block_size => block_ary[file.block].size,
114 :start => file.buflocation,
119 puts "Sorted onetime. #{Time.now - t}"
120 pages.each_pair do |x, y|
# NOTE(review): MD5.md5 comes from the legacy 'md5' library required at the
# top of the file; current Ruby versions provide Digest::MD5 instead.
121 md5 = MD5.md5(x).hexdigest
122 entry = pack(md5, y[:block_start], y[:block_size], y[:start], y[:size])
# The bucket number is the integer value of the first four hex digits of the
# digest; entries in the same bucket are concatenated into one string.
123 firstfour = md5subset(md5)
124 subindex[firstfour] = "" if subindex[firstfour].nil?
125 subindex[firstfour] << entry
128 puts "Sorted another time. #{Time.now - t}"
# Bucket data starts after the slot table, which uses 8 bytes per slot.
# NOTE(review): four hex digits give 65536 possible buckets (0..65535), but
# only 65535*8 bytes are reserved. The slot for bucket 65535 would then
# start exactly where the first bucket's data is placed. 65536*8 looks like
# the intended size -- confirm against the reader.
130 location = (65535*8) + indexloc
# NOTE(review): the local `p` shadows Kernel#p, and no close of this log file
# (or of zdump) is visible in this excerpt.
131 p = File.open(name + ".zlog",'w')
# NOTE(review): `entry.size` would raise for buckets that were never filled
# (nil) unless a line not visible here skips them -- confirm.
132 subindex.each_with_index do |entry, idx|
# Slot for bucket idx: two 32-bit little-endian values, the data offset and
# the data size.
134 zdump.seek((idx*8) + indexloc)
135 zdump.print([location, entry.size].pack('V2'))
139 p << "*" * 80 << "\n"
# NOTE(review): the logged seek position is idx*8 + location, while the
# actual seek above used idx*8 + indexloc -- the log line looks wrong.
140 p << "seek #{(idx*8) + location} location #{location} size #{entry.size}" << "\n"
141 p << unpack(entry).join(":") << "\n"
# The next bucket's data follows immediately after this one.
143 location += entry.size
145 puts "Finished. #{Time.now - t}"