show: make highlight legible
[debiancodesearch.git] / internal / index / merge.go
blobdaa1e7b8e7ab573fb4ef0e2585f9e584d5193d89
1 package index
import (
	"bufio"
	"encoding/binary"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"sort"
)
// fileMetaEntry groups an index id with an entry count and a file offset.
//
// NOTE(review): nothing in this file references the type; presumably it
// describes where one source index stores the posting list of a trigram.
// Confirm against the other users in the package.
type fileMetaEntry struct {
	idxid   uint32
	entries uint32
	offset  int64
}
19 type indexMeta struct {
20 docidBase uint32
21 rd *PForReader
// posrelMetaEntry records, for one trigram, which source index (idxid)
// contains it and the data offset stored in that index's posrel meta entry.
type posrelMetaEntry struct {
	idxid  uint32
	offset int64
}
29 type posrelMeta struct {
30 rd *PosrelReader
33 func readMeta(dir, typ string, idx map[Trigram][]uint32, idxid uint32) error {
34 f, err := os.Open(filepath.Join(dir, "posting."+typ+".meta"))
35 if err != nil {
36 return err
38 defer f.Close()
39 st, err := f.Stat()
40 if err != nil {
41 return err
43 bufr := bufio.NewReader(f)
45 buf := make([]byte, metaEntrySize)
46 for i := 0; i < (int(st.Size()) / metaEntrySize); i++ {
47 if _, err := io.ReadFull(bufr, buf); err != nil {
48 return err
50 t := Trigram(binary.LittleEndian.Uint32(buf))
51 idx[t] = append(idx[t], idxid)
53 return nil
56 func readPosrelMeta(dir string, idx map[Trigram][]posrelMetaEntry, idxid uint32) error {
57 f, err := os.Open(filepath.Join(dir, "posting.posrel.meta"))
58 if err != nil {
59 return err
61 defer f.Close()
62 st, err := f.Stat()
63 if err != nil {
64 return err
66 bufr := bufio.NewReader(f)
68 buf := make([]byte, metaEntrySize)
69 var entry MetaEntry
70 for i := 0; i < (int(st.Size()) / metaEntrySize); i++ {
71 if _, err := io.ReadFull(bufr, buf); err != nil {
72 return err
74 entry.Unmarshal(buf)
75 idx[entry.Trigram] = append(idx[entry.Trigram], posrelMetaEntry{
76 idxid: idxid,
77 offset: entry.OffsetData,
80 return nil
83 const debug = false
85 var debugTrigram = func(trigram string) Trigram {
86 t := []byte(trigram)
87 return Trigram(uint32(t[0])<<16 | uint32(t[1])<<8 | uint32(t[2]))
88 }("_op")
90 func ConcatN(destdir string, srcdirs []string) error {
91 fDocidMap, err := os.Create(filepath.Join(destdir, "docid.map"))
92 if err != nil {
93 return err
95 defer fDocidMap.Close()
96 cw := newCountingWriter(fDocidMap)
98 var (
99 base uint32
100 offsets []uint32
102 bufr := bufio.NewReader(nil)
103 bases := make([]uint32, len(srcdirs))
104 for idx, dir := range srcdirs {
105 bases[idx] = base
106 f, err := os.Open(filepath.Join(dir, "docid.map"))
107 if err != nil {
108 return err
110 defer f.Close()
111 st, err := f.Stat()
112 if err != nil {
113 return err
115 if _, err := f.Seek(-4, io.SeekEnd); err != nil {
116 return err
118 // Locate index offset:
119 var indexOffset uint32
120 if err := binary.Read(f, binary.LittleEndian, &indexOffset); err != nil {
121 return err
124 // TODO: detect |base| overflows
125 n := (uint32(st.Size()) - indexOffset - 4) / 4
126 log.Printf("%s (idx %d) contains %d docids", dir, idx, n)
127 base += n
129 if _, err := f.Seek(0, io.SeekStart); err != nil {
130 return err
132 bufr.Reset(f)
133 // TODO(performance): measure whether using the index and incrementing
134 // the offsets is any faster than this method:
135 scanner := bufio.NewScanner(&io.LimitedReader{
136 R: bufr,
137 N: int64(indexOffset)})
138 for scanner.Scan() {
139 offsets = append(offsets, uint32(cw.offset))
140 cw.Write(scanner.Bytes())
141 cw.Write([]byte{'\n'})
143 if err := scanner.Err(); err != nil {
144 return err
147 indexStart := uint32(cw.offset)
148 if err := binary.Write(&cw, binary.LittleEndian, offsets); err != nil {
149 return err
151 if err := binary.Write(&cw, binary.LittleEndian, indexStart); err != nil {
152 return err
155 if err := cw.Close(); err != nil {
156 return err
159 log.Printf("reading fileMetaEntries")
161 idxMetaDocid := make([]indexMeta, len(srcdirs))
162 idxMetaPos := make([]indexMeta, len(srcdirs))
163 idxMetaPosrel := make([]posrelMeta, len(srcdirs))
165 idxDocid := make(map[Trigram][]uint32)
166 for idx, dir := range srcdirs {
167 base := bases[idx]
170 rd, err := newPForReader(dir, "docid")
171 if err != nil {
172 return err
174 defer rd.Close()
175 idxMetaDocid[idx] = indexMeta{docidBase: base, rd: rd}
178 if err := readMeta(dir, "docid", idxDocid, uint32(idx)); err != nil {
179 return err
183 rd, err := newPForReader(dir, "pos")
184 if err != nil {
185 return err
187 defer rd.Close()
188 idxMetaPos[idx] = indexMeta{docidBase: base, rd: rd}
192 rd, err := newPosrelReader(dir)
193 if err != nil {
194 return err
196 defer rd.Close()
198 idxMetaPosrel[idx] = posrelMeta{rd: rd}
202 trigrams := make([]Trigram, 0, len(idxDocid))
203 for t := range idxDocid {
204 trigrams = append(trigrams, t)
206 sort.Slice(trigrams, func(i, j int) bool { return trigrams[i] < trigrams[j] })
209 log.Printf("writing merged docids")
210 dw, err := newPForWriter(destdir, "docid")
211 if err != nil {
212 return err
215 fDocidMeta, err := os.Create(filepath.Join(destdir, "posting.docid.meta"))
216 if err != nil {
217 return err
219 defer fDocidMeta.Close()
220 bufwDocidMeta := bufio.NewWriter(fDocidMeta)
222 meBuf := make([]byte, metaEntrySize)
223 dr := NewDeltaReader()
224 for _, t := range trigrams {
225 if debug {
226 if t != debugTrigram {
227 continue
231 //for _, t := range []trigram{trigram(6650227), trigram(7959906)} {
232 //ctrl, data := dw.Offsets()
233 me := MetaEntry{
234 Trigram: t,
235 //OffsetCtrl: ctrl,
236 //OffsetEnc: data,
237 OffsetData: dw.Offset(),
239 var last uint32
240 for _, idxid := range idxDocid[t] {
241 idx := idxMetaDocid[idxid]
242 meta, err := idx.rd.metaEntry1(t)
243 if err != nil {
244 if err == errNotFound {
245 continue
247 return err
249 me.Entries += meta.Entries
250 dr.Reset(meta, idx.rd.data.Data)
251 docids := dr.Read() // returns non-nil at least once
252 // Bump the first docid: it needs to be mapped from the old
253 // docid range [0, n) to the new docid range [base, base+n).
255 // Since we are building a single docid list for this trigram,
256 // the new value needs to be a delta, hence, subtract last.
257 docids[0] += (idx.docidBase - last)
258 for docids != nil {
259 for _, d := range docids {
260 if err := dw.PutUint32(d); err != nil {
261 return err
263 last += d
265 docids = dr.Read()
268 if err := dw.Flush(); err != nil {
269 return err
271 me.Marshal(meBuf)
272 if _, err := bufwDocidMeta.Write(meBuf); err != nil {
273 //if err := binary.Write(bufwDocidMeta, binary.LittleEndian, &me); err != nil {
274 return err
278 if err := bufwDocidMeta.Flush(); err != nil {
279 return err
282 if err := fDocidMeta.Close(); err != nil {
283 return err
286 if err := dw.Close(); err != nil {
287 return err
292 log.Printf("writing merged posrel")
293 fmeta, err := os.Create(filepath.Join(destdir, "posting.posrel.meta"))
294 if err != nil {
295 return err
297 defer fmeta.Close()
298 bufwmeta := bufio.NewWriter(fmeta)
300 fposrel, err := os.Create(filepath.Join(destdir, "posting.posrel.data"))
301 if err != nil {
302 return err
304 defer fposrel.Close()
305 cw := newCountingWriter(fposrel)
306 pw := newPosrelWriter(&cw)
307 for _, t := range trigrams {
308 if debug {
309 if t != debugTrigram {
310 continue
313 if t == 2105376 { // TODO: document: " "?
314 continue
317 me := MetaEntry{
318 Trigram: t,
319 OffsetData: int64(cw.offset),
321 if err := binary.Write(bufwmeta, binary.LittleEndian, &me); err != nil {
322 return err
324 for _, idxid := range idxDocid[t] {
325 // TODO: refactor all metaEntry1 to use ,ok idiom, they only ever return errNotFound
326 fmeta, err := idxMetaPos[idxid].rd.metaEntry1(t)
327 if err != nil {
328 if err == errNotFound {
329 continue
331 return err
334 pmeta, err := idxMetaPosrel[idxid].rd.metaEntry1(t)
335 if err != nil {
336 if err == errNotFound {
337 continue
339 return err
341 b := idxMetaPosrel[idxid].rd.data.Data[pmeta.OffsetData:]
342 if err := pw.Write(b, int(fmeta.Entries)); err != nil {
343 return err
347 if err := pw.Flush(); err != nil {
348 return err
351 if err := bufwmeta.Flush(); err != nil {
352 return err
354 if err := fmeta.Close(); err != nil {
355 return err
357 if err := cw.Close(); err != nil {
358 return err
363 log.Printf("writing merged pos")
364 dw, err := newPForWriter(destdir, "pos")
365 if err != nil {
366 return err
369 fDocidMeta, err := os.Create(filepath.Join(destdir, "posting.pos.meta"))
370 if err != nil {
371 return err
373 defer fDocidMeta.Close()
374 bufwDocidMeta := bufio.NewWriter(fDocidMeta)
376 meBuf := make([]byte, metaEntrySize)
377 dr := NewDeltaReader()
378 //for _, t := range []trigram{trigram(6650227), trigram(7959906)} {
379 for _, t := range trigrams {
380 if debug {
381 if t != debugTrigram {
382 continue
386 if t == 2105376 { // TODO: document: " "?
387 continue
390 //ctrl, data := dw.Offsets()
391 me := MetaEntry{
392 Trigram: t,
393 // OffsetCtrl: ctrl,
394 // OffsetEnc: data,
395 OffsetData: dw.Offset(),
398 for _, idxid := range idxDocid[t] {
399 idx := idxMetaPos[idxid]
400 meta, err := idx.rd.metaEntry1(t)
401 if err != nil {
402 if err == errNotFound {
403 continue
405 return err
407 me.Entries += meta.Entries
408 dr.Reset(meta, idx.rd.data.Data)
410 for docids := dr.Read(); docids != nil; docids = dr.Read() {
411 for _, d := range docids {
412 if err := dw.PutUint32(d); err != nil {
413 return err
419 if err := dw.Flush(); err != nil {
420 return err
423 me.Marshal(meBuf)
424 if _, err := bufwDocidMeta.Write(meBuf); err != nil {
425 //if err := binary.Write(bufwDocidMeta, binary.LittleEndian, &me); err != nil {
426 return err
430 if err := bufwDocidMeta.Flush(); err != nil {
431 return err
434 if err := fDocidMeta.Close(); err != nil {
435 return err
438 if err := dw.Close(); err != nil {
439 return err
443 return nil