// fileMetaEntry is a struct type; its fields are not visible in this listing.
// NOTE(review): presumably the fixed-size per-trigram record stored in the
// posting.*.meta files — confirm against the full declaration.
13 type fileMetaEntry
struct {
// indexMeta holds the per-source-index state that ConcatN keeps for the
// "docid" and "pos" postings: ConcatN fills one value per source directory,
// setting the fields docidBase and rd (rd comes from newPForReader).
19 type indexMeta
struct {
// posrelMetaEntry is the element type of the per-trigram slices that
// readPosrelMeta builds; readPosrelMeta sets its offset field from the
// OffsetData of the entry it read.
24 type posrelMetaEntry
struct {
// posrelMeta holds the per-source-index state for the "posrel" postings:
// ConcatN fills one value per source directory, setting rd to the reader
// returned by newPosrelReader.
29 type posrelMeta
struct {
// readMeta opens the file posting.<typ>.meta inside dir and reads it as a
// sequence of fixed-size records of metaEntrySize bytes each. For every
// record, the leading four bytes are decoded as a little-endian uint32 and
// converted to a Trigram, and idxid is appended to idx[trigram]. After the
// call, idx therefore lists, per trigram, the ids of the indexes whose meta
// file mentions that trigram.
//
// NOTE(review): several lines of this function (error returns, the
// definition of st, closing braces) are not part of this listing; the
// comments describe only the visible statements.
33 func readMeta(dir
, typ
string, idx
map[Trigram
][]uint32, idxid
uint32) error
{
34 f
, err
:= os
.Open(filepath
.Join(dir
, "posting."+typ
+".meta"))
// Buffer the reads: the loop below issues one small read per record.
43 bufr
:= bufio
.NewReader(f
)
// buf is allocated once and reused for every record.
45 buf
:= make([]byte, metaEntrySize
)
// One iteration per whole record in the file. st is defined on a line not
// shown here (presumably the result of f.Stat() — confirm).
46 for i
:= 0; i
< (int(st
.Size()) / metaEntrySize
); i
++ {
// io.ReadFull reports an error unless a complete record was read.
47 if _
, err
:= io
.ReadFull(bufr
, buf
); err
!= nil {
// Only the trigram at the start of the record is needed here.
50 t
:= Trigram(binary
.LittleEndian
.Uint32(buf
))
51 idx
[t
] = append(idx
[t
], idxid
)
// readPosrelMeta opens the file posting.posrel.meta inside dir and reads it
// as a sequence of fixed-size records of metaEntrySize bytes each. For every
// record it appends a posrelMetaEntry to idx[entry.Trigram], with the offset
// field taken from entry.OffsetData.
//
// NOTE(review): the lines that decode buf into entry, the remaining fields of
// the posrelMetaEntry literal (presumably including idxid), the definition of
// st and the error returns are not part of this listing — confirm against the
// full file.
56 func readPosrelMeta(dir
string, idx
map[Trigram
][]posrelMetaEntry
, idxid
uint32) error
{
57 f
, err
:= os
.Open(filepath
.Join(dir
, "posting.posrel.meta"))
// Buffer the reads: the loop below issues one small read per record.
66 bufr
:= bufio
.NewReader(f
)
// buf is allocated once and reused for every record.
68 buf
:= make([]byte, metaEntrySize
)
// One iteration per whole record in the file.
70 for i
:= 0; i
< (int(st
.Size()) / metaEntrySize
); i
++ {
// io.ReadFull reports an error unless a complete record was read.
71 if _
, err
:= io
.ReadFull(bufr
, buf
); err
!= nil {
75 idx
[entry
.Trigram
] = append(idx
[entry
.Trigram
], posrelMetaEntry
{
77 offset
: entry
.OffsetData
,
// debugTrigram is initialised from a function literal that takes a
// three-character string and packs three bytes into a Trigram value:
// t[0] in bits 16-23, t[1] in bits 8-15 and t[2] in bits 0-7.
//
// NOTE(review): t is defined on a line not shown here (presumably a byte view
// of the trigram parameter). ConcatN compares Trigram values against
// debugTrigram with !=, so the literal is presumably invoked immediately with
// a fixed string — confirm against the full file.
85 var debugTrigram
= func(trigram
string) Trigram
{
87 return Trigram(uint32(t
[0])<<16 |
uint32(t
[1])<<8 |
uint32(t
[2]))
// ConcatN merges the index directories listed in srcdirs into a single index
// written to destdir. The visible steps are:
//
//  1. docid.map: the line-oriented portion of every source docid.map is
//     copied into destdir/docid.map, followed by a freshly built table of
//     line offsets and, last, the offset at which that table starts.
//  2. Readers for the "docid", "pos" and "posrel" postings of every source
//     directory are opened, and readMeta records which source indexes
//     contain each trigram.
//  3. For every trigram in ascending order, merged "docid", "posrel" and
//     "pos" postings and their posting.*.meta files are written to destdir.
//
// NOTE(review): many lines of this function are not part of this listing
// (error returns, closing braces, the definitions of st, base, last, me and
// offsets, and the scopes that allow names such as cw, dw and meBuf to be
// declared more than once). The comments describe only visible statements.
90 func ConcatN(destdir
string, srcdirs
[]string) error
{
// --- Step 1: merge the docid.map files ---
91 fDocidMap
, err
:= os
.Create(filepath
.Join(destdir
, "docid.map"))
95 defer fDocidMap
.Close()
// cw tracks how many bytes have been written; cw.offset is used below as the
// position of each copied line and of the offsets table.
96 cw
:= newCountingWriter(fDocidMap
)
102 bufr
:= bufio
.NewReader(nil)
// One slot per source directory. NOTE(review): the statement filling bases is
// not shown here; presumably bases[idx] is the first docid assigned to source
// index idx in the merged index — confirm.
103 bases
:= make([]uint32, len(srcdirs
))
104 for idx
, dir
:= range srcdirs
{
106 f
, err
:= os
.Open(filepath
.Join(dir
, "docid.map"))
// The last four bytes of a source docid.map hold the index offset read below.
115 if _
, err
:= f
.Seek(-4, io
.SeekEnd
); err
!= nil {
118 // Locate index offset:
119 var indexOffset
uint32
120 if err
:= binary
.Read(f
, binary
.LittleEndian
, &indexOffset
); err
!= nil {
124 // TODO: detect |base| overflows
// Everything between indexOffset and the 4-byte trailer is counted as 4-byte
// entries, one per docid.
125 n
:= (uint32(st
.Size()) - indexOffset
- 4) / 4
126 log
.Printf("%s (idx %d) contains %d docids", dir
, idx
, n
)
// Rewind to copy the line-oriented part from the beginning of the file.
129 if _
, err
:= f
.Seek(0, io
.SeekStart
); err
!= nil {
133 // TODO(performance): measure whether using the index and incrementing
134 // the offsets is any faster than this method:
// The LimitedReader stops the scanner after indexOffset bytes, i.e. before
// the source file's own offsets table.
135 scanner
:= bufio
.NewScanner(&io
.LimitedReader
{
137 N
: int64(indexOffset
)})
// Record where this line starts in the merged file, then copy the line and
// re-add the newline that the scanner stripped.
// NOTE(review): the return values of both cw.Write calls are ignored.
139 offsets
= append(offsets
, uint32(cw
.offset
))
140 cw
.Write(scanner
.Bytes())
141 cw
.Write([]byte{'\n'})
143 if err
:= scanner
.Err(); err
!= nil {
// Trailer of the merged docid.map: the offsets table, then the position at
// which that table starts (both little-endian).
147 indexStart
:= uint32(cw
.offset
)
148 if err
:= binary
.Write(&cw
, binary
.LittleEndian
, offsets
); err
!= nil {
151 if err
:= binary
.Write(&cw
, binary
.LittleEndian
, indexStart
); err
!= nil {
155 if err
:= cw
.Close(); err
!= nil {
// --- Step 2: open the per-source posting readers ---
159 log
.Printf("reading fileMetaEntries")
// One reader entry per source directory and posting type.
161 idxMetaDocid
:= make([]indexMeta
, len(srcdirs
))
162 idxMetaPos
:= make([]indexMeta
, len(srcdirs
))
163 idxMetaPosrel
:= make([]posrelMeta
, len(srcdirs
))
// idxDocid maps each trigram to the positions (within srcdirs) of the source
// indexes whose docid meta file mentions it; readMeta fills it in.
165 idxDocid
:= make(map[Trigram
][]uint32)
166 for idx
, dir
:= range srcdirs
{
170 rd
, err
:= newPForReader(dir
, "docid")
// NOTE(review): base is defined on a line not shown here.
175 idxMetaDocid
[idx
] = indexMeta
{docidBase
: base
, rd
: rd
}
178 if err
:= readMeta(dir
, "docid", idxDocid
, uint32(idx
)); err
!= nil {
183 rd
, err
:= newPForReader(dir
, "pos")
188 idxMetaPos
[idx
] = indexMeta
{docidBase
: base
, rd
: rd
}
192 rd
, err
:= newPosrelReader(dir
)
198 idxMetaPosrel
[idx
] = posrelMeta
{rd
: rd
}
// Map iteration order is random, so collect the trigrams and sort them
// ascending; all three merge passes below iterate in this order.
202 trigrams
:= make([]Trigram
, 0, len(idxDocid
))
203 for t
:= range idxDocid
{
204 trigrams
= append(trigrams
, t
)
206 sort
.Slice(trigrams
, func(i
, j
int) bool { return trigrams
[i
] < trigrams
[j
] })
// --- Step 3a: merged docid postings ---
209 log
.Printf("writing merged docids")
210 dw
, err
:= newPForWriter(destdir
, "docid")
215 fDocidMeta
, err
:= os
.Create(filepath
.Join(destdir
, "posting.docid.meta"))
219 defer fDocidMeta
.Close()
220 bufwDocidMeta
:= bufio
.NewWriter(fDocidMeta
)
// meBuf is the reusable buffer whose contents are written to the meta file
// once per trigram; dr is reused across all source postings.
222 meBuf
:= make([]byte, metaEntrySize
)
223 dr
:= NewDeltaReader()
224 for _
, t
:= range trigrams
{
// Debugging filter on a single trigram. NOTE(review): the body of this if is
// not shown here.
226 if t
!= debugTrigram
{
231 //for _, t := range []trigram{trigram(6650227), trigram(7959906)} {
232 //ctrl, data := dw.Offsets()
// The merged meta entry records where this trigram's data starts in the
// output.
237 OffsetData
: dw
.Offset(),
// Visit every source index that contains this trigram.
240 for _
, idxid
:= range idxDocid
[t
] {
241 idx
:= idxMetaDocid
[idxid
]
242 meta
, err
:= idx
.rd
.metaEntry1(t
)
244 if err
== errNotFound
{
// The merged entry count is the sum over all contributing source indexes.
249 me
.Entries
+= meta
.Entries
250 dr
.Reset(meta
, idx
.rd
.data
.Data
)
251 docids
:= dr
.Read() // returns non-nil at least once
252 // Bump the first docid: it needs to be mapped from the old
253 // docid range [0, n) to the new docid range [base, base+n).
255 // Since we are building a single docid list for this trigram,
256 // the new value needs to be a delta, hence, subtract last.
// NOTE(review): last is maintained on lines not shown here.
257 docids
[0] += (idx
.docidBase
- last
)
259 for _
, d
:= range docids
{
260 if err
:= dw
.PutUint32(d
); err
!= nil {
268 if err
:= dw
.Flush(); err
!= nil {
272 if _
, err
:= bufwDocidMeta
.Write(meBuf
); err
!= nil {
273 //if err := binary.Write(bufwDocidMeta, binary.LittleEndian, &me); err != nil {
// Flush the buffered meta entries before closing the file and the writer.
278 if err
:= bufwDocidMeta
.Flush(); err
!= nil {
282 if err
:= fDocidMeta
.Close(); err
!= nil {
286 if err
:= dw
.Close(); err
!= nil {
// --- Step 3b: merged posrel postings ---
292 log
.Printf("writing merged posrel")
293 fmeta
, err
:= os
.Create(filepath
.Join(destdir
, "posting.posrel.meta"))
298 bufwmeta
:= bufio
.NewWriter(fmeta
)
300 fposrel
, err
:= os
.Create(filepath
.Join(destdir
, "posting.posrel.data"))
304 defer fposrel
.Close()
// cw counts the bytes written to posting.posrel.data so that each trigram's
// starting offset can be stored in its meta entry.
305 cw
:= newCountingWriter(fposrel
)
306 pw
:= newPosrelWriter(&cw
)
307 for _
, t
:= range trigrams
{
// Debugging filter on a single trigram (body not shown here).
309 if t
!= debugTrigram
{
// 2105376 == 0x202020, the value produced by debugTrigram's packing for three
// 0x20 (space) bytes. NOTE(review): the body of this special case is not
// shown here.
313 if t
== 2105376 { // TODO: document: " "?
319 OffsetData
: int64(cw
.offset
),
321 if err
:= binary
.Write(bufwmeta
, binary
.LittleEndian
, &me
); err
!= nil {
324 for _
, idxid
:= range idxDocid
[t
] {
325 // TODO: refactor all metaEntry1 to use ,ok idiom, they only ever return errNotFound
// NOTE(review): this fmeta is the "pos" meta entry for t and shadows the
// posting.posrel.meta file handle of the same name inside this loop.
326 fmeta
, err
:= idxMetaPos
[idxid
].rd
.metaEntry1(t
)
328 if err
== errNotFound
{
334 pmeta
, err
:= idxMetaPosrel
[idxid
].rd
.metaEntry1(t
)
336 if err
== errNotFound
{
// b starts at this trigram's posrel data in the source index; the number of
// entries passed to pw.Write comes from the "pos" meta entry.
341 b
:= idxMetaPosrel
[idxid
].rd
.data
.Data
[pmeta
.OffsetData
:]
342 if err
:= pw
.Write(b
, int(fmeta
.Entries
)); err
!= nil {
347 if err
:= pw
.Flush(); err
!= nil {
351 if err
:= bufwmeta
.Flush(); err
!= nil {
354 if err
:= fmeta
.Close(); err
!= nil {
357 if err
:= cw
.Close(); err
!= nil {
// --- Step 3c: merged pos postings ---
363 log
.Printf("writing merged pos")
364 dw
, err
:= newPForWriter(destdir
, "pos")
// The variable names are shared with the docid pass, but this file is
// posting.pos.meta.
369 fDocidMeta
, err
:= os
.Create(filepath
.Join(destdir
, "posting.pos.meta"))
373 defer fDocidMeta
.Close()
374 bufwDocidMeta
:= bufio
.NewWriter(fDocidMeta
)
376 meBuf
:= make([]byte, metaEntrySize
)
377 dr
:= NewDeltaReader()
378 //for _, t := range []trigram{trigram(6650227), trigram(7959906)} {
379 for _
, t
:= range trigrams
{
// Debugging filter on a single trigram (body not shown here).
381 if t
!= debugTrigram
{
// Same special case as in the posrel pass (2105376 == 0x202020); body not
// shown here.
386 if t
== 2105376 { // TODO: document: " "?
390 //ctrl, data := dw.Offsets()
395 OffsetData
: dw
.Offset(),
398 for _
, idxid
:= range idxDocid
[t
] {
399 idx
:= idxMetaPos
[idxid
]
400 meta
, err
:= idx
.rd
.metaEntry1(t
)
402 if err
== errNotFound
{
407 me
.Entries
+= meta
.Entries
408 dr
.Reset(meta
, idx
.rd
.data
.Data
)
// Unlike the docid pass, no base adjustment is visible here: values are
// copied chunk by chunk until dr.Read returns nil.
410 for docids
:= dr
.Read(); docids
!= nil; docids
= dr
.Read() {
411 for _
, d
:= range docids
{
412 if err
:= dw
.PutUint32(d
); err
!= nil {
419 if err
:= dw
.Flush(); err
!= nil {
424 if _
, err
:= bufwDocidMeta
.Write(meBuf
); err
!= nil {
425 //if err := binary.Write(bufwDocidMeta, binary.LittleEndian, &me); err != nil {
// Flush the buffered meta entries before closing the file and the writer.
430 if err
:= bufwDocidMeta
.Flush(); err
!= nil {
434 if err
:= fDocidMeta
.Close(); err
!= nil {
438 if err
:= dw
.Close(); err
!= nil {