1 /********************************************************************
3 * THIS FILE IS PART OF THE OggVorbis SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
8 * THE OggVorbis SOURCE CODE IS (C) COPYRIGHT 1994-2001 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
11 ********************************************************************
13 function: utility for paring low hit count cells from lattice codebook
14 last mod: $Id: latticepare.c 16037 2009-05-26 21:10:58Z xiphmont $
16 ********************************************************************/
23 #include "../lib/scales.h"
27 #include "../lib/os.h"
29 /* Lattice codebooks have two strengths: important fetaures that are
30 poorly modelled by global error minimization training (eg, strong
31 peaks) are not neglected 2) compact quantized representation.
33 A fully populated lattice codebook, however, swings point 1 too far
34 in the opposite direction; rare features need not be modelled quite
35 so religiously and as such, we waste bits unless we eliminate the
36 least common cells. The codebook rep supports unused cells, so we
37 need to tag such cells and build an auxiliary (non-thresh) search
38 mechanism to find the proper match quickly */
40 /* two basic steps; first is pare the cell for which dispersal creates
41 the least additional error. This will naturally choose
42 low-population cells and cells that have not taken on points from
43 neighboring paring (but does not result in the lattice collapsing
44 inward and leaving low population ares totally unmodelled). After
45 paring has removed the desired number of cells, we need to build an
46 auxiliary search for each culled point */
48 /* Although lattice books (due to threshhold-based matching) do not
49 actually use error to make cell selections (in fact, it need not
50 bear any relation), the 'secondbest' entry finder here is in fact
51 error/distance based, so latticepare is only useful on such books */
54 latticepare latticebook.vqh input_data.vqd <target_cells>
56 produces a new output book on stdout
59 static float _dist(int el
,float *a
, float *b
){
63 float val
=(a
[i
]-b
[i
]);
69 static float *pointlist
;
72 void add_vector(codebook
*b
,float *vec
,long n
){
77 pointlist
[points
++]=vec
[j
];
82 static int bestm(codebook
*b
,float *vec
){
83 encode_aux_threshmatch
*tt
=b
->c
->thresh_tree
;
88 /* what would be the closest match if the codebook was fully
91 for(k
=0,o
=dim
-1;k
<dim
;k
++,o
--){
93 for(i
=0;i
<tt
->threshvals
-1;i
++)
94 if(vec
[o
]<tt
->quantthresh
[i
])break;
95 best
=(best
*tt
->quantvals
)+tt
->quantmap
[i
];
100 static int closest(codebook
*b
,float *vec
,int current
){
101 encode_aux_threshmatch
*tt
=b
->c
->thresh_tree
;
107 int best
=bestm(b
,vec
);
109 if(current
<0 && b
->c
->lengthlist
[best
]>0)return best
;
111 for(i
=0;i
<b
->entries
;i
++){
112 if(b
->c
->lengthlist
[i
]>0 && i
!=best
&& i
!=current
){
113 float thismetric
=_dist(dim
, vec
, b
->valuelist
+i
*dim
);
114 if(bestentry
==-1 || thismetric
<bestmetric
){
116 bestmetric
=thismetric
;
124 static float _heuristic(codebook
*b
,float *ppt
,int secondbest
){
125 float *secondcell
=b
->valuelist
+secondbest
*b
->dim
;
126 int best
=bestm(b
,ppt
);
127 float *firstcell
=b
->valuelist
+best
*b
->dim
;
128 float error
=_dist(b
->dim
,firstcell
,secondcell
);
129 float *zero
=alloca(b
->dim
*sizeof(float));
132 memset(zero
,0,b
->dim
*sizeof(float));
133 fromzero
=sqrt(_dist(b
->dim
,firstcell
,zero
));
135 return(error
/fromzero
);
138 static int longsort(const void *a
, const void *b
){
139 return **(long **)b
-**(long **)a
;
143 fprintf(stderr
,"Ogg/Vorbis lattice codebook paring utility\n\n"
144 "usage: latticepare book.vqh data.vqd <target_cells> <protected_cells> base\n"
145 "where <target_cells> is the desired number of final cells (or -1\n"
147 " <protected_cells> is the number of highest-hit count cells\n"
148 " to protect from dispersal\n"
149 " base is the base name (not including .vqh) of the new\n"
154 int main(int argc
,char *argv
[]){
159 long i
,j
,target
=-1,protect
=-1;
179 /* yes, this is evil. However, it's very convenient to parse file
182 /* input file. What kind? */
185 char *name
=strdup(*argv
++);
186 dot
=strrchr(name
,'.');
196 if(!strcmp(ext
,"vqh")){
198 basename
=strrchr(name
,'/');
200 basename
=strdup(basename
)+1;
202 basename
=strdup(name
);
203 dot
=strrchr(basename
,'.');
206 b
=codebook_load(name
);
211 /* data file; we do actually need to suck it into memory */
212 /* we're dealing with just one book, so we can de-interleave */
213 if(!strcmp(ext
,"vqd") && !points
){
218 FILE *in
=fopen(name
,"r");
220 fprintf(stderr
,"Could not open input file %s\n",name
);
226 /* count cols before we start reading */
229 while(*temp
==' ')temp
++;
230 for(cols
=0;*temp
;cols
++){
231 while(*temp
>32)temp
++;
232 while(*temp
==' ')temp
++;
235 vec
=alloca(cols
*sizeof(float));
236 /* count, then load, to avoid fragmenting the hell out of
241 if(get_line_value(in
,vec
+j
)){
242 fprintf(stderr
,"Too few columns on line %ld in data file\n",lines
);
245 if((lines
&0xff)==0)spinnit("counting samples...",lines
*cols
);
248 pointlist
=_ogg_malloc((cols
*lines
+entries
*dim
)*sizeof(float));
255 if(get_line_value(in
,vec
+j
)){
256 fprintf(stderr
,"Too few columns on line %ld in data file\n",lines
);
259 /* deinterleave, add to heap */
260 add_vector(b
,vec
,cols
);
261 if((lines
&0xff)==0)spinnit("loading samples...",lines
*cols
);
270 target
=atol(*argv
++);
271 if(target
==0)target
=entries
;
274 protect
=atol(*argv
++);
278 char *buff
=alloca(strlen(*argv
)+5);
279 sprintf(buff
,"%s.vqh",*argv
);
284 fprintf(stderr
,"unable ot open %s for output",buff
);
294 if(!entries
|| !points
|| !out
)usage();
295 if(target
==-1)usage();
297 /* add guard points */
298 for(i
=0;i
<entries
;i
++)
300 pointlist
[points
++]=b
->valuelist
[i
*dim
+j
];
304 /* set up auxiliary vectors for error tracking */
306 encode_aux_nearestmatch
*nt
=NULL
;
309 long indexedpoints
=0;
312 long *membership
=_ogg_malloc(points
*sizeof(long));
313 long *firsthead
=_ogg_malloc(entries
*sizeof(long));
314 long *secondary
=_ogg_malloc(points
*sizeof(long));
315 long *secondhead
=_ogg_malloc(entries
*sizeof(long));
317 long *cellcount
=_ogg_calloc(entries
,sizeof(long));
318 long *cellcount2
=_ogg_calloc(entries
,sizeof(long));
319 float *cellerror
=_ogg_calloc(entries
,sizeof(float));
320 float *cellerrormax
=_ogg_calloc(entries
,sizeof(float));
321 long cellsleft
=entries
;
322 for(i
=0;i
<points
;i
++)membership
[i
]=-1;
323 for(i
=0;i
<entries
;i
++)firsthead
[i
]=-1;
324 for(i
=0;i
<points
;i
++)secondary
[i
]=-1;
325 for(i
=0;i
<entries
;i
++)secondhead
[i
]=-1;
327 for(i
=0;i
<points
;i
++){
328 /* assign vectors to the nearest cell. Also keep track of second
329 nearest for error statistics */
330 float *ppt
=pointlist
+i
*dim
;
331 int firstentry
=closest(b
,ppt
,-1);
332 int secondentry
=closest(b
,ppt
,firstentry
);
333 float firstmetric
=_dist(dim
,b
->valuelist
+dim
*firstentry
,ppt
);
334 float secondmetric
=_dist(dim
,b
->valuelist
+dim
*secondentry
,ppt
);
336 if(!(i
&0xff))spinnit("initializing... ",points
-i
);
338 membership
[i
]=firsthead
[firstentry
];
339 firsthead
[firstentry
]=i
;
340 secondary
[i
]=secondhead
[secondentry
];
341 secondhead
[secondentry
]=i
;
343 if(i
<points
-entries
){
344 cellerror
[firstentry
]+=secondmetric
-firstmetric
;
345 cellerrormax
[firstentry
]=max(cellerrormax
[firstentry
],
346 _heuristic(b
,ppt
,secondentry
));
347 cellcount
[firstentry
]++;
348 cellcount2
[secondentry
]++;
352 /* which cells are most heavily populated? Protect as many from
353 dispersal as the user has requested */
355 long **countindex
=_ogg_calloc(entries
,sizeof(long *));
356 for(i
=0;i
<entries
;i
++)countindex
[i
]=cellcount
+i
;
357 qsort(countindex
,entries
,sizeof(long *),longsort
);
358 for(i
=0;i
<protect
;i
++){
359 int ptr
=countindex
[i
]-cellcount
;
360 cellerrormax
[ptr
]=9e50f
;
365 fprintf(stderr
,"\r");
366 for(i
=0;i
<entries
;i
++){
367 /* decompose index */
370 fprintf(stderr
,"%d:",entry
%b
->c
->thresh_tree
->quantvals
);
371 entry
/=b
->c
->thresh_tree
->quantvals
;
374 fprintf(stderr
,":%ld/%ld, ",cellcount
[i
],cellcount2
[i
]);
376 fprintf(stderr
,"\n");
379 /* do the automatic cull request */
380 while(cellsleft
>target
){
386 sprintf(spinbuf
,"cells left to eliminate: %ld : ",cellsleft
-target
);
388 /* find the cell with lowest removal impact */
389 for(i
=0;i
<entries
;i
++){
390 if(b
->c
->lengthlist
[i
]>0){
391 if(bestcell
==-1 || cellerrormax
[i
]<=besterror2
){
392 if(bestcell
==-1 || cellerrormax
[i
]<besterror2
||
393 besterror
>cellerror
[i
]){
394 besterror
=cellerror
[i
];
395 besterror2
=cellerrormax
[i
];
402 fprintf(stderr
,"\reliminating cell %d \n"
403 " dispersal error of %g max/%g total (%ld hits)\n",
404 bestcell
,besterror2
,besterror
,cellcount
[bestcell
]);
406 /* disperse it. move each point out, adding it (properly) to
408 b
->c
->lengthlist
[bestcell
]=0;
409 head
=firsthead
[bestcell
];
410 firsthead
[bestcell
]=-1;
412 /* head is a point number */
413 float *ppt
=pointlist
+head
*dim
;
414 int firstentry
=closest(b
,ppt
,-1);
415 int secondentry
=closest(b
,ppt
,firstentry
);
416 float firstmetric
=_dist(dim
,b
->valuelist
+dim
*firstentry
,ppt
);
417 float secondmetric
=_dist(dim
,b
->valuelist
+dim
*secondentry
,ppt
);
418 long next
=membership
[head
];
420 if(head
<points
-entries
){
421 cellcount
[firstentry
]++;
422 cellcount
[bestcell
]--;
423 cellerror
[firstentry
]+=secondmetric
-firstmetric
;
424 cellerrormax
[firstentry
]=max(cellerrormax
[firstentry
],
425 _heuristic(b
,ppt
,secondentry
));
428 membership
[head
]=firsthead
[firstentry
];
429 firsthead
[firstentry
]=head
;
431 if(cellcount
[bestcell
]%128==0)
432 spinnit(spinbuf
,cellcount
[bestcell
]+cellcount2
[bestcell
]);
436 /* now see that all points that had the dispersed cell as second
437 choice have second choice reassigned */
438 head
=secondhead
[bestcell
];
439 secondhead
[bestcell
]=-1;
441 float *ppt
=pointlist
+head
*dim
;
442 /* who are we assigned to now? */
443 int firstentry
=closest(b
,ppt
,-1);
444 /* what is the new second closest match? */
445 int secondentry
=closest(b
,ppt
,firstentry
);
446 /* old second closest is the cell being disbanded */
447 float oldsecondmetric
=_dist(dim
,b
->valuelist
+dim
*bestcell
,ppt
);
448 /* new second closest error */
449 float secondmetric
=_dist(dim
,b
->valuelist
+dim
*secondentry
,ppt
);
450 long next
=secondary
[head
];
452 if(head
<points
-entries
){
453 cellcount2
[secondentry
]++;
454 cellcount2
[bestcell
]--;
455 cellerror
[firstentry
]+=secondmetric
-oldsecondmetric
;
456 cellerrormax
[firstentry
]=max(cellerrormax
[firstentry
],
457 _heuristic(b
,ppt
,secondentry
));
460 secondary
[head
]=secondhead
[secondentry
];
461 secondhead
[secondentry
]=head
;
464 if(cellcount2
[bestcell
]%128==0)
465 spinnit(spinbuf
,cellcount2
[bestcell
]);
471 /* paring is over. Build decision trees using points that now fall
472 through the thresh matcher. */
473 /* we don't free membership; we flatten it in order to use in lp_split */
475 for(i
=0;i
<entries
;i
++){
476 long head
=firsthead
[i
];
477 spinnit("rearranging membership cache... ",entries
-i
);
479 long next
=membership
[head
];
491 pointindex
=_ogg_malloc(points
*sizeof(long));
492 /* make a point index of fall-through points */
493 for(i
=0;i
<points
;i
++){
494 int best
=_best(b
,pointlist
+i
*dim
,1);
496 pointindex
[indexedpoints
++]=i
;
497 spinnit("finding orphaned points... ",points
-i
);
500 /* make an entry index */
501 entryindex
=_ogg_malloc(entries
*sizeof(long));
503 for(i
=0;i
<entries
;i
++){
504 if(b
->c
->lengthlist
[i
]>0)
505 entryindex
[target
++]=i
;
508 /* make working space for a reverse entry index */
509 reventry
=_ogg_malloc(entries
*sizeof(long));
512 nt
=b
->c
->nearest_tree
=
513 _ogg_calloc(1,sizeof(encode_aux_nearestmatch
));
516 nt
->ptr0
=_ogg_malloc(sizeof(long)*nt
->alloc
);
517 nt
->ptr1
=_ogg_malloc(sizeof(long)*nt
->alloc
);
518 nt
->p
=_ogg_malloc(sizeof(long)*nt
->alloc
);
519 nt
->q
=_ogg_malloc(sizeof(long)*nt
->alloc
);
522 fprintf(stderr
,"Leaves added: %d \n",
523 lp_split(pointlist
,points
,
525 pointindex
,indexedpoints
,
532 /* hack alert. I should just change the damned splitter and
534 for(i
=0;i
<nt
->aux
;i
++)nt
->p
[i
]*=dim
;
535 for(i
=0;i
<nt
->aux
;i
++)nt
->q
[i
]*=dim
;
537 /* recount hits. Build new lengthlist. reuse entryindex storage */
538 for(i
=0;i
<entries
;i
++)entryindex
[i
]=1;
539 for(i
=0;i
<points
-entries
;i
++){
540 int best
=_best(b
,pointlist
+i
*dim
,1);
541 float *a
=pointlist
+i
*dim
;
542 if(!(i
&0xff))spinnit("counting hits...",i
);
544 fprintf(stderr
,"\nINTERNAL ERROR; a point count not be matched to a\n"
545 "codebook entry. The new decision tree is broken.\n");
550 for(i
=0;i
<nt
->aux
;i
++)nt
->p
[i
]/=dim
;
551 for(i
=0;i
<nt
->aux
;i
++)nt
->q
[i
]/=dim
;
553 /* the lengthlist builder doesn't actually deal with 0 hit entries.
554 So, we pack the 'sparse' hit list into a dense list, then unpack
555 the lengths after the build */
558 long *lengthlist
=_ogg_calloc(entries
,sizeof(long));
559 for(i
=0;i
<entries
;i
++){
560 if(b
->c
->lengthlist
[i
]>0)
561 entryindex
[upper
++]=entryindex
[i
];
564 fprintf(stderr
,"\nINTERNAL ERROR; _best matched to unused entry\n");
572 fprintf(stderr
,"\nINTERNAL ERROR; packed the wrong number of entries\n");
576 build_tree_from_lengths(upper
,entryindex
,lengthlist
);
579 for(i
=0;i
<entries
;i
++){
580 if(b
->c
->lengthlist
[i
]>0)
581 b
->c
->lengthlist
[i
]=lengthlist
[upper
++];
586 /* we're done. write it out. */
587 write_codebook(out
,basename
,b
->c
);
589 fprintf(stderr
,"\r \nDone.\n");