13 /****************************************************************
14 PSYCHO_4 by MFC Feb 2003
16 This is a cleaned up implementation of psy model 2.
17 This is basically because I was sick of the inconsistencies between
18 the notation in the ISO docs and in the sourcecode.
20 I've nicked a bunch of stuff from LAME to make this a bit easier to grok
21 - ATH values (this also overcomes the lack of mpeg-2 tables
22 which meant that LSF never had proper values)
23 - toolame_freq2bark() to convert frequencies directly to bark values.
24 - spreading_function() isolated the calculation of the spreading function.
25 Basically the same code as before, just isolated in its own function.
26 LAME seem to does some extra tweaks to the ISO1117s model.
27 Not really sure if they help or hinder, so I've commented them out (#ifdef LAME)
29 NB: Because of some of the tweaks to bark value calculation etc, it is now possible
30 to have 64 CBANDS. There's no real limit on the actual number of paritions.
31 I wonder if it's worth experimenting with really higher numbers? Probably won't make
32 that much difference to the final SNR values, but it's something worth trying
33 Maybe CBANDS should be a dynamic value, calculated by the psycho_init function
34 CBANDS definition has been changed in encoder.h from 63 to 64
36 ****************************************************************/
39 /* The static variables "r", "phi_sav", "new", "old" and "oldest" have
40 to be remembered for the unpredictability measure. For "r" and
41 "phi_sav", the first index from the left is the channel select and
42 the second index is the "age" of the data. */
44 static int new = 0, old
= 1, oldest
= 0;
47 /* NMT is a constant 5.5dB. ISO11172 Sec D.2.4.h */
48 static double NMT
= 5.5;
50 /* The index into this array is a bark value
51 This array gives the 'minval' values from ISO11172 Tables D.3.x */
52 static FLOAT minval
[27] = {
64 4.5, 4.5, 4.5,4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, 4.5, /* 11 - 25 */
69 static FLOAT
*grouped_c
, *grouped_e
, *nb
, *cb
, *tb
, *ecb
, *bc
;
70 static FLOAT
*wsamp_r
, *phi
, *energy
;
71 static FLOAT
*c
, *bark
, *thr
;
75 static int *partition
;
76 static FLOAT
*cbval
, *rnorm
;
82 static F2HBLK
*r
, *phi_sav
;
84 #define TRIGTABLESIZE 3142
85 #define TRIGTABLESCALE 1000.0
86 static FLOAT cos_table
[TRIGTABLESIZE
];
87 static FLOAT sin_table
[TRIGTABLESIZE
];
88 void psycho_4_trigtable_init(void) {
91 for (i
=0;i
<TRIGTABLESIZE
;i
++) {
92 cos_table
[i
] = cos((double)i
/TRIGTABLESCALE
);
93 sin_table
[i
] = sin((double)i
/TRIGTABLESCALE
);
97 INLINE FLOAT
psycho_4_cos(FLOAT phi
) {
101 index
= (int)(fabs(phi
) * TRIGTABLESCALE
);
102 while (index
>=TRIGTABLESIZE
) {
103 index
-= TRIGTABLESIZE
;
106 return(sign
* cos_table
[index
]);
109 INLINE FLOAT
psycho_4_sin(FLOAT phi
) {
113 index
= (int)(fabs(phi
) * TRIGTABLESCALE
);
114 while (index
>=TRIGTABLESIZE
) {
115 index
-= TRIGTABLESIZE
;
119 return(-1 * sign
* sin_table
[index
]);
120 return(sign
* sin_table
[index
]);
124 void psycho_4 (short int *buffer
, short int savebuf
[1056], int chn
,
125 double *smr
, double sfreq
, options
*glopts
)
126 /* to match prototype : FLOAT args are always double */
128 unsigned int run
, i
, j
, k
;
129 FLOAT r_prime
, phi_prime
;
133 psycho_4_init (sfreq
, glopts
);
137 for (run
= 0; run
< 2; run
++) {
138 /* Net offset is 480 samples (1056-576) for layer 2; this is because one must
139 stagger input data by 256 samples to synchronize psychoacoustic model with
140 filter bank outputs, then stagger so that center of 1024 FFT window lines
141 up with center of 576 "new" audio samples.
143 flush = 384*3.0/2.0; = 576
145 sync_flush = syncsize - flush; 480
147 for (j
= 0; j
< 480; j
++) {
148 savebuf
[j
] = savebuf
[j
+ 576];
149 wsamp_r
[j
] = window
[j
] * ((FLOAT
) savebuf
[j
]);
151 for (; j
< 1024; j
++) {
152 savebuf
[j
] = *buffer
++;
153 wsamp_r
[j
] = window
[j
] * ((FLOAT
) savebuf
[j
]);
155 for (; j
< 1056; j
++)
156 savebuf
[j
] = *buffer
++;
160 psycho_2_fft (wsamp_r
, energy
, phi
);
162 /* calculate the unpredictability measure, given energy[f] and phi[f]
163 (the age pointers [new/old/oldest] are reset automatically on the second pass */
178 for (j
= 0; j
< HBLKSIZE
; j
++) {
180 double temp1
, temp2
, temp3
;
181 r_prime
= 2.0 * r
[chn
][old
][j
] - r
[chn
][oldest
][j
];
182 phi_prime
= 2.0 * phi_sav
[chn
][old
][j
] - phi_sav
[chn
][oldest
][j
];
184 r
[chn
][new][j
] = sqrt ((double) energy
[j
]);
185 phi_sav
[chn
][new][j
] = phi
[j
];
189 r
[chn
][new][j
] * psycho_4_cos(phi
[j
]) -
190 r_prime
* psycho_4_cos(phi_prime
);
192 r
[chn
][new][j
] * psycho_4_sin(phi
[j
]) -
193 r_prime
* psycho_4_sin(phi_prime
);
194 //fprintf(stdout,"[%5.2f %5.2f] [%5.2f %5.2f]\n",temp1, mytemp1, temp2, mytemp2);
199 temp3
= r
[chn
][new][j
] + fabs ((double) r_prime
);
201 c
[j
] = sqrt (temp1
* temp1
+ temp2
* temp2
) / temp3
;
205 double temp1
, temp2
, temp3
;
206 r_prime
= 2.0 * r
[chn
][old
][j
] - r
[chn
][oldest
][j
];
207 phi_prime
= 2.0 * phi_sav
[chn
][old
][j
] - phi_sav
[chn
][oldest
][j
];
209 r
[chn
][new][j
] = sqrt ((double) energy
[j
]);
210 phi_sav
[chn
][new][j
] = phi
[j
];
214 r
[chn
][new][j
] * cos ((double) phi
[j
]) -
215 r_prime
* cos ((double) phi_prime
);
217 r
[chn
][new][j
] * sin ((double) phi
[j
]) -
218 r_prime
* sin ((double) phi_prime
);
220 temp3
= r
[chn
][new][j
] + fabs ((double) r_prime
);
222 c
[j
] = sqrt (temp1
* temp1
+ temp2
* temp2
) / temp3
;
228 /* For each partition, sum all the energy in that partition - grouped_e
229 and calculated the energy-weighted unpredictability measure - grouped_c
230 ISO 11172 Section D.2.4.e */
231 for (j
= 1; j
< CBANDS
; j
++) {
235 grouped_e
[0] = energy
[0];
236 grouped_c
[0] = energy
[0] * c
[0];
237 for (j
= 1; j
< HBLKSIZE
; j
++) {
238 grouped_e
[partition
[j
]] += energy
[j
];
239 grouped_c
[partition
[j
]] += energy
[j
] * c
[j
];
242 /* convolve the grouped energy-weighted unpredictability measure
243 and the grouped energy with the spreading function
245 for (j
= 0; j
< CBANDS
; j
++) {
248 for (k
= 0; k
< CBANDS
; k
++) {
249 if (s
[j
][k
] != 0.0) {
250 ecb
[j
] += s
[j
][k
] * grouped_e
[k
];
251 cb
[j
] += s
[j
][k
] * grouped_c
[k
];
255 cb
[j
] = cb
[j
] / ecb
[j
];
260 /* Convert cb to tb (the tonality index)
261 ISO11172 SecD.2.4.g */
262 for (i
=0;i
<CBANDS
;i
++) {
265 else if (cb
[i
] > 0.5)
267 tb
[i
] = -0.301029996 - 0.434294482 * log((double) cb
[i
]);
271 /* Calculate the required SNR for each of the frequency partitions
272 ISO 11172 Sect D.2.4.h */
273 for (j
= 0; j
< CBANDS
; j
++) {
275 SNRtemp
= tmn
[j
] * tb
[j
] + NMT
* (1.0 - tb
[j
]);
276 SNR
= MAX(SNRtemp
, minval
[(int)cbval
[j
]]);
277 bc
[j
] = exp ((double) -SNR
* LN_TO_LOG10
);
280 /* Calculate the permissible noise energy level in each of the frequency
282 This section used to have pre-echo control but only for LayerI
283 ISO 11172 Sec D.2.4.k - Spread the threshold energy over FFT lines */
284 for (j
= 0; j
< CBANDS
; j
++) {
285 if (rnorm
[j
] && numlines
[j
])
286 nb
[j
] = ecb
[j
] * bc
[j
] / (rnorm
[j
] * numlines
[j
]);
291 /* ISO11172 Sec D.2.4.l - thr[] the final energy threshold of audibility */
292 for (j
= 0; j
< HBLKSIZE
; j
++)
293 thr
[j
] = MAX(nb
[partition
[j
]], ath
[j
]);
295 /* Translate the 512 threshold values to the 32 filter bands of the coder
296 Using ISO 11172 Table D.5 and Section D.2.4.n */
297 for (j
= 0; j
< 193; j
+= 16) {
299 npart
= 60802371420160.0;
301 for (k
= 0; k
< 17; k
++) {
302 if (thr
[j
+ k
] < npart
)
303 npart
= thr
[j
+ k
]; /* For WIDTH==0, find the minimum noise, and
304 later multiply by the number of indexes i.e. 17 */
305 epart
+= energy
[j
+ k
];
307 snrtmp
[run
][j
/ 16] = 4.342944819 * log((double)(epart
/(npart
*17.0)));
309 for (j
= 208; j
< (HBLKSIZE
- 1); j
+= 16) {
313 for (k
= 0; k
< 17; k
++) {
314 npart
+= thr
[j
+ k
]; /* For WIDTH==1, sum the noise */
315 epart
+= energy
[j
+ k
];
317 snrtmp
[run
][j
/ 16] = 4.342944819 * log ((double) (epart
/npart
));
321 /* Pick the maximum value of the two runs ISO 11172 Sect D.2.1 */
322 for (i
= 0; i
< 32; i
++)
323 smr
[i
] = MAX(snrtmp
[0][i
], snrtmp
[1][i
]);
327 /********************************
328 * init psycho model 2
329 ********************************/
330 void psycho_4_init (double sfreq
, options
*glopts
)
334 /* Allocate memory for all the static variables */
337 /* Set up the SIN/COS tables */
338 psycho_4_trigtable_init();
340 /* calculate HANN window coefficients */
341 for (i
= 0; i
< BLKSIZE
; i
++)
342 window
[i
] = 0.5 * (1 - cos (2.0 * PI
* (i
- 0.5) / BLKSIZE
));
344 /* For each FFT line from 0(DC) to 512(Nyquist) calculate
345 - bark : the bark value of this fft line
346 - ath : the absolute threshold of hearing for this line [ATH]
348 Since it is a 1024 point FFT, each line in the fft corresponds
349 to 1/1024 of the total frequency.
350 Line 0 should correspond to DC - which doesn't really have a ATH afaik
351 Line 1 should be 1/1024th of the Sampling Freq
352 Line 512 should be the nyquist freq */
353 for (i
=0; i
<HBLKSIZE
; i
++) {
354 FLOAT freq
= i
* sfreq
/BLKSIZE
;
355 bark
[i
] = toolame_freq2bark(freq
);
356 /* The ath tables in the dist10 code seem to be a little out of kilter.
357 they seem to start with index 0 corresponding to (sampling freq)/1024.
358 When in doubt, i'm going to assume that the dist10 code is wrong. MFC Feb2003 */
359 ath
[i
] = ATH_energy(freq
,glopts
->athlevel
);
360 //fprintf(stdout,"%.2f ",ath[i]);
364 /* Work out the partitions
365 Starting from line 0, all lines within 0.33 of the starting
366 bark are added to the same partition. When a line is greater
367 by 0.33 of a bark, start a new partition. */
368 int partition_count
= 0; /* keep a count of the partitions */
369 int cbase
= 0; /* current base index for the bark range calculation */
370 for (i
=0;i
<HBLKSIZE
;i
++) {
371 if ((bark
[i
] - bark
[cbase
]) > 0.33) { /* 1/3 critical band? */
372 /* this frequency line is too different from the starting line,
373 (in terms of the bark distance)
374 so close that previous partition, and make this line the first
375 member of the next partition */
376 cbase
= i
; /* Start the new partition from this frequency */
379 /* partition[i] tells us which partition the i'th frequency line is in */
380 partition
[i
] = partition_count
;
381 /* keep a count of how many frequency lines are in each partition */
382 numlines
[partition_count
]++;
385 /* For each partition within the frequency space,
386 calculate the average bark value - cbval [central bark value] */
387 for (i
=0;i
<HBLKSIZE
;i
++)
388 cbval
[partition
[i
]] += bark
[i
]; /* sum up all the bark values */
389 for (i
=0;i
<CBANDS
;i
++) {
390 if (numlines
[i
] != 0)
391 cbval
[i
] /= numlines
[i
]; /* divide by the number of values */
393 cbval
[i
]=0; /* this isn't a partition */
398 /* Calculate the spreading function. ISO 11172 Section D.2.3 */
399 for (i
=0;i
<CBANDS
;i
++) {
400 for (j
=0;j
<CBANDS
;j
++) {
401 s
[i
][j
] = psycho_4_spreading_function( 1.05 * (cbval
[i
] - cbval
[j
]) );
402 rnorm
[i
] += s
[i
][j
]; /* sum the spreading function values for each partition so that
403 they can be normalised later on */
407 /* Calculate Tone Masking Noise values. ISO 11172 Tables D.3.x */
408 for (j
= 0; j
< CBANDS
; j
++)
409 tmn
[j
] = MAX(15.5+cbval
[j
], 24.5);
412 if (glopts
->verbosity
> 10) {
413 /* Dump All the Values to STDOUT */
416 fprintf(stdout
,"psy model 4 init\n");
417 fprintf(stdout
,"index \tnlines \twlow \twhigh \tbval \tminval \ttmn\n");
418 for (i
=0;i
<CBANDS
;i
++)
419 if (numlines
[i
] != 0) {
421 whigh
= wlow
+ numlines
[i
] - 1;
422 fprintf(stdout
,"%i \t%i \t%i \t%i \t%5.2f \t%4.2f \t%4.2f\n",i
+1, numlines
[i
],wlow
, whigh
, cbval
[i
],minval
[(int)cbval
[i
]],tmn
[i
]);
425 fprintf(stdout
,"total lines %i\n",ntot
);
430 /* The spreading function. Values returned in units of energy
431 Argument 'bark' is the difference in bark values between the
432 centre of two partitions.
433 This has been taken from LAME. MFC Feb 2003 */
434 FLOAT8
psycho_4_spreading_function(FLOAT8 bark
) {
436 FLOAT8 tempx
,x
,tempy
,temp
;
439 /* MP3 standard actually spreads these values a little more */
440 if (tempx
>=0) tempx
*= 3;
444 if(tempx
>=0.5 && tempx
<=2.5)
447 x
= 8.0 * (temp
*temp
- 2.0 * temp
);
451 tempy
= 15.811389 + 7.5*tempx
- 17.5*sqrt(1.0+tempx
*tempx
);
453 if (tempy
<= -60.0) return 0.0;
455 tempx
= exp( (x
+ tempy
)*LN_TO_LOG10
);
458 /* I'm not sure where the magic value of 0.6609193 comes from.
459 toolame will just keep using the rnorm to normalise the spreading function
461 /* Normalization. The spreading function should be normalized so that:
464 | s3 [ bark ] d(bark) = 1
474 void psycho_4_allocmem() {
475 grouped_c
= (FLOAT
*) mem_alloc (sizeof (FCB
), "grouped_c");
476 grouped_e
= (FLOAT
*) mem_alloc (sizeof (FCB
), "grouped_e");
477 nb
= (FLOAT
*) mem_alloc (sizeof (FCB
), "nb");
478 cb
= (FLOAT
*) mem_alloc (sizeof (FCB
), "cb");
479 tb
= (FLOAT
*) mem_alloc (sizeof (FCB
), "tb");
480 ecb
= (FLOAT
*) mem_alloc (sizeof (FCB
), "ecb");
481 bc
= (FLOAT
*) mem_alloc (sizeof (FCB
), "bc");
482 wsamp_r
= (FLOAT
*) mem_alloc (sizeof (FBLK
), "wsamp_r");
483 phi
= (FLOAT
*) mem_alloc (sizeof (FBLK
), "phi");
484 energy
= (FLOAT
*) mem_alloc (sizeof (FBLK
), "energy");
485 c
= (FLOAT
*) mem_alloc (sizeof (FHBLK
), "c");
486 bark
= (FLOAT
*) mem_alloc (sizeof (FHBLK
), "bark");
487 thr
= (FLOAT
*) mem_alloc (sizeof (FHBLK
), "thr");
488 snrtmp
= (F32
*) mem_alloc (sizeof (F2_32
), "snrtmp");
490 numlines
= (int *) mem_alloc (sizeof (ICB
), "numlines");
491 partition
= (int *) mem_alloc (sizeof (IHBLK
), "partition");
492 cbval
= (FLOAT
*) mem_alloc (sizeof (FCB
), "cbval");
493 rnorm
= (FLOAT
*) mem_alloc (sizeof (FCB
), "rnorm");
494 window
= (FLOAT
*) mem_alloc (sizeof (FBLK
), "window");
495 ath
= (FLOAT
*) mem_alloc (sizeof (FHBLK
), "ath");
496 tmn
= (double *) mem_alloc (sizeof (DCB
), "tmn");
497 s
= (FCB
*) mem_alloc (sizeof (FCBCB
), "s");
498 lthr
= (FHBLK
*) mem_alloc (sizeof (F2HBLK
), "lthr");
499 r
= (F2HBLK
*) mem_alloc (sizeof (F22HBLK
), "r");
500 phi_sav
= (F2HBLK
*) mem_alloc (sizeof (F22HBLK
), "phi_sav");