2 Calf Box, an open source musical instrument.
3 Copyright (C) 2010-2013 Krzysztof Foltman
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3 of the License, or
8 (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 #include "config-api.h"
26 #include "sfzloader.h"
33 #define LOW_QUALITY_INTERPOLATION 0
// Gain state threaded through the resampling routines in this file: the
// current left/right gains and the amount added to each gain after every
// output frame.
// NOTE(review): the routines below also access rs->leftright and rs->offset;
// those member declarations (and the struct braces) are not present in this
// excerpt - confirm against the full file.
35 struct resampler_state
39 float lgain
, rgain
, lgain_delta
, rgain_delta
;
// NEON variant: renders a mono 16-bit sample into the interleaved stereo
// buffer rs->leftright for output frames rs->offset .. endpos-1.
//   v       - voice; bigpos/bigdelta are the fixed-point read position and
//             per-frame increment (integer frame index in the upper 32 bits,
//             fraction in the lower 32 bits)
//   rs      - output buffer, start offset and left/right gain ramp
//   srcdata - source samples, indexed by the integer part of the position
//   endpos  - one past the last output frame index to write
// Each output value is a 4-point cubic interpolation of srcdata[n .. n+3],
// multiplied by the ramping left and right gains.
// NOTE(review): original lines 73-79 (end of the loop and any write-back of
// gains, position or offset) are not present in this excerpt.
46 static inline void process_voice_mono_noloop(struct sampler_gen
*v
, struct resampler_state
*rs
, const int16_t *srcdata
, int endpos
)
// Per-lane constants for the four interpolation weights:
//   weight[k] = (t + shift1[k]) * (t + shift2[k]) * (t + shift3[k]) * scaling[k]
// The "a" vectors hold lanes 0-1 and the "b" vectors hold lanes 2-3.
48 static const float32x2_t shift1a
= {0.f
, 1.f
}, shift1b
= {1.f
, 1.f
};
49 static const float32x2_t shift2a
= {-1.f
, -1.f
}, shift2b
= {0.f
, 0.f
};
50 static const float32x2_t shift3a
= {-2.f
, -2.f
}, shift3b
= {-2.f
, -1.f
};
51 static const float32x2_t scalinga
= {-1 / 6.0, 3 / 6.0}, scalingb
= {-3 / 6.0, 1 / 6.0};
// Local copies of the read position and increment, and of the gain ramp.
52 uint64x1_t pos
= v
->bigpos
, delta
= v
->bigdelta
;
53 float32x2_t gains
= {rs
->lgain
, rs
->rgain
};
54 const float32x2_t gaindeltas
= {rs
->lgain_delta
, rs
->rgain_delta
};
55 for (uint32_t i
= rs
->offset
; i
< endpos
; i
++)
// Convert both 32-bit halves of the position to float, treating each as an
// unsigned fixed-point number with 32 fractional bits.
57 float32x2_t posposf
= vcvt_n_f32_u32(vreinterpret_u32_u64(pos
), 32);
// Load four consecutive samples starting at the integer frame index and widen
// them to 32 bits, then advance the position for the next frame.
59 int32x4_t smp
= vmovl_s16(vld1_s16(&srcdata
[pos
>> 32]));
60 pos
= vadd_u64(pos
, delta
);
// Broadcast lane 0 of posposf to both lanes.
// NOTE(review): presumably lane 0 is the fractional (low) word - this depends
// on lane order/endianness; confirm.
62 float32x2_t t2
= vdup_n_f32(posposf
[0]);
63 float32x2_t samplesa
= vcvt_f32_s32(vget_low_s32(smp
)), samplesb
= vcvt_f32_s32(vget_high_s32(smp
));
// Interpolation weights for samples 0-1 (mula) and samples 2-3 (mulb).
65 float32x2_t mula
= vmul_f32(vmul_f32(vadd_f32(t2
, shift1a
), vadd_f32(t2
, shift2a
)), vmul_f32(vadd_f32(t2
, shift3a
), scalinga
));
66 float32x2_t mulb
= vmul_f32(vmul_f32(vadd_f32(t2
, shift1b
), vadd_f32(t2
, shift2b
)), vmul_f32(vadd_f32(t2
, shift3b
), scalingb
));
// Pairwise partial sums of the weighted samples.
// NOTE: this local `v` shadows the voice parameter `v` from here on.
67 float32x2_t v
= vmla_f32(vmul_f32(samplesa
, mula
), samplesb
, mulb
);
// Adding v to its lane-swapped copy leaves the full 4-term sum in both lanes;
// lane 0 is then scaled by the left gain and lane 1 by the right gain.
68 float32x2_t result
= vmul_f32(gains
, vadd_f32(v
, vrev64_f32(v
)));
69 gains
= vadd_f32(gains
, gaindeltas
);
// Store the interleaved left/right output for frame i.
71 rs
->leftright
[2 * i
] = result
[0];
72 rs
->leftright
[2 * i
+ 1] = result
[1];
// NEON variant: renders an interleaved stereo 16-bit sample into the
// interleaved stereo buffer rs->leftright for output frames
// rs->offset .. endpos-1.
//   v       - voice; bigpos/bigdelta are the fixed-point read position and
//             per-frame increment (integer frame index in the upper 32 bits,
//             fraction in the lower 32 bits)
//   rs      - output buffer, start offset and left/right gain ramp
//   srcdata - interleaved source samples (two int16 values per frame)
//   endpos  - one past the last output frame index to write
// Left and right channels are each interpolated from four consecutive frames
// with the same cubic weights, then scaled by the ramping gains. The final
// gains are written back to rs after the loop.
// NOTE(review): original lines 114-119 (whatever follows the gain write-back)
// are not present in this excerpt.
80 static inline void process_voice_stereo_noloop(struct sampler_gen
*v
, struct resampler_state
*rs
, const int16_t *srcdata
, int endpos
)
// Per-lane constants for the four interpolation weights:
//   weight[k] = (t + shift1[k]) * (t + shift2[k]) * (t + shift3[k]) * scaling[k]
// The "a" vectors hold lanes 0-1 and the "b" vectors hold lanes 2-3.
82 static const float32x2_t shift1a
= {0.f
, 1.f
}, shift1b
= {1.f
, 1.f
};
83 static const float32x2_t shift2a
= {-1.f
, -1.f
}, shift2b
= {0.f
, 0.f
};
84 static const float32x2_t shift3a
= {-2.f
, -2.f
}, shift3b
= {-2.f
, -1.f
};
85 static const float32x2_t scalinga
= {-1 / 6.0, 3 / 6.0}, scalingb
= {-3 / 6.0, 1 / 6.0};
// Local copies of the read position and increment, and of the gain ramp.
86 uint64x1_t pos
= v
->bigpos
, delta
= v
->bigdelta
;
87 float32x2_t gains
= {rs
->lgain
, rs
->rgain
};
88 const float32x2_t gaindeltas
= {rs
->lgain_delta
, rs
->rgain_delta
};
89 for (uint32_t i
= rs
->offset
; i
< endpos
; i
++)
// Convert both 32-bit halves of the position to float, treating each as an
// unsigned fixed-point number with 32 fractional bits.
91 float32x2_t posposf
= vcvt_n_f32_u32(vreinterpret_u32_u64(pos
), 32);
// (pos >> 31) & ~1 is twice the integer frame index, i.e. the index of the
// frame's first sample in the interleaved data. vld2_s16 de-interleaves four
// frames: val[0] gets the even-indexed samples, val[1] the odd-indexed ones.
93 int16x4x2_t pp
= vld2_s16(&srcdata
[(pos
>> 31) &~ 1]);
94 pos
= vadd_u64(pos
, delta
);
95 int32x4_t smp_left
= vmovl_s16(pp
.val
[0]), smp_right
= vmovl_s16(pp
.val
[1]);
// Broadcast lane 0 of posposf to both lanes.
// NOTE(review): presumably lane 0 is the fractional (low) word - this depends
// on lane order/endianness; confirm.
97 float32x2_t t2
= vdup_n_f32(posposf
[0]);
98 float32x2_t samplesLa
= vcvt_f32_s32(vget_low_s32(smp_left
)), samplesLb
= vcvt_f32_s32(vget_high_s32(smp_left
));
99 float32x2_t samplesRa
= vcvt_f32_s32(vget_low_s32(smp_right
)), samplesRb
= vcvt_f32_s32(vget_high_s32(smp_right
));
// Interpolation weights for frames 0-1 (mula) and frames 2-3 (mulb); the same
// weights are used for both channels.
101 float32x2_t mula
= vmul_f32(vmul_f32(vadd_f32(t2
, shift1a
), vadd_f32(t2
, shift2a
)), vmul_f32(vadd_f32(t2
, shift3a
), scalinga
));
102 float32x2_t mulb
= vmul_f32(vmul_f32(vadd_f32(t2
, shift1b
), vadd_f32(t2
, shift2b
)), vmul_f32(vadd_f32(t2
, shift3b
), scalingb
));
// Pairwise partial sums of the weighted samples for each channel.
103 float32x2_t vL
= vmla_f32(vmul_f32(samplesLa
, mula
), samplesLb
, mulb
);
104 float32x2_t vR
= vmla_f32(vmul_f32(samplesRa
, mula
), samplesRb
, mulb
);
// Transpose so that val[0] = {vL[0], vR[0]} and val[1] = {vL[1], vR[1]};
// adding them yields {left sum, right sum}, which is scaled by {lgain, rgain}.
105 float32x2x2_t transposed
= vtrn_f32(vL
, vR
);
106 float32x2_t result
= vmul_f32(gains
, vadd_f32(transposed
.val
[0], transposed
.val
[1]));
107 gains
= vadd_f32(gains
, gaindeltas
);
// Store the interleaved left/right output for frame i.
109 rs
->leftright
[2 * i
] = result
[0];
110 rs
->leftright
[2 * i
+ 1] = result
[1];
// Write the ramped gains back to the resampler state.
112 rs
->lgain
= gains
[0];
113 rs
->rgain
= gains
[1];
120 #include <xmmintrin.h>
// Lane-wise constants used by the SSE routines below to build the four
// interpolation weights as
//   (t + shift1[k]) * (t + shift2[k]) * (t + shift3[k]) * scaling[k]
// The common factor of 1/6 is applied separately after the weighted samples
// have been summed. `zero` is the vector operand passed to the integer-to-float
// conversion builtin.
// NOTE(review): the V4SF type definition is not present in this excerpt.
124 static const V4SF shift1
= {0, 1, 1, 1};
125 static const V4SF shift2
= {-1, -1, 0, 0};
126 static const V4SF shift3
= {-2, -2, -2, -1};
127 static const V4SF scaling
= {-1, 3, -3, 1};
128 static const V4SF zero
= {0, 0, 0, 0};
// SSE variant: renders a mono 16-bit sample into the interleaved stereo
// buffer rs->leftright for output frames rs->offset .. endpos-1.
//   v       - voice; bigpos is the fixed-point read position (integer frame
//             index in the upper 32 bits, fraction in the lower 32 bits)
//   rs      - output buffer, start offset and left/right gain ramp; the gains
//             in rs are advanced by their deltas after every frame
//   srcdata - source samples, indexed by the integer part of the position
//   endpos  - one past the last output frame index to write
// NOTE(review): the statement that advances `pos` and any post-loop
// write-back (original lines 142-143, 148, 155-159) are not present in this
// excerpt.
131 static inline void process_voice_mono_noloop(struct sampler_gen
*v
, struct resampler_state
*rs
, const int16_t *srcdata
, int endpos
)
133 uint64_t pos
= v
->bigpos
;
// Common 1/6 factor of the cubic interpolation weights.
134 const float ffrac
= 1.0f
/ 6.0f
;
// 1 / 2^31: scales the 31-bit fraction computed below into the range [0, 1).
135 const float _scaler
= 1.f
/ (128.f
* 16777216.f
);
136 for (int i
= rs
->offset
; i
< endpos
; i
++)
138 //float t = ((pos >> 8) & 0x00FFFFFF) * scaler;
// p points at the first of the four samples used for this output frame.
139 const int16_t *p
= &srcdata
[pos
>> 32];
// Fractional position: the low 32 bits are halved (dropping the lowest bit)
// before the integer-to-float conversion, then scaled by 2^-31.
141 V4SF t2
= __builtin_ia32_cvtsi2ss(zero
, (pos
& 0xFFFFFFFF) >> 1) * _scaler
;
// Broadcast lane 0 (the fraction) to all four lanes.
144 V4SF t4
= __builtin_ia32_shufps(t2
, t2
, 0);
// Four interpolation weights, without the common 1/6 factor.
145 V4SF v4mul
= (t4
+ shift1
) * (t4
+ shift2
) * (t4
+ shift3
) * scaling
;
146 V4SF samples
= {p
[0], p
[1], p
[2], p
[3]};
147 v4mul
= __builtin_ia32_mulps(samples
, v4mul
);
// Horizontal sum of the weighted samples, scaled by 1/6.
149 float c
= (v4mul
[0] + v4mul
[1] + v4mul
[2] + v4mul
[3]) * ffrac
;
// The same interpolated value goes to both channels, each with its own gain.
151 rs
->leftright
[2 * i
] = rs
->lgain
* c
;
152 rs
->leftright
[2 * i
+ 1] = rs
->rgain
* c
;
// Advance the gain ramp by one frame.
153 rs
->lgain
+= rs
->lgain_delta
;
154 rs
->rgain
+= rs
->rgain_delta
;
// SSE variant: renders an interleaved stereo 16-bit sample into the
// interleaved stereo buffer rs->leftright for output frames
// rs->offset .. endpos-1.
//   v       - voice; bigpos is the fixed-point read position (integer frame
//             index in the upper 32 bits, fraction in the lower 32 bits)
//   rs      - output buffer, start offset and left/right gain ramp; the gains
//             in rs are advanced by their deltas after every frame
//   srcdata - interleaved source samples (two int16 values per frame)
//   endpos  - one past the last output frame index to write
// NOTE(review): the statement that advances `pos` and any post-loop
// write-back (original lines 171-172, 179, 187-193) are not present in this
// excerpt.
160 static inline void process_voice_stereo_noloop(struct sampler_gen
*v
, struct resampler_state
*rs
, const int16_t *srcdata
, int endpos
)
162 uint64_t pos
= v
->bigpos
;
// Common 1/6 factor of the cubic interpolation weights.
163 const float ffrac
= 1.0f
/ 6.0f
;
// 1 / 2^31: scales the 31-bit fraction computed below into the range [0, 1).
164 const float _scaler
= 1.f
/ (128.f
* 16777216.f
);
165 for (int i
= rs
->offset
; i
< endpos
; i
++)
167 //float t = ((pos >> 8) & 0x00FFFFFF) * scaler;
// (pos >> 31) & ~1 is twice the integer frame index, i.e. the index of the
// frame's first sample in the interleaved data.
168 const int16_t *p
= &srcdata
[(pos
>> 31) & ~1];
// Fractional position: the low 32 bits are halved (dropping the lowest bit)
// before the integer-to-float conversion, then scaled by 2^-31.
170 V4SF t2
= __builtin_ia32_cvtsi2ss(zero
, (pos
& 0xFFFFFFFF) >> 1) * _scaler
;
// Broadcast lane 0 (the fraction) to all four lanes.
173 V4SF t4
= __builtin_ia32_shufps(t2
, t2
, 0);
// Four interpolation weights, without the common 1/6 factor; shared by both
// channels.
174 V4SF v4mul
= (t4
+ shift1
) * (t4
+ shift2
) * (t4
+ shift3
) * scaling
;
// Even-indexed samples of the four frames are used for the left output.
175 V4SF samples_left
= {p
[0], p
[2], p
[4], p
[6]};
176 samples_left
= __builtin_ia32_mulps(samples_left
, v4mul
);
// Odd-indexed samples of the four frames are used for the right output.
177 V4SF samples_right
= {p
[1], p
[3], p
[5], p
[7]};
178 samples_right
= __builtin_ia32_mulps(samples_right
, v4mul
);
// Horizontal sums of the weighted samples, scaled by 1/6.
180 float cl
= (samples_left
[0] + samples_left
[1] + samples_left
[2] + samples_left
[3]) * ffrac
;
181 float cr
= (samples_right
[0] + samples_right
[1] + samples_right
[2] + samples_right
[3]) * ffrac
;
183 rs
->leftright
[2 * i
] = rs
->lgain
* cl
;
184 rs
->leftright
[2 * i
+ 1] = rs
->rgain
* cr
;
// Advance the gain ramp by one frame.
185 rs
->lgain
+= rs
->lgain_delta
;
186 rs
->rgain
+= rs
->rgain_delta
;
// Plain C variant: renders a mono 16-bit sample into the interleaved stereo
// buffer rs->leftright for output frames rs->offset .. endpos-1.
//   v       - voice; v->bigpos is the fixed-point read position (integer frame
//             index in the upper 32 bits) and is advanced by v->bigdelta after
//             every output frame
//   rs      - output buffer, start offset and left/right gain ramp; the gains
//             in rs are advanced by their deltas after every frame
//   srcdata - source samples, indexed by the integer part of the position
//   endpos  - one past the last output frame index to write
// NOTE(review): the #else/#endif lines matching the #if below and the lines
// after the loop (original lines 205, 209, 215-218) are not present in this
// excerpt.
194 static inline void process_voice_mono_noloop(struct sampler_gen
*v
, struct resampler_state
*rs
, const int16_t *srcdata
, int endpos
)
// Common 1/6 factor of the cubic interpolation weights.
196 const float ffrac
= 1.0f
/ 6.0f
;
// 1 / 2^24: scales the 24-bit fraction extracted below into the range [0, 1).
197 const float scaler
= 1.f
/ 16777216.f
;
199 for (int i
= rs
->offset
; i
< endpos
; i
++)
// t is built from bits 8..31 of the position, i.e. the top 24 bits of the
// 32-bit fractional part.
201 float t
= ((v
->bigpos
>> 8) & 0x00FFFFFF) * scaler
;
// p points at the first of the four samples used for this output frame.
202 const int16_t *p
= &srcdata
[v
->bigpos
>> 32];
203 #if LOW_QUALITY_INTERPOLATION
// Linear interpolation between the two middle samples.
204 float c
= (1.f
- t
) * p
[1] + t
* p
[2];
// Cubic interpolation: weights for p[0] and p[1] are precomputed as b0 and b1,
// those for p[2] and p[3] are written inline; all share the 1/6 factor.
206 float b0
= -t
*(t
-1.f
)*(t
-2.f
);
207 float b1
= 3.f
*(t
+1.f
)*(t
-1.f
)*(t
-2.f
);
208 float c
= (b0
* p
[0] + b1
* p
[1] - 3.f
*(t
+1.f
)*t
*(t
-2.f
) * p
[2] + (t
+1.f
)*t
*(t
-1.f
) * p
[3]) * ffrac
;
// The same interpolated value goes to both channels, each with its own gain.
210 rs
->leftright
[2 * i
] = rs
->lgain
* c
;
211 rs
->leftright
[2 * i
+ 1] = rs
->rgain
* c
;
// Advance the gain ramp and the read position by one output frame.
212 rs
->lgain
+= rs
->lgain_delta
;
213 rs
->rgain
+= rs
->rgain_delta
;
214 v
->bigpos
+= v
->bigdelta
;
// Plain C variant: renders an interleaved stereo 16-bit sample into the
// interleaved stereo buffer rs->leftright for output frames
// rs->offset .. endpos-1.
//   v       - voice; v->bigpos is the fixed-point read position (integer frame
//             index in the upper 32 bits) and is advanced by v->bigdelta after
//             every output frame
//   rs      - output buffer, start offset and left/right gain ramp; the gains
//             in rs are advanced by their deltas after every frame
//   srcdata - interleaved source samples (two int16 values per frame)
//   endpos  - one past the last output frame index to write
// NOTE(review): the #else/#endif lines matching the #if below and the lines
// after the loop (original lines 231, 236, 242-247) are not present in this
// excerpt.
219 static inline void process_voice_stereo_noloop(struct sampler_gen
*v
, struct resampler_state
*rs
, const int16_t *srcdata
, int endpos
)
// Common 1/6 factor of the cubic interpolation weights.
221 const float ffrac
= 1.0f
/ 6.0f
;
// 1 / 2^24: scales the 24-bit fraction extracted below into the range [0, 1).
222 const float scaler
= 1.f
/ 16777216.f
;
224 for (int i
= rs
->offset
; i
< endpos
; i
++)
// t is built from bits 8..31 of the position, i.e. the top 24 bits of the
// 32-bit fractional part.
226 float t
= ((v
->bigpos
>> 8) & 0x00FFFFFF) * scaler
;
// (bigpos >> 31) & ~1 is twice the integer frame index, i.e. the index of the
// frame's first sample in the interleaved data.
227 const int16_t *p
= &srcdata
[(v
->bigpos
>> 31) & ~1];
228 #if LOW_QUALITY_INTERPOLATION
// Linear interpolation between the two middle frames, per channel.
229 float c0
= (1.f
- t
) * p
[2] + t
* p
[4];
230 float c1
= (1.f
- t
) * p
[3] + t
* p
[5];
// Cubic interpolation: weights for the first two frames are precomputed as b0
// and b1, those for the last two are written inline; all share the 1/6 factor.
232 float b0
= -t
*(t
-1.f
)*(t
-2.f
);
233 float b1
= 3.f
*(t
+1.f
)*(t
-1.f
)*(t
-2.f
);
// c0 uses the even-indexed samples, c1 the odd-indexed samples.
234 float c0
= (b0
* p
[0] + b1
* p
[2] - 3.f
*(t
+1.f
)*t
*(t
-2.f
) * p
[4] + (t
+1.f
)*t
*(t
-1.f
) * p
[6]) * ffrac
;
235 float c1
= (b0
* p
[1] + b1
* p
[3] - 3.f
*(t
+1.f
)*t
*(t
-2.f
) * p
[5] + (t
+1.f
)*t
*(t
-1.f
) * p
[7]) * ffrac
;
237 rs
->leftright
[2 * i
] = rs
->lgain
* c0
;
238 rs
->leftright
[2 * i
+ 1] = rs
->rgain
* c1
;
// Advance the gain ramp and the read position by one output frame.
239 rs
->lgain
+= rs
->lgain_delta
;
240 rs
->rgain
+= rs
->rgain_delta
;
241 v
->bigpos
+= v
->bigdelta
;
// Renders as many output frames as fit in the rest of the current block
// without reading at or past usable_sample_end, using the stereo or mono
// kernel depending on v->mode.
//   v                 - voice; supplies bigpos, bigdelta and mode
//   rs                - resampler state (output buffer, offset, gains)
//   srcdata           - buffer whose first frame corresponds to source frame
//                       pos_offset
//   pos_offset        - source frame index of srcdata[0]; srcdata is rebased
//                       by this amount so the kernels can index it with the
//                       absolute position
//   usable_sample_end - source frame index that must not be reached
// Returns the change in the integer part of v->bigpos across the call.
// NOTE(review): the braces and the `else` between the two kernel calls
// (original line 262) are not present in this excerpt.
248 static inline uint32_t process_voice_noloop(struct sampler_gen
*v
, struct resampler_state
*rs
, const int16_t *srcdata
, uint32_t pos_offset
, uint32_t usable_sample_end
)
// Start from the number of frames left in the output block.
250 uint32_t out_frames
= CBOX_BLOCK_SIZE
- rs
->offset
;
// End position expressed in the same fixed-point format as bigpos.
252 uint64_t sample_end64
= ((uint64_t)usable_sample_end
) << 32;
253 // Check how many frames can be written to output buffer without going
254 // past usable_sample_end.
255 if (__builtin_expect(v
->bigpos
+ (out_frames
- 1) * v
->bigdelta
>= sample_end64
, 0))
256 out_frames
= (sample_end64
- v
->bigpos
) / v
->bigdelta
+ 1;
258 assert(out_frames
> 0 && out_frames
<= CBOX_BLOCK_SIZE
- rs
->offset
);
// Remember the integer position so the advance can be returned.
259 uint32_t oldpos
= v
->bigpos
>> 32;
260 if (v
->mode
== spt_stereo16
)
// Stereo data has two samples per frame, hence the doubled rebase offset.
261 process_voice_stereo_noloop(v
, rs
, srcdata
- (pos_offset
<< 1), rs
->offset
+ out_frames
);
263 process_voice_mono_noloop(v
, rs
, srcdata
- pos_offset
, rs
->offset
+ out_frames
);
264 return (v
->bigpos
>> 32) - oldpos
;
// Fills the output block (until rs->offset reaches CBOX_BLOCK_SIZE) from an
// in-memory sample, handling the loop/end point.
// Away from the loop end it plays directly from v->sample_data up to
// loop_edge. Once the position is within MAX_INTERPOLATION_ORDER frames of
// loop_end it plays from v->scratch (rebased to loop_edge) up to loop_end.
// When the position has passed loop_end it either deactivates the voice
// (no loop, or loop count reached) or rewinds by the loop length.
// NOTE(review): braces, any early exits after setting spt_inactive, and any
// play_count update (original lines 283-302, 306-307) are not present in this
// excerpt, so the exact nesting cannot be confirmed here.
267 static void process_voice_withloop(struct sampler_gen
*v
, struct resampler_state
*rs
)
269 // This is the first frame where interpolation will cross the loop boundary
270 uint32_t loop_end
= v
->loop_end
;
271 uint32_t loop_edge
= loop_end
- MAX_INTERPOLATION_ORDER
;
273 while ( rs
->offset
< CBOX_BLOCK_SIZE
) {
// Integer frame index of the current read position.
274 uint64_t startframe
= v
->bigpos
>> 32;
// Defaults: play straight from the sample data, stopping at loop_edge.
276 int16_t *source_data
= v
->sample_data
;
277 uint32_t source_offset
= 0;
278 uint32_t usable_sample_end
= loop_edge
;
279 // if the first frame to play is already within 3 frames of loop end
280 // (we need consecutive 4 frames for cubic interpolation) then
281 // "straighten out" the area around the loop, and play that
282 if (__builtin_expect(startframe
>= loop_edge
, 0))
284 // if fully past the loop end, then it's normal wraparound
285 // (or end of the sample if not looping)
286 if (startframe
>= loop_end
)
// loop_start of (uint32_t)-1 marks a sample without a loop.
288 if (v
->loop_start
== (uint32_t)-1)
290 v
->mode
= spt_inactive
;
// A non-zero loop_count limits how many times the loop is played.
294 if (v
->loop_count
&& v
->play_count
>= v
->loop_count
)
296 v
->mode
= spt_inactive
;
// Rewind the position by the loop length.
299 v
->bigpos
-= (uint64_t)(loop_end
- v
->loop_start
) << 32;
// Near the loop end: read from the scratch buffer, whose first frame
// corresponds to source frame loop_edge.
303 usable_sample_end
= loop_end
;
304 source_data
= v
->scratch
;
305 source_offset
= loop_edge
;
308 process_voice_noloop(v
, rs
, source_data
, source_offset
, usable_sample_end
);
// Fills the output block from a streamed sample, consuming at most `limit`
// source frames (a limit of (uint32_t)-1 disables the cap applied to
// usable_sample_end).
// Playback reads from v->sample_data until the position passes the loop end,
// after which v->in_streaming_buffer is set and data is read from
// v->streaming_buffer, treated as a loop from 0 to v->streaming_buffer_frames.
// Near the end of the current region a local scratch buffer is filled with
// the last frames before the end followed by the frames that come next, so
// the interpolation kernels can read across the boundary linearly.
// Frames consumed are accumulated in v->consumed; any excess over `limit` is
// carried to the next call in v->consumed_credit.
// NOTE(review): braces, `else` branches and early exits (e.g. original lines
// 319, 348, 353, 368, 395-399) are not present in this excerpt, so the exact
// nesting cannot be confirmed here.
312 static void process_voice_streaming(struct sampler_gen
*v
, struct resampler_state
*rs
, uint32_t limit
)
// First settle any over-consumption carried over from a previous call by
// deducting it from this call's limit.
314 if (v
->consumed_credit
> 0)
316 if (v
->consumed_credit
>= limit
)
318 v
->consumed_credit
-= limit
;
321 limit
-= v
->consumed_credit
;
322 v
->consumed_credit
= 0;
324 // This is the first frame where interpolation will cross the loop boundary
// Room for two halves of MAX_INTERPOLATION_ORDER frames, two samples per frame.
325 int16_t scratch
[2 * MAX_INTERPOLATION_ORDER
* 2];
327 while ( limit
&& rs
->offset
< CBOX_BLOCK_SIZE
) {
// Integer frame index of the current read position.
328 uint64_t startframe
= v
->bigpos
>> 32;
// Select the active region: the preloaded sample data with the voice's own
// loop points, or the streaming buffer treated as a loop over its whole length.
330 int16_t *source_data
= v
->in_streaming_buffer
? v
->streaming_buffer
: v
->sample_data
;
331 uint32_t loop_start
= v
->in_streaming_buffer
? 0 : v
->loop_start
;
332 uint32_t loop_end
= v
->in_streaming_buffer
? v
->streaming_buffer_frames
: v
->loop_end
;
333 uint32_t loop_edge
= loop_end
- MAX_INTERPOLATION_ORDER
;
334 uint32_t source_offset
= 0;
335 uint32_t usable_sample_end
= loop_edge
;
336 // if the first frame to play is already within 3 frames of loop end
337 // (we need consecutive 4 frames for cubic interpolation) then
338 // "straighten out" the area around the loop, and play that
339 if (startframe
>= loop_edge
)
341 // if fully past the loop end, then it's normal wraparound
342 // (or end of the sample if not looping)
343 if (startframe
>= loop_end
)
// loop_start of (uint32_t)-1 marks a sample without a loop.
345 if (v
->loop_start
== (uint32_t)-1)
347 v
->mode
= spt_inactive
;
// Rewind the position by the length of the active region.
350 v
->bigpos
-= (uint64_t)(loop_end
- loop_start
) << 32;
351 if (v
->prefetch_only_loop
)
352 v
->consumed
-= (loop_end
- loop_start
);
// From now on read from the streaming buffer.
354 v
->in_streaming_buffer
= TRUE
;
// Stereo data has two samples per frame, so sample indices are frame << 1.
358 int shift
= (v
->mode
== spt_stereo16
) ? 1 : 0;
360 // 'linearize' the virtual circular buffer - write 3 (or N) frames before end of the loop
361 // and 3 (N) frames at the start of the loop, and play it; in rare cases this will need to be
362 // repeated twice if output write pointer is close to CBOX_BLOCK_SIZE or playback rate is very low,
// Number of int16 samples in each half of the scratch buffer.
364 uint32_t halfscratch
= MAX_INTERPOLATION_ORDER
<< shift
;
// First half: the frames from loop_edge up to the end of the active region.
365 memcpy(&scratch
[0], &source_data
[loop_edge
<< shift
], halfscratch
* sizeof(int16_t) );
// Second half: zeros when there is no loop, otherwise frames taken from the
// streaming buffer at v->loop_start.
366 if (v
->loop_start
== (uint32_t)-1)
367 memset(scratch
+ halfscratch
, 0, halfscratch
* sizeof(int16_t));
369 memcpy(scratch
+ halfscratch
, &v
->streaming_buffer
[v
->loop_start
<< shift
], halfscratch
* sizeof(int16_t));
// Play from the scratch buffer, whose first frame corresponds to source frame
// loop_edge, up to loop_end.
371 usable_sample_end
= loop_end
;
372 source_data
= scratch
;
373 source_offset
= loop_edge
;
// Do not let this pass read more than `limit` frames ahead of startframe.
375 if (limit
!= (uint32_t)-1 && usable_sample_end
- startframe
> limit
)
376 usable_sample_end
= startframe
+ limit
;
378 uint32_t consumed
= process_voice_noloop(v
, rs
, source_data
, source_offset
, usable_sample_end
);
379 if (consumed
> limit
)
381 // The number of frames 'consumed' may be greater than the amount
382 // available because of sample-skipping (at least that's the only
383 // *legitimate* reason). This should be accounted for in the
384 // consumed sample counter (hence temporary storage of the
385 // 'buffer overconsumption' in the consumed_credit field), but is not
386 // actually causing any use of missing data, as the missing samples
387 // have been skipped.
388 assert(v
->consumed_credit
== 0);
389 v
->consumed_credit
= consumed
- limit
;
390 assert (v
->consumed_credit
<= 1 + (v
->bigdelta
>> 32));
// Account for the frames consumed by this pass.
393 v
->consumed
+= consumed
;
394 if (consumed
< limit
)
// Resets a voice generator to its idle state: marks it inactive, clears the
// consumption credit and streaming state, and sets fadein_counter to -1.
// NOTE(review): original lines 404-408 (further assignments, if any) are not
// present in this excerpt.
401 void sampler_gen_reset(struct sampler_gen
*v
)
403 v
->mode
= spt_inactive
;
409 v
->consumed_credit
= 0;
// A NULL streaming_buffer makes sampler_gen_sample_playback use the
// non-streaming path.
410 v
->streaming_buffer
= NULL
;
411 v
->in_streaming_buffer
= FALSE
;
412 v
->prefetch_only_loop
= FALSE
;
// sampler_gen_sample_playback compares fadein_counter against -1 to decide
// whether a crossfade is in progress.
413 v
->fadein_counter
= -1.f
;
416 uint32_t sampler_gen_sample_playback(struct sampler_gen
*v
, float *leftright
, uint32_t limit
)
418 struct resampler_state rs
;
419 rs
.leftright
= leftright
;
421 rs
.lgain
= v
->last_lgain
;
422 rs
.rgain
= v
->last_rgain
;
423 rs
.lgain_delta
= (v
->lgain
- v
->last_lgain
) * (1.f
/ CBOX_BLOCK_SIZE
);
424 rs
.rgain_delta
= (v
->rgain
- v
->last_rgain
) * (1.f
/ CBOX_BLOCK_SIZE
);
425 if (v
->streaming_buffer
)
426 process_voice_streaming(v
, &rs
, limit
);
429 process_voice_withloop(v
, &rs
);
431 uint32_t written
= rs
.offset
;
433 if (!v
->streaming_buffer
)
435 v
->virtpos
+= written
* v
->virtdelta
;
436 if (v
->virtpos
!= v
->bigpos
)
438 while ((v
->virtpos
>> 32) >= v
->loop_end
&& v
->loop_start
!= -1)
439 v
->virtpos
-= ((uint64_t)(v
->loop_end
- v
->loop_start
)) << 32;
442 if (v
->fadein_counter
== -1 && fabs((v
->bigpos
- v
->virtpos
) / (65536.0 * 65536.0)) > v
->stretching_jump
)
444 int64_t jump
= (int64_t)(v
->stretching_jump
* 65536.0 * 65536.0);
445 int64_t newpos
= v
->bigpos
> v
->virtpos
? v
->bigpos
- jump
: v
->bigpos
+ jump
;
448 // XXXKF beware of extremely short loops
449 while ((newpos
>> 32) >= v
->loop_end
&& v
->loop_start
!= -1)
450 newpos
-= ((uint64_t)(v
->loop_end
- v
->loop_start
)) << 32;
451 if ((newpos
>> 32) >= v
->cur_sample_end
- 4)
452 newpos
= ((uint64_t)v
->cur_sample_end
- 4)<< 32;
453 v
->fadein_pos
= newpos
;
454 v
->fadein_counter
= 0;
456 else if (v
->fadein_counter
!= -1)
458 float leftright_fadein
[2 * CBOX_BLOCK_SIZE
];
461 rs
.leftright
= leftright_fadein
;
462 rs
.lgain
= v
->last_lgain
;
463 rs
.rgain
= v
->last_rgain
;
465 uint64_t oldpos
= v
->bigpos
;
466 v
->bigpos
= v
->fadein_pos
;
467 process_voice_withloop(v
, &rs
);
468 v
->fadein_pos
= v
->bigpos
;
471 uint32_t written2
= rs
.offset
;
473 // XXXKF not the best set of special cases
475 if (written2
> written
)
477 for (i
= 2 * written
; i
< 2 * written2
; i
+= 2)
478 leftright
[i
] = leftright
[i
+ 1] = 0.f
;
481 if (written2
< written
)
483 for (i
= 2 * written2
; i
< 2 * written
; i
+= 2)
484 leftright_fadein
[i
] = leftright_fadein
[i
+ 1] = 0.f
;
487 float cnt
= v
->fadein_counter
;
488 float scl
= v
->bigdelta
/ (v
->stretching_crossfade
* v
->virtdelta
);
489 for (i
= 0; i
< 2 * written2
; i
+= 2)
491 leftright
[i
] += (leftright_fadein
[i
] - leftright
[i
]) * cnt
;
492 leftright
[i
+ 1] += (leftright_fadein
[i
+ 1] - leftright
[i
+ 1]) * cnt
;
500 v
->bigpos
= v
->fadein_pos
;
502 v
->fadein_counter
= cnt
;
505 v
->last_lgain
= v
->lgain
;
506 v
->last_rgain
= v
->rgain
;