25 |
|
|
26 |
#include <math.h> |
#include <math.h> |
27 |
|
|
28 |
|
#include "../../common/global.h" |
29 |
|
|
30 |
/// ln(2) / 2 |
/// ln(2) / 2 |
31 |
#define LN_2_2 0.34657359f |
#define LN_2_2 0.34657359f |
32 |
|
|
46 |
* between multiple filters. |
* between multiple filters. |
47 |
*/ |
*/ |
48 |
struct biquad_param_t {
	bq_t b0; ///< feedforward coefficient for x[n]
	bq_t b1; ///< feedforward coefficient for x[n-1]
	bq_t b2; ///< feedforward coefficient for x[n-2]
	bq_t a1; ///< feedback coefficient for y[n-1]
	bq_t a2; ///< feedback coefficient for y[n-2]
	// NOTE(review): member order matters — the SSE path loads b1,b2,a1,a2
	// with a single movups from &param->b1 (see Apply4StepsSSE), so a1/a2
	// must directly follow b2; do not reorder these fields.
};
55 |
|
|
56 |
/** |
/** |
60 |
class BiquadFilter { |
class BiquadFilter { |
61 |
protected: |
protected: |
62 |
// following five variables are only used if no external biquad_param_t reference is used |
// following five variables are only used if no external biquad_param_t reference is used |
|
bq_t a1; |
|
|
bq_t a2; |
|
63 |
bq_t b0; |
bq_t b0; |
64 |
bq_t b1; |
bq_t b1; |
65 |
bq_t b2; |
bq_t b2; |
66 |
|
bq_t a1; |
67 |
|
bq_t a2; |
68 |
// following four variables are used to buffer the feedback |
// following four variables are used to buffer the feedback |
69 |
bq_t x1; |
bq_t x1; |
70 |
bq_t x2; |
bq_t x2; |
71 |
bq_t y1; |
bq_t y1; |
72 |
bq_t y2; |
bq_t y2; |
73 |
|
|
74 |
|
const static float fbc = 0.98; |
75 |
|
|
76 |
/** |
/** |
77 |
* Prevent \a f from going into denormal mode which would slow down |
* Prevent \a f from going into denormal mode which would slow down |
78 |
* subsequent floating point calculations, we achieve that by setting |
* subsequent floating point calculations, we achieve that by setting |
84 |
f -= 1e-18f; |
f -= 1e-18f; |
85 |
} |
} |
86 |
public: |
public: |
87 |
/// Constructs the filter with cleared feedback buffers (x1,x2,y1,y2 = 0).
inline BiquadFilter() {
	Reset(); // start from a silent filter history
}
90 |
|
|
91 |
|
void Reset() { |
92 |
x1 = 0.0f; |
x1 = 0.0f; |
93 |
x2 = 0.0f; |
x2 = 0.0f; |
94 |
y1 = 0.0f; |
y1 = 0.0f; |
123 |
return y; |
return y; |
124 |
} |
} |
125 |
|
|
126 |
|
#if ARCH_X86
// Filters four successive samples with one call (SSE fast path).
// Expects to find the four input samples in xmm0 (xmm0 stays unmodified)
// and finally leaves the four output samples in xmm6.
// Relies on biquad_param_t laying out b1,b2,a1,a2 contiguously (one movups
// from &param->b1) and on x1,x2,y1,y2 being contiguous members of this class.
// NOTE(review): the constraint operands previously read "&para;m->b1" /
// "&para;m->b0" — an HTML-entity mojibake of "&param->..." — restored here.
inline void Apply4StepsSSE(biquad_param_t* param) {
	__asm__ __volatile__ (
		"movss (%2),%%xmm4               # b0\n\t"
		"shufps $0x00,%%xmm4,%%xmm4      # copy b0 to other cells\n\t"
		"mulps %%xmm0,%%xmm4             # xmm4 = x*b0\n\t"
		"movups (%0),%%xmm2              # load b1,b2,a1,a2\n\t"
		"movups (%1),%%xmm5              # load x1,x2,y1,y2\n\t"
		/* sample 0 */
		"movaps %%xmm5,%%xmm3\n\t"
		"mulps %%xmm2,%%xmm5             # xmm5 = [b1,b2,a1,a2] * [x1,x2,y1,y2]\n\t"
		"shufps $0x0a,%%xmm3,%%xmm3      # x2 = x1, y2 = y1\n\t"
		"movss %%xmm4,%%xmm6\n\t"
		"addss %%xmm5,%%xmm6\n\t"
		"shufps $0x39,%%xmm5,%%xmm5\n\t"
		"addss %%xmm5,%%xmm6\n\t"
		"shufps $0x39,%%xmm5,%%xmm5\n\t"
		"addss %%xmm5,%%xmm6\n\t"
		"shufps $0x39,%%xmm5,%%xmm5\n\t"
		"addss %%xmm5,%%xmm6             # xmm6 = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2\n\t"
		/* sample 1 */
		"shufps $0x39,%%xmm4,%%xmm4      # rotate xmm4 down 1 cell\n\t"
		"movss %%xmm6,%%xmm3             # y1 = y\n\t"
		"shufps $0x4e,%%xmm3,%%xmm3      # rotate 2 cells\n\t"
		"movss %%xmm0,%%xmm3             # x1 = x\n\t"
		"shufps $0x93,%%xmm6,%%xmm6      # rotate output up 1 cell\n\t"
		"movaps %%xmm3,%%xmm5\n\t"
		"shufps $0x39,%%xmm0,%%xmm0      # rotate input down 1 cell\n\t"
		"mulps %%xmm2,%%xmm5             # xmm5 = [b1,b2,a1,a2] * [x1,x2,y1,y2]\n\t"
		"movss %%xmm5,%%xmm6\n\t"
		"addss %%xmm4,%%xmm6\n\t"
		"shufps $0x39,%%xmm5,%%xmm5\n\t"
		"addss %%xmm5,%%xmm6\n\t"
		"shufps $0x39,%%xmm5,%%xmm5\n\t"
		"addss %%xmm5,%%xmm6\n\t"
		"shufps $0x39,%%xmm5,%%xmm5\n\t"
		"addss %%xmm5,%%xmm6             # xmm6 = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2\n\t"
		/* sample 2 */
		"shufps $0x0a,%%xmm3,%%xmm3      # x2 = x1, y2 = y1\n\t"
		"shufps $0x39,%%xmm4,%%xmm4      # rotate xmm4 down 1 cell\n\t"
		"movss %%xmm6,%%xmm3             # y1 = y\n\t"
		"shufps $0x4e,%%xmm3,%%xmm3      # rotate 2 cells\n\t"
		"movss %%xmm0,%%xmm3             # x1 = x\n\t"
		"shufps $0x93,%%xmm6,%%xmm6      # rotate output up 1 cell\n\t"
		"movaps %%xmm3,%%xmm5\n\t"
		"shufps $0x39,%%xmm0,%%xmm0      # rotate input down 1 cell\n\t"
		"mulps %%xmm2,%%xmm5             # xmm5 = [b1,b2,a1,a2] * [x1,x2,y1,y2]\n\t"
		"movss %%xmm5,%%xmm6\n\t"
		"addss %%xmm4,%%xmm6\n\t"
		"shufps $0x39,%%xmm5,%%xmm5\n\t"
		"addss %%xmm5,%%xmm6\n\t"
		"shufps $0x39,%%xmm5,%%xmm5\n\t"
		"addss %%xmm5,%%xmm6\n\t"
		"shufps $0x39,%%xmm5,%%xmm5\n\t"
		"addss %%xmm5,%%xmm6             # xmm6 = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2\n\t"
		/* sample 3 */
		"shufps $0x0a,%%xmm3,%%xmm3      # x2 = x1, y2 = y1\n\t"
		"shufps $0x39,%%xmm4,%%xmm4      # rotate xmm4 down 1 cell\n\t"
		"movss %%xmm6,%%xmm3             # y1 = y\n\t"
		"shufps $0x4e,%%xmm3,%%xmm3      # rotate 2 cells\n\t"
		"movss %%xmm0,%%xmm3             # x1 = x\n\t"
		"shufps $0x93,%%xmm6,%%xmm6      # rotate output up 1 cell\n\t"
		"mulps %%xmm3,%%xmm2             # xmm5 = [b1,b2,a1,a2] * [x1,x2,y1,y2]\n\t"
		"shufps $0x39,%%xmm0,%%xmm0      # rotate input down 1 cell\n\t"
		"movss %%xmm2,%%xmm6\n\t"
		"shufps $0x39,%%xmm2,%%xmm2\n\t"
		"addss %%xmm2,%%xmm6\n\t"
		"shufps $0x39,%%xmm2,%%xmm2\n\t"
		"addss %%xmm2,%%xmm6\n\t"
		"shufps $0x39,%%xmm2,%%xmm2\n\t"
		"addss %%xmm2,%%xmm6\n\t"
		"addss %%xmm4,%%xmm6             # xmm6 = b0*x + b1*x1 + b2*x2 + a1*y1 + a2*y2\n\t"
		/* done */
		"shufps $0x0a,%%xmm3,%%xmm3      # x2 = x1, y2 = y1\n\t"
		"movss %%xmm6,%%xmm3             # y1 = y\n\t"
		"shufps $0x4e,%%xmm3,%%xmm3      # rotate 2 cells\n\t"
		"movss %%xmm0,%%xmm3             # x1 = x\n\t"
		"shufps $0x1b,%%xmm6,%%xmm6      # swap output to correct order\n\t"
		"shufps $0x39,%%xmm0,%%xmm0      # rotate input down 1 cell, to restore original input\n\t"
		"movups %%xmm3,(%1)              # store x1,x2,y1,y2\n\t"
		: /* no output */
		: "r" (&param->b1), /* %0 - [b1,b2,a1,a2] */
		  "r" (&x1),        /* %1 - [x1,x2,y1,y2] */
		  "r" (&param->b0)  /* %2 */
	);
}
#endif // ARCH_X86
214 |
|
|
215 |
inline bq_t ApplyFB(bq_t x, const bq_t fb) { |
inline bq_t ApplyFB(bq_t x, const bq_t fb) { |
216 |
bq_t y; |
bq_t y; |
217 |
|
|
241 |
|
|
242 |
return y; |
return y; |
243 |
} |
} |
244 |
|
|
245 |
|
#if ARCH_X86 |
246 |
|
// expects to find input in xmm0 (xmm0 stays unmodified) and finally leaves output in xmm7 |
247 |
|
inline void ApplyFB4StepsSSE(biquad_param_t* param, const bq_t &fb) { |
248 |
|
float xs, ys; |
249 |
|
float t0, t1, t2, t3, t4, t5, t6, t7, t8; // temporary stack space |
250 |
|
__asm__ __volatile__ ( |
251 |
|
/* prepare input */ |
252 |
|
"movss %15,%%xmm5\n\t" |
253 |
|
"movss %%xmm0,(%14)\n\t" |
254 |
|
/* sample 0 */ |
255 |
|
"movss %0, %%xmm3\n\t" |
256 |
|
"movss %1, %%xmm4\n\t" |
257 |
|
"mulss %%xmm4, %%xmm5\n\t" |
258 |
|
"movss %%xmm3, %2\n\t" |
259 |
|
"movss %%xmm5, %16\n\t" |
260 |
|
"mulss %%xmm3, %%xmm5\n\t" |
261 |
|
"movss %19, %%xmm2\n\t" |
262 |
|
"movss %3, %%xmm6\n\t" |
263 |
|
"movss %21, %%xmm3\n\t" |
264 |
|
"addss %%xmm5, %%xmm6\n\t" |
265 |
|
"movss %%xmm2, %%xmm5\n\t" |
266 |
|
"movss %20, %%xmm4\n\t" |
267 |
|
"movss %%xmm6, %4\n\t" |
268 |
|
"mulss %%xmm6, %%xmm5\n\t" |
269 |
|
"movss %5, %%xmm6\n\t" |
270 |
|
"movss %%xmm2, %6\n\t" |
271 |
|
"movss %%xmm4, %7\n\t" |
272 |
|
"movss %%xmm3, %%xmm2\n\t" |
273 |
|
"mulss %%xmm6, %%xmm4\n\t" |
274 |
|
"mulss %8, %%xmm2\n\t" |
275 |
|
"movss %%xmm3, %9\n\t" |
276 |
|
"addss %%xmm4, %%xmm5\n\t" |
277 |
|
"movss %18, %%xmm3\n\t" |
278 |
|
"movss %17, %%xmm4\n\t" |
279 |
|
"addss %%xmm2, %%xmm5\n\t" |
280 |
|
"movss %%xmm4, %10\n\t" |
281 |
|
"movss %%xmm3, %%xmm2\n\t" |
282 |
|
"mulss %11, %%xmm4\n\t" |
283 |
|
"mulss %12, %%xmm2\n\t" |
284 |
|
"movss %%xmm3, %13\n\t" |
285 |
|
"addss %%xmm4, %%xmm5\n\t" |
286 |
|
"movss %11, %%xmm3\n\t" |
287 |
|
"movss %4, %%xmm4\n\t" |
288 |
|
"addss %%xmm2, %%xmm5\n\t" |
289 |
|
:: "m" (y1), /* %0 */ |
290 |
|
"m" (fbc), /* %1 */ |
291 |
|
"m" (t0), /* %2 */ |
292 |
|
"m" (xs), /* %3 */ |
293 |
|
"m" (t7), /* %4 */ |
294 |
|
"m" (x1), /* %5 */ |
295 |
|
"m" (t1), /* %6 */ |
296 |
|
"m" (t2), /* %7 */ |
297 |
|
"m" (x2), /* %8 */ |
298 |
|
"m" (t3), /* %9 */ |
299 |
|
"m" (t4), /* %10 */ |
300 |
|
"m" (t0), /* %11 */ |
301 |
|
"m" (y2), /* %12 */ |
302 |
|
"m" (t5), /* %13 */ |
303 |
|
"r" (&xs), /* %14 */ |
304 |
|
"m" (fb), /* %15 */ |
305 |
|
"m" (ys), /* %16 */ |
306 |
|
"m" (param->a1), /* %17 */ |
307 |
|
"m" (param->a2), /* %18 */ |
308 |
|
"m" (param->b0), /* %19 */ |
309 |
|
"m" (param->b1), /* %20 */ |
310 |
|
"m" (param->b2) /* %21 */ |
311 |
|
); |
312 |
|
__asm__ __volatile__ ( |
313 |
|
"shufps $0x39,%%xmm0,%%xmm0 # rotate down one cell\n\t" |
314 |
|
"movss %%xmm5,%%xmm7\n\t" |
315 |
|
:: |
316 |
|
); |
317 |
|
/* sample 1 */ |
318 |
|
__asm__ __volatile__ ( |
319 |
|
"movss %0, %%xmm4\n\t" |
320 |
|
"movss %%xmm0, %%xmm3\n\t" |
321 |
|
"mulss %%xmm5, %%xmm4\n\t" |
322 |
|
"mulss %3, %%xmm6\n\t" |
323 |
|
"movss %5, %%xmm2\n\t" |
324 |
|
"addss %%xmm4, %%xmm3\n\t" |
325 |
|
"mulss %7, %%xmm2\n\t" |
326 |
|
"movss %6, %%xmm4\n\t" |
327 |
|
"movss %%xmm3, %8\n\t" |
328 |
|
"mulss %%xmm3, %%xmm4\n\t" |
329 |
|
"addss %%xmm2, %%xmm4\n\t" |
330 |
|
"movss %9, %%xmm3\n\t" |
331 |
|
"mulss %%xmm5, %%xmm3\n\t" |
332 |
|
"movss %10, %%xmm2\n\t" |
333 |
|
"addss %%xmm6, %%xmm4\n\t" |
334 |
|
"mulss %11, %%xmm2\n\t" |
335 |
|
"addss %%xmm3, %%xmm4\n\t" |
336 |
|
"addss %%xmm2, %%xmm4\n\t" |
337 |
|
:: "m" (ys), /* %0 */ |
338 |
|
"m" (fbc), /* %1 */ |
339 |
|
"m" (xs), /* %2 */ |
340 |
|
"m" (t3), /* %3 */ |
341 |
|
"m" (y2), /* %4 */ |
342 |
|
"m" (t2), /* %5 */ |
343 |
|
"m" (t1), /* %6 */ |
344 |
|
"m" (t7), /* %7 */ |
345 |
|
"m" (t8), /* %8 */ |
346 |
|
"m" (t4), /* %9 */ |
347 |
|
"m" (t5), /* %10 */ |
348 |
|
"m" (t0), /* %11 */ |
349 |
|
"m" (x2), /* %12 */ |
350 |
|
"m" (x1), /* %13 */ |
351 |
|
"m" (y1) /* %14 */ |
352 |
|
); |
353 |
|
__asm__ __volatile__ ( |
354 |
|
"shufps $0x93,%%xmm7,%%xmm7 # rotate up one cell\n\t" |
355 |
|
"shufps $0x39,%%xmm0,%%xmm0 # rotate down one cell\n\t" |
356 |
|
"movss %%xmm4,%%xmm7\n\t" |
357 |
|
:: |
358 |
|
); |
359 |
|
/* sample 2 */ |
360 |
|
__asm__ __volatile__ ( |
361 |
|
"movss %2, %%xmm6\n\t" |
362 |
|
"movss %3, %%xmm3\n\t" |
363 |
|
"mulss %%xmm4, %%xmm6\n\t" |
364 |
|
"movss %4, %%xmm2\n\t" |
365 |
|
"mulss %9, %%xmm2\n\t" |
366 |
|
"addss %%xmm0, %%xmm6\n\t" |
367 |
|
"mulss %7, %%xmm5\n\t" |
368 |
|
"mulss %%xmm6, %%xmm3\n\t" |
369 |
|
"addss %%xmm2, %%xmm3\n\t" |
370 |
|
"movss %5, %%xmm2\n\t" |
371 |
|
"mulss %8, %%xmm2\n\t" |
372 |
|
"addss %%xmm2, %%xmm3\n\t" |
373 |
|
"movss %6, %%xmm2\n\t" |
374 |
|
"mulss %%xmm4, %%xmm2\n\t" |
375 |
|
"addss %%xmm5, %%xmm2\n\t" |
376 |
|
"addss %%xmm2, %%xmm3\n\t" |
377 |
|
:: "m" (xs), /* %0 */ |
378 |
|
"m" (fb), /* %1 */ |
379 |
|
"m" (ys), /* %2 */ |
380 |
|
"m" (t1), /* %3 */ |
381 |
|
"m" (t2), /* %4 */ |
382 |
|
"m" (t3), /* %5 */ |
383 |
|
"m" (t4), /* %6 */ |
384 |
|
"m" (t5), /* %7 */ |
385 |
|
"m" (t7), /* %8 */ |
386 |
|
"m" (t8), /* %9 */ |
387 |
|
"m" (x1), /* %10 */ |
388 |
|
"m" (x2), /* %11 */ |
389 |
|
"m" (y1), /* %12 */ |
390 |
|
"m" (y2) /* %13 */ |
391 |
|
); |
392 |
|
__asm__ __volatile__ ( |
393 |
|
"shufps $0x39,%%xmm0,%%xmm0 # rotate down one cell\n\t" |
394 |
|
"shufps $0x93,%%xmm7,%%xmm7 # rotate up one cell\n\t" |
395 |
|
"movss %%xmm3,%%xmm7\n\t" |
396 |
|
:: |
397 |
|
); |
398 |
|
/* sample 3 */ |
399 |
|
__asm__ __volatile__ ( |
400 |
|
"movss %1, %%xmm2\n\t" |
401 |
|
"mulss %7, %%xmm4\n\t" |
402 |
|
"mulss %%xmm3, %%xmm2\n\t" |
403 |
|
"movss %3, %%xmm5\n\t" |
404 |
|
"movss %%xmm6, %11\n\t" |
405 |
|
"addss %%xmm0, %%xmm2\n\t" |
406 |
|
"movss %%xmm3, %13\n\t" |
407 |
|
"mulss %%xmm2, %%xmm5\n\t" |
408 |
|
"mulss %4, %%xmm6\n\t" |
409 |
|
"movss %%xmm2, %10\n\t" |
410 |
|
"addss %%xmm6, %%xmm5\n\t" |
411 |
|
"movss %5, %%xmm2\n\t" |
412 |
|
"mulss %9, %%xmm2\n\t" |
413 |
|
"mulss %6, %%xmm3\n\t" |
414 |
|
"addss %%xmm2, %%xmm5\n\t" |
415 |
|
"addss %%xmm3, %%xmm4\n\t" |
416 |
|
"addss %%xmm4, %%xmm5\n\t" |
417 |
|
"movss %%xmm5, %12\n\t" |
418 |
|
:: "m" (xs), /* %0 */ |
419 |
|
"m" (ys), /* %1 */ |
420 |
|
"m" (fbc), /* %2 */ |
421 |
|
"m" (t1), /* %3 */ |
422 |
|
"m" (t2), /* %4 */ |
423 |
|
"m" (t3), /* %5 */ |
424 |
|
"m" (t4), /* %6 */ |
425 |
|
"m" (t5), /* %7 */ |
426 |
|
"m" (t6), /* %8 */ |
427 |
|
"m" (t8), /* %9 */ |
428 |
|
"m" (x1), /* %10 */ |
429 |
|
"m" (x2), /* %11 */ |
430 |
|
"m" (y1), /* %12 */ |
431 |
|
"m" (y2) /* %13 */ |
432 |
|
); |
433 |
|
__asm__ __volatile__ ( |
434 |
|
"shufps $0x93,%%xmm7,%%xmm7 # rotate up one cell\n\t" |
435 |
|
"shufps $0x39,%%xmm0,%%xmm0 # rotate down one cell to restore original input\n\t" |
436 |
|
"movss %%xmm5,%%xmm7\n\t" |
437 |
|
"shufps $0x1b,%%xmm7,%%xmm7 # swap output to correct order\n\t" |
438 |
|
:: |
439 |
|
); |
440 |
|
} |
441 |
}; |
}; |
442 |
|
#endif // ARCH_X86 |
443 |
|
|
444 |
class LowpassFilter : public BiquadFilter { |
class LowpassFilter : public BiquadFilter { |
445 |
public: |
public: |