/***************************************************************************
 *                                                                         *
 *   LinuxSampler - modular, streaming capable sampler                     *
 *                                                                         *
 *   Copyright (C) 2003, 2004 by Benno Senoner and Christian Schoenebeck   *
 *   Copyright (C) 2005 - 2007 Christian Schoenebeck                       *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the Free Software           *
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston,                 *
 *   MA 02111-1307 USA                                                     *
 ***************************************************************************/
23 |
|
24 |
// Note: the assembly code is currently disabled, as it doesn't fit into
// the new synthesis core introduced by LS 0.4.0
26 |
|
27 |
#ifndef __LS_RESAMPLER_H__ |
28 |
#define __LS_RESAMPLER_H__ |
29 |
|
30 |
#include <cstring> // std::memcpy()

#include "../../common/global_private.h"
31 |
|
32 |
// TODO: cubic interpolation is not yet supported by the MMX/SSE(1) version though

/**
 * Compile time switch for the interpolation algorithm of the C++ resampler
 * code: a non-zero value selects linear interpolation, 0 selects cubic
 * interpolation. The default below only applies if the macro has not been
 * defined already (e.g. by the build system).
 */
#ifndef USE_LINEAR_INTERPOLATION
# define USE_LINEAR_INTERPOLATION 1 ///< set to 0 if you prefer cubic interpolation (slower, better quality)
#endif
36 |
|
37 |
namespace LinuxSampler { |
38 |
|
39 |
/** @brief Stereo sample point
 *
 * Encapsulates one stereo sample point, i.e. the signal value of a single
 * sample point for the left and for the right channel.
 */
struct stereo_sample_t {
    float left;  ///< signal value of the left channel
    float right; ///< signal value of the right channel
};
48 |
|
49 |
/** @brief Resampler Template |
50 |
* |
51 |
* This template provides pure C++ and MMX/SSE assembly implementations |
52 |
* for linear and cubic interpolation for pitching a mono or stereo |
53 |
* input signal. |
54 |
*/ |
55 |
template<bool INTERPOLATE,bool BITDEPTH24> |
56 |
class Resampler { |
57 |
public: |
58 |
inline static float GetNextSampleMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) { |
59 |
if (INTERPOLATE) return Interpolate1StepMonoCPP(pSrc, Pos, Pitch); |
60 |
else { // no pitch, so no interpolation necessary |
61 |
int pos_int = (int) *Pos; |
62 |
*Pos += 1.0; |
63 |
return pSrc [pos_int]; |
64 |
} |
65 |
} |
66 |
|
67 |
inline static stereo_sample_t GetNextSampleStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) { |
68 |
if (INTERPOLATE) return Interpolate1StepStereoCPP(pSrc, Pos, Pitch); |
69 |
else { // no pitch, so no interpolation necessary |
70 |
int pos_int = (int) *Pos; |
71 |
pos_int <<= 1; |
72 |
*Pos += 1.0; |
73 |
stereo_sample_t samplePoint; |
74 |
samplePoint.left = pSrc[pos_int]; |
75 |
samplePoint.right = pSrc[pos_int+1]; |
76 |
return samplePoint; |
77 |
} |
78 |
} |
79 |
|
80 |
#if 0 // CONFIG_ASM && ARCH_X86 |
81 |
inline static void GetNext4SamplesMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) { |
82 |
if (INTERPOLATE) Interpolate4StepsMonoMMXSSE(pSrc, Pos, Pitch); |
83 |
else { // no pitch, so no interpolation necessary |
84 |
const float __4f = 4.0f; |
85 |
__asm__ __volatile__ ( |
86 |
"movss (%1), %%xmm5 # load Pos\n\t" |
87 |
"cvtss2si %%xmm5, %%edi # int(Pos)\n\t" |
88 |
"addss %2, %%xmm5 # Pos += 4.0f\n\t" |
89 |
"movswl (%0,%%edi,2), %%eax # load sample 0\n\t" |
90 |
"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t" |
91 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
92 |
"movswl 2(%0,%%edi,2), %%edx # load sample 1\n\t" |
93 |
"cvtsi2ss %%edx, %%xmm2 # convert to float\n\t" |
94 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
95 |
"movss %%xmm5, (%1) # update Pos\n\t" |
96 |
"movswl 4(%0,%%edi,2), %%eax # load sample 2\n\t" |
97 |
"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t" |
98 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
99 |
"movswl 6(%0,%%edi,2), %%edx # load sample 3\n\t" |
100 |
"cvtsi2ss %%edx, %%xmm2 # convert to float\n\t" |
101 |
"shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t" |
102 |
:: "r" (pSrc), "r" (Pos), "m" (__4f) |
103 |
: "%eax", "%edx", "%edi" |
104 |
); |
105 |
} |
106 |
} |
107 |
|
108 |
inline static void GetNext4SamplesStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) { |
109 |
if (INTERPOLATE) { |
110 |
Interpolate4StepsStereoMMXSSE(pSrc, Pos, Pitch); |
111 |
//EMMS; |
112 |
} else { // no pitch, so no interpolation necessary |
113 |
const float __4f = 4.0f; |
114 |
__asm__ __volatile__ ( |
115 |
"movss (%1), %%xmm5 # load Pos\n\t" |
116 |
"cvtss2si %%xmm5, %%edi # int(Pos)\n\t" |
117 |
"addss %2, %%xmm5 # Pos += 4.0f\n\t" |
118 |
"movswl (%0, %%edi,4), %%eax # load sample 0 (left)\n\t" |
119 |
"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t" |
120 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
121 |
"movss %%xmm5, (%1) # update Pos\n\t" |
122 |
"movswl 2(%0, %%edi,4), %%edx # load sample 0 (left)\n\t" |
123 |
"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t" |
124 |
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
125 |
"movswl 4(%0, %%edi,4), %%eax # load sample 1 (left)\n\t" |
126 |
"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t" |
127 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
128 |
"movswl 6(%0, %%edi,4), %%edx # load sample 1 (right)\n\t" |
129 |
"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t" |
130 |
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
131 |
"movswl 8(%0, %%edi,4), %%eax # load sample 2 (left)\n\t" |
132 |
"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t" |
133 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
134 |
"movswl 10(%0, %%edi,4), %%edx # load sample 2 (right)\n\t" |
135 |
"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t" |
136 |
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
137 |
"movswl 12(%0, %%edi,4), %%eax # load sample 3 (left)\n\t" |
138 |
"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t" |
139 |
"shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t" |
140 |
"movswl 14(%0, %%edi,4), %%edx # load sample 3 (right)\n\t" |
141 |
"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t" |
142 |
"shufps $0x1b, %%xmm3, %%xmm3 # swap to correct order\n\t" |
143 |
:: "r" (pSrc), "r" (Pos), "m" (__4f) |
144 |
: "%eax", "%edx", "%edi" |
145 |
); |
146 |
} |
147 |
} |
148 |
#endif // CONFIG_ASM && ARCH_X86 |
149 |
|
150 |
protected: |
151 |
|
152 |
inline static int32_t getSample(sample_t* src, int pos) { |
153 |
if (BITDEPTH24) { |
154 |
pos *= 3; |
155 |
#if WORDS_BIGENDIAN |
156 |
unsigned char* p = (unsigned char*)src; |
157 |
return p[pos] << 8 | p[pos + 1] << 16 | p[pos + 2] << 24; |
158 |
#else |
159 |
// 24bit read optimization: |
160 |
// a misaligned 32bit read and subquent 8 bit shift is faster (on x86) than reading 3 single bytes and shifting them |
161 |
return (*((int32_t *)(&((char *)(src))[pos])))<<8; |
162 |
#endif |
163 |
} else { |
164 |
return src[pos]; |
165 |
} |
166 |
} |
167 |
|
168 |
inline static float Interpolate1StepMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) { |
169 |
int pos_int = (int) *Pos; // integer position |
170 |
float pos_fract = *Pos - pos_int; // fractional part of position |
171 |
|
172 |
#if USE_LINEAR_INTERPOLATION |
173 |
int x1 = getSample(pSrc, pos_int); |
174 |
int x2 = getSample(pSrc, pos_int + 1); |
175 |
float samplePoint = (x1 + pos_fract * (x2 - x1)); |
176 |
#else // polynomial interpolation |
177 |
float xm1 = getSample(pSrc, pos_int); |
178 |
float x0 = getSample(pSrc, pos_int + 1); |
179 |
float x1 = getSample(pSrc, pos_int + 2); |
180 |
float x2 = getSample(pSrc, pos_int + 3); |
181 |
float a = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f; |
182 |
float b = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f; |
183 |
float c = (x1 - xm1) * 0.5f; |
184 |
float samplePoint = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0; |
185 |
#endif // USE_LINEAR_INTERPOLATION |
186 |
|
187 |
*Pos += Pitch; |
188 |
return samplePoint; |
189 |
} |
190 |
|
191 |
inline static stereo_sample_t Interpolate1StepStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) { |
192 |
int pos_int = (int) *Pos; // integer position |
193 |
float pos_fract = *Pos - pos_int; // fractional part of position |
194 |
pos_int <<= 1; |
195 |
|
196 |
stereo_sample_t samplePoint; |
197 |
|
198 |
#if USE_LINEAR_INTERPOLATION |
199 |
// left channel |
200 |
int x1 = getSample(pSrc, pos_int); |
201 |
int x2 = getSample(pSrc, pos_int + 2); |
202 |
samplePoint.left = (x1 + pos_fract * (x2 - x1)); |
203 |
// right channel |
204 |
x1 = getSample(pSrc, pos_int + 1); |
205 |
x2 = getSample(pSrc, pos_int + 3); |
206 |
samplePoint.right = (x1 + pos_fract * (x2 - x1)); |
207 |
#else // polynomial interpolation |
208 |
// calculate left channel |
209 |
float xm1 = getSample(pSrc, pos_int); |
210 |
float x0 = getSample(pSrc, pos_int + 2); |
211 |
float x1 = getSample(pSrc, pos_int + 4); |
212 |
float x2 = getSample(pSrc, pos_int + 6); |
213 |
float a = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f; |
214 |
float b = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f; |
215 |
float c = (x1 - xm1) * 0.5f; |
216 |
samplePoint.left = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0; |
217 |
|
218 |
//calculate right channel |
219 |
xm1 = getSample(pSrc, pos_int + 1); |
220 |
x0 = getSample(pSrc, pos_int + 3); |
221 |
x1 = getSample(pSrc, pos_int + 5); |
222 |
x2 = getSample(pSrc, pos_int + 7); |
223 |
a = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f; |
224 |
b = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f; |
225 |
c = (x1 - xm1) * 0.5f; |
226 |
samplePoint.right = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0; |
227 |
#endif // USE_LINEAR_INTERPOLATION |
228 |
|
229 |
*Pos += Pitch; |
230 |
return samplePoint; |
231 |
} |
232 |
|
233 |
#if 0 // CONFIG_ASM && ARCH_X86 |
234 |
// TODO: no support for cubic interpolation yet |
235 |
inline static void Interpolate4StepsMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) { |
236 |
/* calculate playback position of each of the 4 samples by adding the associated pitch */ |
237 |
__asm__ __volatile__ ( |
238 |
"movss (%0),%%xmm0 # sample position of sample[0] -> xmm0[0]\n\t" |
239 |
"movss %1,%%xmm1 # copy pitch -> xmm1[0]\n\t" |
240 |
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
241 |
"addss %%xmm1,%%xmm0 # calculate sample position of sample[1]\n\t" |
242 |
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
243 |
"addss %%xmm1,%%xmm0 # calculate sample position of sample[2]\n\t" |
244 |
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
245 |
"addss %%xmm1,%%xmm0 # calculate sample position of sample[3]\n\t" |
246 |
"movss %%xmm0,%%xmm2 # xmm0[0] -> xmm2[0]\n\t" |
247 |
"addss %%xmm1,%%xmm2 # calculate initial sample position for the next 4-sample cycle\n\t" |
248 |
"movss %%xmm2,(%0) # update 'Pos'\n\t" |
249 |
"shufps $0x1b,%%xmm0,%%xmm0 # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t" |
250 |
"cvttps2pi %%xmm0,%%mm4 # int(xmm0[0-1]) -> mm4\n\t" |
251 |
"shufps $0xe4,%%xmm0,%%xmm1 # xmm0[2-3] -> xmm1[2-3]\n\t" |
252 |
"shufps $0x0e,%%xmm1,%%xmm1 # xmm1[2-3] -> xmm1[0-1]\n\t" |
253 |
"cvttps2pi %%xmm1,%%mm5 # int(xmm1[0-1]) -> mm5\n\t" |
254 |
"cvtpi2ps %%mm5,%%xmm1 # double(mm5) -> xmm1[0-1]\n\t" |
255 |
"shufps $0x44,%%xmm1,%%xmm1 # shift lower 2 FPs up to the upper 2 cells\n\t" |
256 |
"cvtpi2ps %%mm4,%%xmm1 # double(mm4) -> xmm1[0-1]\n\t" |
257 |
"subps %%xmm1,%%xmm0 # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t" |
258 |
: |
259 |
: "r" (Pos), /* %0 */ |
260 |
"m" (Pitch) /* %1 */ |
261 |
: "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */ |
262 |
"%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */ |
263 |
"mm4", /* holds integer position of sample 0-1 at the end */ |
264 |
"mm5", /* holds integer position of sample 2-3 at the end */ |
265 |
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" |
266 |
); |
267 |
/* get sample values of pSrc[pos_int] and pSrc[pos_int+1] of the 4 samples */ |
268 |
__asm__ __volatile__ ( |
269 |
"movd %%mm4,%%edi # sample position of sample 0\n\t" |
270 |
"psrlq $32,%%mm4 # mm4 >> 32\n\t" |
271 |
"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 0)\n\t" |
272 |
"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 0+1)\n\t" |
273 |
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
274 |
"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
275 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
276 |
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
277 |
"movd %%mm4,%%edi # sample position of sample 1\n\t" |
278 |
"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 1)\n\t" |
279 |
"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 1+1)\n\t" |
280 |
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
281 |
"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
282 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
283 |
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
284 |
"movd %%mm5,%%edi # sample position of sample 2\n\t" |
285 |
"psrlq $32,%%mm5 # mm5 >> 32\n\t" |
286 |
"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 2)\n\t" |
287 |
"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 2+1)\n\t" |
288 |
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
289 |
"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
290 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
291 |
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
292 |
"movd %%mm5,%%edi # sample position of sample 2\n\t" |
293 |
"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 3)\n\t" |
294 |
"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 3+1)\n\t" |
295 |
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
296 |
"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
297 |
"shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t" |
298 |
"shufps $0x1b, %%xmm3, %%xmm3 # swap to correct order\n\t" |
299 |
: /* no output */ |
300 |
: "S" (pSrc) /* %0 - sample read position */ |
301 |
: "%eax", "%ecx", /*"%edx",*/ "%edi", |
302 |
"%xmm2", /* holds pSrc[int_pos] of the 4 samples at the end */ |
303 |
"%xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */ |
304 |
"mm4", /* holds integer position of sample 0-1 at the end */ |
305 |
"mm5", /* holds integer position of sample 2-3 at the end */ |
306 |
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" |
307 |
); |
308 |
/* linear interpolation of the 4 samples simultaniously */ |
309 |
__asm__ __volatile__ ( |
310 |
"subps %%xmm2,%%xmm3 # xmm3 = pSrc[pos_int+1] - pSrc[pos_int]\n\t" |
311 |
"mulps %%xmm0,%%xmm3 # xmm3 = pos_fract * (pSrc[pos_int+1] - pSrc[pos_int])\n\t" |
312 |
"addps %%xmm3,%%xmm2 # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+1] - pSrc[pos_int]))\n\t" |
313 |
: /* no output */ |
314 |
: /* no input */ |
315 |
: "%xmm2" /* holds linear interpolated sample point (of all 4 samples) at the end */ |
316 |
); |
317 |
} |
318 |
|
319 |
// TODO: no support for cubic interpolation yet |
320 |
inline static void Interpolate4StepsStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) { |
321 |
/* calculate playback position of each of the 4 samples by adding the associated pitch */ |
322 |
__asm__ __volatile__ ( |
323 |
"movss (%0),%%xmm0 # sample position of sample[0] -> xmm0[0]\n\t" |
324 |
"movss %1,%%xmm1 # copy pitch -> xmm1[0]\n\t" |
325 |
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
326 |
"addss %%xmm1,%%xmm0 # calculate sample position of sample[1]\n\t" |
327 |
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
328 |
"addss %%xmm1,%%xmm0 # calculate sample position of sample[2]\n\t" |
329 |
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
330 |
"addss %%xmm1,%%xmm0 # calculate sample position of sample[3]\n\t" |
331 |
"movss %%xmm0,%%xmm2 # xmm0[0] -> xmm2[0]\n\t" |
332 |
"addss %%xmm1,%%xmm2 # calculate initial sample position for the next 4-sample cycle\n\t" |
333 |
"movss %%xmm2,(%0) # update 'Pos'\n\t" |
334 |
"shufps $0x1b,%%xmm0,%%xmm0 # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t" |
335 |
"cvttps2pi %%xmm0,%%mm4 # int(xmm0[0-1]) -> mm4\n\t" |
336 |
"shufps $0xe4,%%xmm0,%%xmm1 # xmm0[2-3] -> xmm1[2-3]\n\t" |
337 |
"shufps $0x0e,%%xmm1,%%xmm1 # xmm1[2-3] -> xmm1[0-1]\n\t" |
338 |
"cvttps2pi %%xmm1,%%mm5 # int(xmm1[0-1]) -> mm5\n\t" |
339 |
"cvtpi2ps %%mm5,%%xmm1 # double(mm5) -> xmm1[0-1]\n\t" |
340 |
"shufps $0x44,%%xmm1,%%xmm1 # shift lower 2 FPs up to the upper 2 cells\n\t" |
341 |
"cvtpi2ps %%mm4,%%xmm1 # double(mm4) -> xmm1[0-1]\n\t" |
342 |
"subps %%xmm1,%%xmm0 # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t" |
343 |
: |
344 |
: "r" (Pos), /* %0 */ |
345 |
"m" (Pitch) /* %1 */ |
346 |
: "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */ |
347 |
"%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */ |
348 |
"mm4", /* holds integer position of sample 0-1 at the end */ |
349 |
"mm5", /* holds integer position of sample 2-3 at the end */ |
350 |
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" |
351 |
); |
352 |
|
353 |
/* get sample values of pSrc[pos_int], pSrc[pos_int+1], pSrc[pos_int+2] and pSrc[pos_int+3] of the 4 samples */ |
354 |
__asm__ __volatile__ ( |
355 |
"xorl %%eax,%%eax # clear eax\n\t" |
356 |
"xorl %%edx,%%edx # clear edx\n\t" |
357 |
"movd %%mm4,%%edi # sample position of sample 0\n\t" |
358 |
"psrlq $32,%%mm4 # mm4 >> 32\n\t" |
359 |
"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 0)\n\t" |
360 |
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
361 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
362 |
"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 0+1)\n\t" |
363 |
"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
364 |
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
365 |
"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 0+2)\n\t" |
366 |
"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t" |
367 |
"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t" |
368 |
"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 0+3)\n\t" |
369 |
"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t" |
370 |
"movd %%mm4,%%edi # sample position of sample 1\n\t" |
371 |
"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t" |
372 |
"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 1)\n\t" |
373 |
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
374 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
375 |
"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 1+1)\n\t" |
376 |
"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
377 |
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
378 |
"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 1+2)\n\t" |
379 |
"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t" |
380 |
"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t" |
381 |
"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 1+3)\n\t" |
382 |
"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t" |
383 |
"movd %%mm5,%%edi # sample position of sample 2\n\t" |
384 |
"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t" |
385 |
"psrlq $32,%%mm5 # mm5 >> 32\n\t" |
386 |
"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 2)\n\t" |
387 |
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
388 |
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
389 |
"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 2+1)\n\t" |
390 |
"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
391 |
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
392 |
"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 2+2)\n\t" |
393 |
"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t" |
394 |
"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t" |
395 |
"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 2+3)\n\t" |
396 |
"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t" |
397 |
"movd %%mm5,%%edi # sample position of sample 3\n\t" |
398 |
"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t" |
399 |
"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 3)\n\t" |
400 |
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
401 |
"shufps $0x1b, %%xmm2, %%xmm2 # shift up\n\t" |
402 |
"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 3+1)\n\t" |
403 |
"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
404 |
"shufps $0x1b, %%xmm3, %%xmm3 # shift up\n\t" |
405 |
"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 3+2)\n\t" |
406 |
"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t" |
407 |
"shufps $0x1b, %%xmm4, %%xmm4 # swap to correct order\n\t" |
408 |
"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 3+3)\n\t" |
409 |
"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t" |
410 |
"shufps $0x1b, %%xmm5, %%xmm5 # swap to correct order\n\t" |
411 |
: /* no output */ |
412 |
: "S" (pSrc) /* %0 - sample read position */ |
413 |
: "%eax", "%edx", "%edi", |
414 |
"xmm2", /* holds pSrc[int_pos] of the 4 samples at the end */ |
415 |
"xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */ |
416 |
"xmm4", /* holds pSrc[int_pos+2] of the 4 samples at the end */ |
417 |
"xmm5", /* holds pSrc[int_pos+3] of the 4 samples at the end */ |
418 |
"mm4", /* holds integer position of sample 0-1 at the end */ |
419 |
"mm5", /* holds integer position of sample 2-3 at the end */ |
420 |
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" |
421 |
); |
422 |
/* linear interpolation of the 4 samples (left & right channel) simultaniously */ |
423 |
__asm__ __volatile__ ( |
424 |
"subps %%xmm2,%%xmm4 # xmm4 = pSrc[pos_int+2] - pSrc[pos_int] (left channel)\n\t" |
425 |
"mulps %%xmm0,%%xmm4 # xmm4 = pos_fract * (pSrc[pos_int+2] - pSrc[pos_int]) (left channel)\n\t" |
426 |
"addps %%xmm4,%%xmm2 # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+2] - pSrc[pos_int])) (left channel)\n\t" |
427 |
"subps %%xmm3,%%xmm5 # xmm5 = pSrc[pos_int+3] - pSrc[pos_int+1] (right channel)\n\t" |
428 |
"mulps %%xmm0,%%xmm5 # xmm5 = pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1]) (right channel)\n\t" |
429 |
"addps %%xmm5,%%xmm3 # xmm3 = pSrc[pos_int+1] + (pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1])) (right channel)\n\t" |
430 |
: /* no output */ |
431 |
: /* no input */ |
432 |
: "%xmm2", /* holds linear interpolated sample of left channel (of all 4 samples) at the end */ |
433 |
"%xmm3" /* holds linear interpolated sample of right channel (of all 4 samples) at the end */ |
434 |
); |
435 |
} |
436 |
#endif // CONFIG_ASM && ARCH_X86 |
437 |
}; |
438 |
|
439 |
} // namespace LinuxSampler |
440 |
|
441 |
#endif // __LS_RESAMPLER_H__ |