1 |
schoenebeck |
320 |
/*************************************************************************** |
2 |
|
|
* * |
3 |
|
|
* LinuxSampler - modular, streaming capable sampler * |
4 |
|
|
* * |
5 |
|
|
* Copyright (C) 2003, 2004 by Benno Senoner and Christian Schoenebeck * |
6 |
schoenebeck |
617 |
* Copyright (C) 2005 Christian Schoenebeck * |
7 |
schoenebeck |
320 |
* * |
8 |
|
|
* This program is free software; you can redistribute it and/or modify * |
9 |
|
|
* it under the terms of the GNU General Public License as published by * |
10 |
|
|
* the Free Software Foundation; either version 2 of the License, or * |
11 |
|
|
* (at your option) any later version. * |
12 |
|
|
* * |
13 |
|
|
* This program is distributed in the hope that it will be useful, * |
14 |
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of * |
15 |
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * |
16 |
|
|
* GNU General Public License for more details. * |
17 |
|
|
* * |
18 |
|
|
* You should have received a copy of the GNU General Public License * |
19 |
|
|
* along with this program; if not, write to the Free Software * |
20 |
|
|
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, * |
21 |
|
|
* MA 02111-1307 USA * |
22 |
|
|
***************************************************************************/ |
23 |
|
|
|
24 |
|
|
#ifndef __LS_RESAMPLER_H__ |
25 |
|
|
#define __LS_RESAMPLER_H__ |
26 |
|
|
|
27 |
|
|
#include "../../common/global.h" |
28 |
|
|
|
29 |
|
|
/*
 * Interpolation algorithm selection.
 *
 * USE_LINEAR_INTERPOLATION defaults to 1 (linear interpolation). Set it to 0
 * if you prefer cubic interpolation (slower, better quality). A definition
 * made before this point takes precedence over the default below.
 *
 * TODO: cubic interpolation is not yet supported by the MMX/SSE(1) version
 * TODO: cubic interpolation is not supported for 24 bit samples
 */
#if !defined(USE_LINEAR_INTERPOLATION)
# define USE_LINEAR_INTERPOLATION 1
#endif
34 |
|
|
|
35 |
|
|
namespace LinuxSampler { |
36 |
|
|
|
37 |
schoenebeck |
563 |
/** @brief Stereo sample point
 *
 * Holds the signal value of a single sample point for both channels of a
 * stereo signal. Used as return type of the stereo resampling functions.
 */
struct stereo_sample_t {
    float left;  ///< signal value of the left channel
    float right; ///< signal value of the right channel
};
46 |
|
|
|
47 |
schoenebeck |
563 |
/** @brief Resampler Template |
48 |
|
|
* |
49 |
|
|
* This template provides pure C++ and MMX/SSE assembly implementations |
50 |
|
|
* for linear and cubic interpolation for pitching a mono or stereo |
51 |
|
|
* input signal. |
52 |
|
|
*/ |
53 |
persson |
903 |
template<bool INTERPOLATE,bool BITDEPTH24> |
54 |
schoenebeck |
320 |
class Resampler { |
55 |
|
|
public: |
56 |
|
|
/** @brief Get next mono sample point (pure C++ implementation)
 *
 * If the template argument INTERPOLATE is set, the call is forwarded to
 * Interpolate1StepMonoCPP(), which advances @a Pos by @a Pitch. Otherwise
 * the sample point at the integer part of @a Pos is returned as is and
 * @a Pos is advanced by exactly one sample point.
 *
 * @param pSrc  - sample data to read from
 * @param Pos   - in/out: current playback position (in sample points)
 * @param Pitch - playback position increment, only used if INTERPOLATE
 * @returns signal value at the playback position before it was advanced
 */
inline static float GetNextSampleMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
    if (INTERPOLATE) return Interpolate1StepMonoCPP(pSrc, Pos, Pitch);

    // no pitch, so no interpolation necessary
    const int pos_int = (int) *Pos;
    *Pos += 1.0;
    // read through getSample() so the BITDEPTH24 case is decoded exactly
    // like in the interpolating code path (for 16 bit data this is the
    // same as pSrc[pos_int])
    return getSample(pSrc, pos_int);
}
64 |
|
|
|
65 |
|
|
/** @brief Get next stereo sample point (pure C++ implementation)
 *
 * If the template argument INTERPOLATE is set, the call is forwarded to
 * Interpolate1StepStereoCPP(), which advances @a Pos by @a Pitch.
 * Otherwise the left / right values at the integer part of @a Pos are
 * returned as is and @a Pos is advanced by exactly one sample point.
 *
 * The sample data is read interleaved: the left value of sample point n is
 * at index 2n, the right value at index 2n+1.
 *
 * @param pSrc  - interleaved stereo sample data to read from
 * @param Pos   - in/out: current playback position (in sample points)
 * @param Pitch - playback position increment, only used if INTERPOLATE
 * @returns left and right signal value at the playback position before it
 *          was advanced
 */
inline static stereo_sample_t GetNextSampleStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
    if (INTERPOLATE) return Interpolate1StepStereoCPP(pSrc, Pos, Pitch);

    // no pitch, so no interpolation necessary
    const int leftIndex = ((int) *Pos) << 1; // index of the left channel value
    *Pos += 1.0;

    // read through getSample() so the BITDEPTH24 case is decoded exactly
    // like in the interpolating code path (for 16 bit data this is the
    // same as indexing pSrc directly)
    stereo_sample_t samplePoint;
    samplePoint.left  = getSample(pSrc, leftIndex);
    samplePoint.right = getSample(pSrc, leftIndex + 1);
    return samplePoint;
}
77 |
|
|
|
78 |
schoenebeck |
617 |
#if CONFIG_ASM && ARCH_X86 |
79 |
schoenebeck |
320 |
/** @brief Get next 4 mono sample points (MMX/SSE assembly implementation)
 *
 * If the template argument INTERPOLATE is set, the call is forwarded to
 * Interpolate4StepsMonoMMXSSE(). Otherwise four consecutive 16 bit sample
 * points starting at int(Pos) are loaded, converted to single precision
 * floats and packed into register xmm2, and the position is advanced by
 * 4.0.
 *
 * Nothing is returned through C++; the result is left in xmm2 for the
 * caller to pick up.
 *
 * @param pSrc  - 16 bit sample data to read from
 * @param Pos   - in/out: points to the playback position, which is read and
 *                written as a single precision float by the assembly code
 *                (unlike the C++ variants, which take a double*)
 * @param Pitch - playback position increment, only used if INTERPOLATE
 */
inline static void GetNext4SamplesMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
    if (INTERPOLATE) Interpolate4StepsMonoMMXSSE(pSrc, Pos, Pitch);
    else { // no pitch, so no interpolation necessary
        const float posIncrement = 4.0f;
        // NOTE(review): cvtss2si converts using the current SSE rounding
        // mode, whereas the C++ code path truncates with (int) - confirm
        // that Pos is always integral on this code path.
        __asm__ __volatile__ (
            "movss (%1), %%xmm5 # load Pos\n\t"
            "cvtss2si %%xmm5, %%edi # int(Pos)\n\t"
            "addss %2, %%xmm5 # Pos += 4.0f\n\t"
            "movswl (%0,%%edi,2), %%eax # load sample 0\n\t"
            "cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
            "shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
            "movswl 2(%0,%%edi,2), %%edx # load sample 1\n\t"
            "cvtsi2ss %%edx, %%xmm2 # convert to float\n\t"
            "shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
            "movss %%xmm5, (%1) # update Pos\n\t"
            "movswl 4(%0,%%edi,2), %%eax # load sample 2\n\t"
            "cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
            "shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
            "movswl 6(%0,%%edi,2), %%edx # load sample 3\n\t"
            "cvtsi2ss %%edx, %%xmm2 # convert to float\n\t"
            "shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
            : /* no output */
            : "r" (pSrc),         /* %0 - sample read position */
              "r" (Pos),          /* %1 - playback position    */
              "m" (posIncrement)  /* %2 - constant 4.0f        */
            : "%eax", "%edx", "%edi",
              "%xmm2",  /* holds the 4 sample points at the end */
              "%xmm5",  /* scratch: holds the updated position */
              "memory"  /* *Pos is written, sample data is read through pointers */
        );
    }
}
105 |
|
|
|
106 |
|
|
/** @brief Get next 4 stereo sample points (MMX/SSE assembly implementation)
 *
 * If the template argument INTERPOLATE is set, the call is forwarded to
 * Interpolate4StepsStereoMMXSSE(). Otherwise four consecutive interleaved
 * 16 bit stereo sample points starting at int(Pos) are loaded, converted
 * to single precision floats and packed into xmm2 (left channel) and xmm3
 * (right channel), and the position is advanced by 4.0.
 *
 * Nothing is returned through C++; the result is left in xmm2 / xmm3 for
 * the caller to pick up.
 *
 * @param pSrc  - interleaved 16 bit stereo sample data to read from
 * @param Pos   - in/out: points to the playback position, which is read and
 *                written as a single precision float by the assembly code
 *                (unlike the C++ variants, which take a double*)
 * @param Pitch - playback position increment, only used if INTERPOLATE
 */
inline static void GetNext4SamplesStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
    if (INTERPOLATE) {
        Interpolate4StepsStereoMMXSSE(pSrc, Pos, Pitch);
        // NOTE(review): the helper above uses MMX registers and no EMMS is
        // issued here (the EMMS call was commented out in the original).
    } else { // no pitch, so no interpolation necessary
        const float posIncrement = 4.0f;
        // NOTE(review): cvtss2si converts using the current SSE rounding
        // mode, whereas the C++ code path truncates with (int) - confirm
        // that Pos is always integral on this code path.
        __asm__ __volatile__ (
            "movss (%1), %%xmm5 # load Pos\n\t"
            "cvtss2si %%xmm5, %%edi # int(Pos)\n\t"
            "addss %2, %%xmm5 # Pos += 4.0f\n\t"
            "movswl (%0, %%edi,4), %%eax # load sample 0 (left)\n\t"
            "cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
            "shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
            "movss %%xmm5, (%1) # update Pos\n\t"
            "movswl 2(%0, %%edi,4), %%edx # load sample 0 (left)\n\t" /* offset 2 goes to xmm3, i.e. the right channel */
            "cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
            "shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
            "movswl 4(%0, %%edi,4), %%eax # load sample 1 (left)\n\t"
            "cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
            "shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
            "movswl 6(%0, %%edi,4), %%edx # load sample 1 (right)\n\t"
            "cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
            "shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
            "movswl 8(%0, %%edi,4), %%eax # load sample 2 (left)\n\t"
            "cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
            "shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
            "movswl 10(%0, %%edi,4), %%edx # load sample 2 (right)\n\t"
            "cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
            "shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
            "movswl 12(%0, %%edi,4), %%eax # load sample 3 (left)\n\t"
            "cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
            "shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
            "movswl 14(%0, %%edi,4), %%edx # load sample 3 (right)\n\t"
            "cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
            "shufps $0x1b, %%xmm3, %%xmm3 # swap to correct order\n\t"
            : /* no output */
            : "r" (pSrc),         /* %0 - sample read position */
              "r" (Pos),          /* %1 - playback position    */
              "m" (posIncrement)  /* %2 - constant 4.0f        */
            : "%eax", "%edx", "%edi",
              "%xmm2",  /* holds the 4 left channel sample points at the end  */
              "%xmm3",  /* holds the 4 right channel sample points at the end */
              "%xmm5",  /* scratch: holds the updated position */
              "memory"  /* *Pos is written, sample data is read through pointers */
        );
    }
}
146 |
schoenebeck |
617 |
#endif // CONFIG_ASM && ARCH_X86 |
147 |
schoenebeck |
320 |
|
148 |
|
|
protected: |
149 |
|
|
|
150 |
persson |
903 |
/** @brief Read one sample value from the sample data
 *
 * With the template argument BITDEPTH24 set, @a src is treated as a packed
 * sequence of 3 byte values: the three bytes at byte offset pos*3 are
 * placed into bits 8..31 of the result (the byte at the lowest address
 * being the least significant one), the lowest 8 bits of the result are
 * zero. Otherwise src[pos] is returned unmodified.
 *
 * @param src - sample data to read from
 * @param pos - index of the value to read (in values, not in bytes)
 * @returns value at index @a pos
 */
static int getSample(sample_t* src, int pos) {
    if (BITDEPTH24) {
        const unsigned char* p = (const unsigned char*) src + pos * 3;
        // assemble the value unsigned: shifting a promoted int by 24 would
        // move a set top bit into the sign bit of a signed type
        const unsigned int bits = (unsigned int) p[0] << 8  |
                                  (unsigned int) p[1] << 16 |
                                  (unsigned int) p[2] << 24;
        return (int) bits;
    }
    return src[pos];
}
159 |
|
|
|
160 |
schoenebeck |
320 |
/** @brief Interpolate one mono sample point (pure C++ implementation)
 *
 * Calculates the signal value at the fractional playback position @a Pos
 * and advances @a Pos by @a Pitch afterwards. Depending on
 * USE_LINEAR_INTERPOLATION either linear interpolation between the two
 * neighbouring sample points or cubic interpolation over four consecutive
 * sample points is used. The cubic variant reads pSrc directly, i.e. it
 * does not go through getSample().
 *
 * @param pSrc  - sample data to read from
 * @param Pos   - in/out: current playback position (in sample points)
 * @param Pitch - playback position increment
 * @returns interpolated signal value at the playback position before it
 *          was advanced
 */
inline static float Interpolate1StepMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
    int   pos_int   = (int) *Pos;     // integer position
    float pos_fract = *Pos - pos_int; // fractional part of position

    #if USE_LINEAR_INTERPOLATION
        // The operands are converted to float before subtracting: the
        // values delivered by getSample() for BITDEPTH24 span the whole
        // int range, so an integer difference could overflow.
        float x1 = getSample(pSrc, pos_int);
        float x2 = getSample(pSrc, pos_int + 1);
        float samplePoint = (x1 + pos_fract * (x2 - x1));
    #else // polynomial interpolation
        float xm1 = pSrc[pos_int];
        float x0  = pSrc[pos_int+1];
        float x1  = pSrc[pos_int+2];
        float x2  = pSrc[pos_int+3];
        float a   = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
        float b   = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
        float c   = (x1 - xm1) * 0.5f;
        float samplePoint = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;
    #endif // USE_LINEAR_INTERPOLATION

    *Pos += Pitch;
    return samplePoint;
}
182 |
|
|
|
183 |
|
|
/** @brief Interpolate one stereo sample point (pure C++ implementation)
 *
 * Calculates the left and right signal value at the fractional playback
 * position @a Pos and advances @a Pos by @a Pitch afterwards. The sample
 * data is read interleaved (left value at even, right value at odd
 * indices). Depending on USE_LINEAR_INTERPOLATION either linear
 * interpolation between the two neighbouring sample points or cubic
 * interpolation over four consecutive sample points is used. The cubic
 * variant reads pSrc directly, i.e. it does not go through getSample().
 *
 * @param pSrc  - interleaved stereo sample data to read from
 * @param Pos   - in/out: current playback position (in sample points)
 * @param Pitch - playback position increment
 * @returns interpolated left and right signal value at the playback
 *          position before it was advanced
 */
inline static stereo_sample_t Interpolate1StepStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
    int   pos_int   = (int) *Pos;     // integer position
    float pos_fract = *Pos - pos_int; // fractional part of position
    pos_int <<= 1;                    // index of the left channel value

    stereo_sample_t samplePoint;

    #if USE_LINEAR_INTERPOLATION
        // The operands are converted to float before subtracting: the
        // values delivered by getSample() for BITDEPTH24 span the whole
        // int range, so an integer difference could overflow.
        // left channel
        float x1 = getSample(pSrc, pos_int);
        float x2 = getSample(pSrc, pos_int + 2);
        samplePoint.left  = (x1 + pos_fract * (x2 - x1));
        // right channel
        x1 = getSample(pSrc, pos_int + 1);
        x2 = getSample(pSrc, pos_int + 3);
        samplePoint.right = (x1 + pos_fract * (x2 - x1));
    #else // polynomial interpolation
        // calculate left channel
        float xm1 = pSrc[pos_int];
        float x0  = pSrc[pos_int+2];
        float x1  = pSrc[pos_int+4];
        float x2  = pSrc[pos_int+6];
        float a   = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
        float b   = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
        float c   = (x1 - xm1) * 0.5f;
        samplePoint.left = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;

        // calculate right channel
        xm1 = pSrc[pos_int+1];
        x0  = pSrc[pos_int+3];
        x1  = pSrc[pos_int+5];
        x2  = pSrc[pos_int+7];
        a   = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
        b   = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
        c   = (x1 - xm1) * 0.5f;
        samplePoint.right = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;
    #endif // USE_LINEAR_INTERPOLATION

    *Pos += Pitch;
    return samplePoint;
}
224 |
|
|
|
225 |
schoenebeck |
617 |
#if CONFIG_ASM && ARCH_X86 |
226 |
schoenebeck |
320 |
// TODO: no support for cubic interpolation yet |
227 |
|
|
inline static void Interpolate4StepsMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) { |
228 |
|
|
/* calculate playback position of each of the 4 samples by adding the associated pitch */ |
229 |
|
|
__asm__ __volatile__ ( |
230 |
|
|
"movss (%0),%%xmm0 # sample position of sample[0] -> xmm0[0]\n\t" |
231 |
|
|
"movss %1,%%xmm1 # copy pitch -> xmm1[0]\n\t" |
232 |
|
|
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
233 |
|
|
"addss %%xmm1,%%xmm0 # calculate sample position of sample[1]\n\t" |
234 |
|
|
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
235 |
|
|
"addss %%xmm1,%%xmm0 # calculate sample position of sample[2]\n\t" |
236 |
|
|
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
237 |
|
|
"addss %%xmm1,%%xmm0 # calculate sample position of sample[3]\n\t" |
238 |
|
|
"movss %%xmm0,%%xmm2 # xmm0[0] -> xmm2[0]\n\t" |
239 |
|
|
"addss %%xmm1,%%xmm2 # calculate initial sample position for the next 4-sample cycle\n\t" |
240 |
|
|
"movss %%xmm2,(%0) # update 'Pos'\n\t" |
241 |
|
|
"shufps $0x1b,%%xmm0,%%xmm0 # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t" |
242 |
|
|
"cvttps2pi %%xmm0,%%mm4 # int(xmm0[0-1]) -> mm4\n\t" |
243 |
|
|
"shufps $0xe4,%%xmm0,%%xmm1 # xmm0[2-3] -> xmm1[2-3]\n\t" |
244 |
|
|
"shufps $0x0e,%%xmm1,%%xmm1 # xmm1[2-3] -> xmm1[0-1]\n\t" |
245 |
|
|
"cvttps2pi %%xmm1,%%mm5 # int(xmm1[0-1]) -> mm5\n\t" |
246 |
|
|
"cvtpi2ps %%mm5,%%xmm1 # double(mm5) -> xmm1[0-1]\n\t" |
247 |
|
|
"shufps $0x44,%%xmm1,%%xmm1 # shift lower 2 FPs up to the upper 2 cells\n\t" |
248 |
|
|
"cvtpi2ps %%mm4,%%xmm1 # double(mm4) -> xmm1[0-1]\n\t" |
249 |
|
|
"subps %%xmm1,%%xmm0 # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t" |
250 |
|
|
: |
251 |
|
|
: "r" (Pos), /* %0 */ |
252 |
|
|
"m" (Pitch) /* %1 */ |
253 |
|
|
: "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */ |
254 |
|
|
"%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */ |
255 |
|
|
"mm4", /* holds integer position of sample 0-1 at the end */ |
256 |
|
|
"mm5", /* holds integer position of sample 2-3 at the end */ |
257 |
|
|
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" |
258 |
|
|
); |
259 |
|
|
/* get sample values of pSrc[pos_int] and pSrc[pos_int+1] of the 4 samples */ |
260 |
|
|
__asm__ __volatile__ ( |
261 |
|
|
"movd %%mm4,%%edi # sample position of sample 0\n\t" |
262 |
|
|
"psrlq $32,%%mm4 # mm4 >> 32\n\t" |
263 |
|
|
"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 0)\n\t" |
264 |
|
|
"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 0+1)\n\t" |
265 |
|
|
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
266 |
|
|
"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
267 |
|
|
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
268 |
|
|
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
269 |
|
|
"movd %%mm4,%%edi # sample position of sample 1\n\t" |
270 |
|
|
"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 1)\n\t" |
271 |
|
|
"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 1+1)\n\t" |
272 |
|
|
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
273 |
|
|
"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
274 |
|
|
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
275 |
|
|
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
276 |
|
|
"movd %%mm5,%%edi # sample position of sample 2\n\t" |
277 |
|
|
"psrlq $32,%%mm5 # mm5 >> 32\n\t" |
278 |
|
|
"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 2)\n\t" |
279 |
|
|
"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 2+1)\n\t" |
280 |
|
|
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
281 |
|
|
"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
282 |
|
|
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
283 |
|
|
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
284 |
|
|
"movd %%mm5,%%edi # sample position of sample 2\n\t" |
285 |
|
|
"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 3)\n\t" |
286 |
|
|
"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 3+1)\n\t" |
287 |
|
|
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
288 |
|
|
"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
289 |
|
|
"shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t" |
290 |
|
|
"shufps $0x1b, %%xmm3, %%xmm3 # swap to correct order\n\t" |
291 |
|
|
: /* no output */ |
292 |
|
|
: "S" (pSrc) /* %0 - sample read position */ |
293 |
|
|
: "%eax", "%ecx", /*"%edx",*/ "%edi", |
294 |
|
|
"%xmm2", /* holds pSrc[int_pos] of the 4 samples at the end */ |
295 |
|
|
"%xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */ |
296 |
|
|
"mm4", /* holds integer position of sample 0-1 at the end */ |
297 |
|
|
"mm5", /* holds integer position of sample 2-3 at the end */ |
298 |
|
|
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" |
299 |
|
|
); |
300 |
|
|
/* linear interpolation of the 4 samples simultaniously */ |
301 |
|
|
__asm__ __volatile__ ( |
302 |
|
|
"subps %%xmm2,%%xmm3 # xmm3 = pSrc[pos_int+1] - pSrc[pos_int]\n\t" |
303 |
|
|
"mulps %%xmm0,%%xmm3 # xmm3 = pos_fract * (pSrc[pos_int+1] - pSrc[pos_int])\n\t" |
304 |
|
|
"addps %%xmm3,%%xmm2 # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+1] - pSrc[pos_int]))\n\t" |
305 |
|
|
: /* no output */ |
306 |
|
|
: /* no input */ |
307 |
|
|
: "%xmm2" /* holds linear interpolated sample point (of all 4 samples) at the end */ |
308 |
|
|
); |
309 |
|
|
} |
310 |
|
|
|
311 |
|
|
// TODO: no support for cubic interpolation yet |
312 |
|
|
inline static void Interpolate4StepsStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) { |
313 |
|
|
/* calculate playback position of each of the 4 samples by adding the associated pitch */ |
314 |
|
|
__asm__ __volatile__ ( |
315 |
|
|
"movss (%0),%%xmm0 # sample position of sample[0] -> xmm0[0]\n\t" |
316 |
|
|
"movss %1,%%xmm1 # copy pitch -> xmm1[0]\n\t" |
317 |
|
|
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
318 |
|
|
"addss %%xmm1,%%xmm0 # calculate sample position of sample[1]\n\t" |
319 |
|
|
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
320 |
|
|
"addss %%xmm1,%%xmm0 # calculate sample position of sample[2]\n\t" |
321 |
|
|
"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t" |
322 |
|
|
"addss %%xmm1,%%xmm0 # calculate sample position of sample[3]\n\t" |
323 |
|
|
"movss %%xmm0,%%xmm2 # xmm0[0] -> xmm2[0]\n\t" |
324 |
|
|
"addss %%xmm1,%%xmm2 # calculate initial sample position for the next 4-sample cycle\n\t" |
325 |
|
|
"movss %%xmm2,(%0) # update 'Pos'\n\t" |
326 |
|
|
"shufps $0x1b,%%xmm0,%%xmm0 # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t" |
327 |
|
|
"cvttps2pi %%xmm0,%%mm4 # int(xmm0[0-1]) -> mm4\n\t" |
328 |
|
|
"shufps $0xe4,%%xmm0,%%xmm1 # xmm0[2-3] -> xmm1[2-3]\n\t" |
329 |
|
|
"shufps $0x0e,%%xmm1,%%xmm1 # xmm1[2-3] -> xmm1[0-1]\n\t" |
330 |
|
|
"cvttps2pi %%xmm1,%%mm5 # int(xmm1[0-1]) -> mm5\n\t" |
331 |
|
|
"cvtpi2ps %%mm5,%%xmm1 # double(mm5) -> xmm1[0-1]\n\t" |
332 |
|
|
"shufps $0x44,%%xmm1,%%xmm1 # shift lower 2 FPs up to the upper 2 cells\n\t" |
333 |
|
|
"cvtpi2ps %%mm4,%%xmm1 # double(mm4) -> xmm1[0-1]\n\t" |
334 |
|
|
"subps %%xmm1,%%xmm0 # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t" |
335 |
|
|
: |
336 |
|
|
: "r" (Pos), /* %0 */ |
337 |
|
|
"m" (Pitch) /* %1 */ |
338 |
|
|
: "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */ |
339 |
|
|
"%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */ |
340 |
|
|
"mm4", /* holds integer position of sample 0-1 at the end */ |
341 |
|
|
"mm5", /* holds integer position of sample 2-3 at the end */ |
342 |
|
|
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" |
343 |
|
|
); |
344 |
|
|
|
345 |
|
|
/* get sample values of pSrc[pos_int], pSrc[pos_int+1], pSrc[pos_int+2] and pSrc[pos_int+3] of the 4 samples */ |
346 |
|
|
__asm__ __volatile__ ( |
347 |
|
|
"xorl %%eax,%%eax # clear eax\n\t" |
348 |
|
|
"xorl %%edx,%%edx # clear edx\n\t" |
349 |
|
|
"movd %%mm4,%%edi # sample position of sample 0\n\t" |
350 |
|
|
"psrlq $32,%%mm4 # mm4 >> 32\n\t" |
351 |
|
|
"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 0)\n\t" |
352 |
|
|
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
353 |
|
|
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
354 |
|
|
"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 0+1)\n\t" |
355 |
|
|
"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
356 |
|
|
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
357 |
|
|
"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 0+2)\n\t" |
358 |
|
|
"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t" |
359 |
|
|
"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t" |
360 |
|
|
"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 0+3)\n\t" |
361 |
|
|
"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t" |
362 |
|
|
"movd %%mm4,%%edi # sample position of sample 1\n\t" |
363 |
|
|
"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t" |
364 |
|
|
"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 1)\n\t" |
365 |
|
|
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
366 |
|
|
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
367 |
|
|
"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 1+1)\n\t" |
368 |
|
|
"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
369 |
|
|
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
370 |
|
|
"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 1+2)\n\t" |
371 |
|
|
"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t" |
372 |
|
|
"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t" |
373 |
|
|
"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 1+3)\n\t" |
374 |
|
|
"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t" |
375 |
|
|
"movd %%mm5,%%edi # sample position of sample 2\n\t" |
376 |
|
|
"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t" |
377 |
|
|
"psrlq $32,%%mm5 # mm5 >> 32\n\t" |
378 |
|
|
"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 2)\n\t" |
379 |
|
|
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
380 |
|
|
"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t" |
381 |
|
|
"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 2+1)\n\t" |
382 |
|
|
"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
383 |
|
|
"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t" |
384 |
|
|
"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 2+2)\n\t" |
385 |
|
|
"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t" |
386 |
|
|
"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t" |
387 |
|
|
"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 2+3)\n\t" |
388 |
|
|
"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t" |
389 |
|
|
"movd %%mm5,%%edi # sample position of sample 3\n\t" |
390 |
|
|
"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t" |
391 |
|
|
"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 3)\n\t" |
392 |
|
|
"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t" |
393 |
|
|
"shufps $0x1b, %%xmm2, %%xmm2 # shift up\n\t" |
394 |
|
|
"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 3+1)\n\t" |
395 |
|
|
"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t" |
396 |
|
|
"shufps $0x1b, %%xmm3, %%xmm3 # shift up\n\t" |
397 |
|
|
"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 3+2)\n\t" |
398 |
|
|
"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t" |
399 |
|
|
"shufps $0x1b, %%xmm4, %%xmm4 # swap to correct order\n\t" |
400 |
|
|
"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 3+3)\n\t" |
401 |
|
|
"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t" |
402 |
|
|
"shufps $0x1b, %%xmm5, %%xmm5 # swap to correct order\n\t" |
403 |
|
|
: /* no output */ |
404 |
|
|
: "S" (pSrc) /* %0 - sample read position */ |
405 |
|
|
: "%eax", "%edx", "%edi", |
406 |
|
|
"xmm2", /* holds pSrc[int_pos] of the 4 samples at the end */ |
407 |
|
|
"xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */ |
408 |
|
|
"xmm4", /* holds pSrc[int_pos+2] of the 4 samples at the end */ |
409 |
|
|
"xmm5", /* holds pSrc[int_pos+3] of the 4 samples at the end */ |
410 |
|
|
"mm4", /* holds integer position of sample 0-1 at the end */ |
411 |
|
|
"mm5", /* holds integer position of sample 2-3 at the end */ |
412 |
|
|
"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" |
413 |
|
|
); |
414 |
|
|
/* linear interpolation of the 4 samples (left & right channel) simultaniously */ |
415 |
|
|
__asm__ __volatile__ ( |
416 |
|
|
"subps %%xmm2,%%xmm4 # xmm4 = pSrc[pos_int+2] - pSrc[pos_int] (left channel)\n\t" |
417 |
|
|
"mulps %%xmm0,%%xmm4 # xmm4 = pos_fract * (pSrc[pos_int+2] - pSrc[pos_int]) (left channel)\n\t" |
418 |
|
|
"addps %%xmm4,%%xmm2 # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+2] - pSrc[pos_int])) (left channel)\n\t" |
419 |
|
|
"subps %%xmm3,%%xmm5 # xmm5 = pSrc[pos_int+3] - pSrc[pos_int+1] (right channel)\n\t" |
420 |
|
|
"mulps %%xmm0,%%xmm5 # xmm5 = pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1]) (right channel)\n\t" |
421 |
|
|
"addps %%xmm5,%%xmm3 # xmm3 = pSrc[pos_int+1] + (pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1])) (right channel)\n\t" |
422 |
|
|
: /* no output */ |
423 |
|
|
: /* no input */ |
424 |
|
|
: "%xmm2", /* holds linear interpolated sample of left channel (of all 4 samples) at the end */ |
425 |
|
|
"%xmm3" /* holds linear interpolated sample of right channel (of all 4 samples) at the end */ |
426 |
|
|
); |
427 |
|
|
} |
428 |
schoenebeck |
617 |
#endif // CONFIG_ASM && ARCH_X86 |
429 |
schoenebeck |
320 |
}; |
430 |
|
|
|
431 |
|
|
} // namespace LinuxSampler |
432 |
|
|
|
433 |
|
|
#endif // __LS_RESAMPLER_H__ |