engines/common/Resampler.h

/***************************************************************************
 *                                                                         *
 *   LinuxSampler - modular, streaming capable sampler                     *
 *                                                                         *
 *   Copyright (C) 2003, 2004 by Benno Senoner and Christian Schoenebeck   *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the Free Software           *
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston,                 *
 *   MA  02111-1307  USA                                                   *
 ***************************************************************************/

#ifndef __LS_RESAMPLER_H__
#define __LS_RESAMPLER_H__

#include "../../common/global.h"

// TODO: cubic interpolation is not yet supported by the MMX/SSE(1) code paths, which always interpolate linearly
#ifndef USE_LINEAR_INTERPOLATION
# define USE_LINEAR_INTERPOLATION   1  ///< set to 0 if you prefer cubic interpolation (slower, better quality)
#endif

namespace LinuxSampler {

    /// One stereo sample point, as returned by the stereo C++ resampling functions.
    struct stereo_sample_t {
        float left;   ///< value of the left channel
        float right;  ///< value of the right channel
    };

    template<bool INTERPOLATE>
    class Resampler {
        public:
            /**
             * Returns the next mono sample point (plain C++ code path).
             *
             * If INTERPOLATE is set, the call is forwarded to
             * Interpolate1StepMonoCPP(). Otherwise the sample at the truncated
             * integer position is returned unmodified and the position is
             * advanced by exactly one sample; Pitch is not read in that case.
             *
             * @param pSrc  - sample data to read from
             * @param Pos   - playback position, updated in place
             * @param Pitch - position increment (only used when interpolating)
             * @returns the sample point converted to float
             */
            inline static float GetNextSampleMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
                if (!INTERPOLATE) {
                    // no pitch involved: pick the source sample as is and step one sample ahead
                    const int index = (int) *Pos;
                    *Pos += 1.0;
                    return pSrc[index];
                }
                return Interpolate1StepMonoCPP(pSrc, Pos, Pitch);
            }

            /**
             * Returns the next stereo sample point (plain C++ code path).
             *
             * If INTERPOLATE is set, the call is forwarded to
             * Interpolate1StepStereoCPP(). Otherwise the left / right pair
             * found at index 2 * int(Pos) and 2 * int(Pos) + 1 is returned
             * unmodified and the position is advanced by exactly one sample;
             * Pitch is not read in that case.
             *
             * @param pSrc  - sample data to read from (left and right values alternate)
             * @param Pos   - playback position, updated in place
             * @param Pitch - position increment (only used when interpolating)
             * @returns left and right sample point converted to float
             */
            inline static stereo_sample_t GetNextSampleStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
                if (!INTERPOLATE) {
                    // no pitch involved: pick the source pair as is and step one sample ahead
                    const int leftIndex = ((int) *Pos) << 1; // two values per sample position
                    *Pos += 1.0;
                    stereo_sample_t result;
                    result.left  = pSrc[leftIndex];
                    result.right = pSrc[leftIndex + 1];
                    return result;
                }
                return Interpolate1StepStereoCPP(pSrc, Pos, Pitch);
            }

            /**
             * Renders the next 4 mono sample points with MMX/SSE inline
             * assembly (GCC syntax; the code combines the pointer operand with
             * the 32 bit index register %edi, i.e. it is written for 32 bit x86).
             *
             * If INTERPOLATE is set, the call is forwarded to
             * Interpolate4StepsMonoMMXSSE(). Otherwise the 4 consecutive
             * signed 16 bit samples starting at the truncated integer position
             * are converted to float and the position is advanced by 4.0;
             * Pitch is not read in that case.
             *
             * Nothing is returned through C++: the 4 sample points are left
             * in register xmm2 (lowest lane = first sample point).
             *
             * @param pSrc  - sample data to read from
             * @param Pos   - points to the playback position, which is accessed
             *                as single precision float and updated in place
             * @param Pitch - position increment (only used when interpolating)
             */
            inline static void GetNext4SamplesMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
                if (INTERPOLATE) Interpolate4StepsMonoMMXSSE(pSrc, Pos, Pitch);
                else { // no pitch, so no interpolation necessary
                    const float posIncrement = 4.0f; // 4 sample points are consumed per call
                    __asm__ __volatile__ (
                        "movss    (%1), %%xmm5           # load Pos\n\t"
                        "cvttss2si %%xmm5, %%edi         # int(Pos), truncated like in the C++ version\n\t"
                        "addss    %2, %%xmm5             # Pos += 4.0f\n\t"
                        "movswl   (%0,%%edi,2), %%eax    # load sample 0\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movswl   2(%0,%%edi,2), %%edx   # load sample 1\n\t"
                        "cvtsi2ss  %%edx, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movss     %%xmm5, (%1)          # update Pos\n\t"
                        "movswl   4(%0,%%edi,2), %%eax   # load sample 2\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movswl   6(%0,%%edi,2), %%edx   # load sample 3\n\t"
                        "cvtsi2ss  %%edx, %%xmm2         # convert to float\n\t"
                        "shufps    $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
                        :: "r" (pSrc), "r" (Pos), "m" (posIncrement)
                        :  "%eax", "%edx", "%edi",
                           "%xmm2",  /* holds the 4 sample points at the end */
                           "%xmm5",  /* scratch: holds the advanced position */
                           "memory"  /* the position is stored through 'Pos' */
                    );
                }
            }

            /**
             * Renders the next 4 stereo sample points with MMX/SSE inline
             * assembly (GCC syntax; the code combines the pointer operand with
             * the 32 bit index register %edi, i.e. it is written for 32 bit x86).
             *
             * If INTERPOLATE is set, the call is forwarded to
             * Interpolate4StepsStereoMMXSSE(). Otherwise the 4 consecutive
             * left / right pairs of signed 16 bit samples starting at the
             * truncated integer position are converted to float and the
             * position is advanced by 4.0; Pitch is not read in that case.
             *
             * Nothing is returned through C++: the 4 sample points of the left
             * channel are left in register xmm2 and those of the right channel
             * in register xmm3 (lowest lane = first sample point).
             *
             * @param pSrc  - sample data to read from (left and right values alternate)
             * @param Pos   - points to the playback position, which is accessed
             *                as single precision float and updated in place
             * @param Pitch - position increment (only used when interpolating)
             */
            inline static void GetNext4SamplesStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
                if (INTERPOLATE) {
                    Interpolate4StepsStereoMMXSSE(pSrc, Pos, Pitch);
                    // NOTE(review): no EMMS is issued here although the interpolating
                    // path uses the MMX registers mm4 / mm5 - presumably left to the
                    // caller; confirm.
                    //EMMS;
                } else { // no pitch, so no interpolation necessary
                    const float posIncrement = 4.0f; // 4 sample points are consumed per call
                    __asm__ __volatile__ (
                        "movss    (%1), %%xmm5           # load Pos\n\t"
                        "cvttss2si %%xmm5, %%edi         # int(Pos), truncated like in the C++ version\n\t"
                        "addss    %2, %%xmm5             # Pos += 4.0f\n\t"
                        "movswl    (%0, %%edi,4), %%eax  # load sample 0 (left)\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movss     %%xmm5, (%1)          # update Pos\n\t"
                        "movswl   2(%0, %%edi,4), %%edx  # load sample 0 (right)\n\t"
                        "cvtsi2ss  %%edx, %%xmm3         # convert to float\n\t"
                        "shufps    $0x93, %%xmm3, %%xmm3 # shift up\n\t"
                        "movswl   4(%0, %%edi,4), %%eax  # load sample 1 (left)\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movswl   6(%0, %%edi,4), %%edx  # load sample 1 (right)\n\t"
                        "cvtsi2ss  %%edx, %%xmm3         # convert to float\n\t"
                        "shufps    $0x93, %%xmm3, %%xmm3 # shift up\n\t"
                        "movswl   8(%0, %%edi,4), %%eax  # load sample 2 (left)\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movswl  10(%0, %%edi,4), %%edx  # load sample 2 (right)\n\t"
                        "cvtsi2ss  %%edx, %%xmm3         # convert to float\n\t"
                        "shufps    $0x93, %%xmm3, %%xmm3 # shift up\n\t"
                        "movswl  12(%0, %%edi,4), %%eax  # load sample 3 (left)\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
                        "movswl  14(%0, %%edi,4), %%edx  # load sample 3 (right)\n\t"
                        "cvtsi2ss  %%edx, %%xmm3         # convert to float\n\t"
                        "shufps    $0x1b, %%xmm3, %%xmm3 # swap to correct order\n\t"
                        :: "r" (pSrc), "r" (Pos), "m" (posIncrement)
                        :  "%eax", "%edx", "%edi",
                           "%xmm2",  /* holds the 4 sample points of the left  channel at the end */
                           "%xmm3",  /* holds the 4 sample points of the right channel at the end */
                           "%xmm5",  /* scratch: holds the advanced position */
                           "memory"  /* the position is stored through 'Pos' */
                    );
                }
            }

        protected:

            /**
             * Calculates one interpolated mono sample point and advances the
             * playback position by Pitch.
             *
             * With USE_LINEAR_INTERPOLATION the result lies on the straight
             * line between pSrc[int(Pos)] and pSrc[int(Pos)+1]. Otherwise a
             * 3rd order polynomial through the 4 samples starting at
             * pSrc[int(Pos)] is evaluated; that curve runs from
             * pSrc[int(Pos)+1] (fraction 0) towards pSrc[int(Pos)+2].
             *
             * @param pSrc  - sample data to read from
             * @param Pos   - playback position, updated in place
             * @param Pitch - amount added to the position after the sample
             *                point has been calculated
             * @returns the interpolated sample point
             */
            inline static float Interpolate1StepMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
                const int   index    = (int) *Pos;    // whole part of the position
                const float fraction = *Pos - index;  // what is left behind the decimal point

                float result;
                #if USE_LINEAR_INTERPOLATION
                    const sample_t current = pSrc[index];
                    const sample_t next    = pSrc[index+1];
                    result = current + fraction * (next - current);
                #else // polynomial interpolation
                    const float y0 = pSrc[index];
                    const float y1 = pSrc[index+1];
                    const float y2 = pSrc[index+2];
                    const float y3 = pSrc[index+3];
                    // polynomial coefficients, evaluated below using Horner's scheme
                    const float cubic     = (3.0f * (y1 - y2) - y0 + y3) * 0.5f;
                    const float quadratic = 2.0f * y2 + y0 - (5.0f * y1 + y3) * 0.5f;
                    const float linear    = (y2 - y0) * 0.5f;
                    result = (((cubic * fraction) + quadratic) * fraction + linear) * fraction + y1;
                #endif // USE_LINEAR_INTERPOLATION

                *Pos += Pitch;
                return result;
            }

            /**
             * Calculates one interpolated stereo sample point and advances the
             * playback position by Pitch.
             *
             * Left and right values alternate in pSrc, so the pair belonging
             * to the integer position starts at index 2 * int(Pos). Both
             * channels are interpolated with the same fraction, either
             * linearly between the current and the next pair
             * (USE_LINEAR_INTERPOLATION) or by a 3rd order polynomial through
             * 4 consecutive pairs.
             *
             * @param pSrc  - sample data to read from
             * @param Pos   - playback position, updated in place
             * @param Pitch - amount added to the position after the sample
             *                point has been calculated
             * @returns the interpolated left and right sample point
             */
            inline static stereo_sample_t Interpolate1StepStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
                const int   position = (int) *Pos;        // whole part of the position
                const float fraction = *Pos - position;   // what is left behind the decimal point
                const int   base     = position << 1;     // index of the left value of the current pair

                stereo_sample_t result;

                #if USE_LINEAR_INTERPOLATION
                    const sample_t leftNow   = pSrc[base];
                    const sample_t rightNow  = pSrc[base+1];
                    const sample_t leftNext  = pSrc[base+2];
                    const sample_t rightNext = pSrc[base+3];
                    result.left  = leftNow  + fraction * (leftNext  - leftNow);
                    result.right = rightNow + fraction * (rightNext - rightNow);
                #else // polynomial interpolation
                    // left channel: even indices
                    const float l0 = pSrc[base];
                    const float l1 = pSrc[base+2];
                    const float l2 = pSrc[base+4];
                    const float l3 = pSrc[base+6];
                    const float lCubic     = (3.0f * (l1 - l2) - l0 + l3) * 0.5f;
                    const float lQuadratic = 2.0f * l2 + l0 - (5.0f * l1 + l3) * 0.5f;
                    const float lLinear    = (l2 - l0) * 0.5f;
                    result.left = (((lCubic * fraction) + lQuadratic) * fraction + lLinear) * fraction + l1;

                    // right channel: odd indices
                    const float r0 = pSrc[base+1];
                    const float r1 = pSrc[base+3];
                    const float r2 = pSrc[base+5];
                    const float r3 = pSrc[base+7];
                    const float rCubic     = (3.0f * (r1 - r2) - r0 + r3) * 0.5f;
                    const float rQuadratic = 2.0f * r2 + r0 - (5.0f * r1 + r3) * 0.5f;
                    const float rLinear    = (r2 - r0) * 0.5f;
                    result.right = (((rCubic * fraction) + rQuadratic) * fraction + rLinear) * fraction + r1;
                #endif // USE_LINEAR_INTERPOLATION

                *Pos += Pitch;
                return result;
            }

            // TODO: no support for cubic interpolation yet
            /**
             * Calculates 4 linearly interpolated mono sample points at once
             * using MMX/SSE inline assembly and advances the playback position
             * by 4 * Pitch (accumulated by 4 successive single precision
             * additions).
             *
             * The work is split into three asm statements which hand their
             * intermediate results to each other in registers:
             *  1. xmm0 = fractional part of the 4 sample positions,
             *     mm4 / mm5 = integer part of the positions 0-1 / 2-3,
             *     and the position behind 'Pos' is updated;
             *  2. xmm2 = pSrc[pos_int] and xmm3 = pSrc[pos_int+1] of the
             *     4 sample points (signed 16 bit samples converted to float);
             *  3. xmm2 = xmm2 + xmm0 * (xmm3 - xmm2).
             *
             * Nothing is returned through C++: the 4 interpolated sample
             * points are left in register xmm2 (lowest lane = first point).
             * No EMMS is issued here although MMX registers are used.
             *
             * NOTE(review): the compiler is not told that register contents
             * must survive from one asm statement to the next, the first
             * statement overwrites xmm2 and the last one xmm3 without listing
             * them as clobbered, and the store through 'Pos' is not announced
             * by a "memory" clobber. The "st" clobbers presumably account for
             * the MMX registers sharing state with the x87 stack - confirm.
             *
             * @param pSrc  - sample data to read from
             * @param Pos   - points to the playback position, which is accessed
             *                as single precision float and updated in place
             * @param Pitch - position increment between two sample points
             */
            inline static void Interpolate4StepsMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
                /* calculate playback position of each of the 4 samples by adding the associated pitch */
                __asm__ __volatile__ (
                    "movss    (%0),%%xmm0             # sample position of sample[0] -> xmm0[0]\n\t"
                    "movss    %1,%%xmm1               # copy pitch -> xmm1[0]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[1]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[2]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[3]\n\t"
                    "movss    %%xmm0,%%xmm2           # xmm0[0] -> xmm2[0]\n\t"
                    "addss    %%xmm1,%%xmm2           # calculate initial sample position for the next 4-sample cycle\n\t"
                    "movss    %%xmm2,(%0)             # update 'Pos'\n\t"
                    "shufps   $0x1b,%%xmm0,%%xmm0     # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t"
                    "cvttps2pi %%xmm0,%%mm4           # int(xmm0[0-1]) -> mm4\n\t"
                    "shufps   $0xe4,%%xmm0,%%xmm1     # xmm0[2-3] -> xmm1[2-3]\n\t"
                    "shufps   $0x0e,%%xmm1,%%xmm1     # xmm1[2-3] -> xmm1[0-1]\n\t"
                    "cvttps2pi %%xmm1,%%mm5           # int(xmm1[0-1]) -> mm5\n\t"
                    "cvtpi2ps %%mm5,%%xmm1            # double(mm5) -> xmm1[0-1]\n\t"
                    "shufps   $0x44,%%xmm1,%%xmm1     # shift lower 2 FPs up to the upper 2 cells\n\t"
                    "cvtpi2ps %%mm4,%%xmm1            # double(mm4) -> xmm1[0-1]\n\t"
                    "subps    %%xmm1,%%xmm0           # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t"
                    :
                    : "r" (Pos),  /* %0 */
                      "m" (Pitch) /* %1 */
                    : "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */
                      "%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */
                      "mm4",  /* holds integer position of sample 0-1 at the end */
                      "mm5",  /* holds integer position of sample 2-3 at the end */
                      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
                );
                /* get sample values of pSrc[pos_int] and pSrc[pos_int+1] of the 4 samples */
                /* (mm4 and mm5 are shifted right by 32 bit on the way to get at their upper halves) */
                __asm__ __volatile__ (
                    "movd   %%mm4,%%edi               # sample position of sample 0\n\t"
                    "psrlq  $32,%%mm4                 # mm4 >> 32\n\t"
                    "movswl (%0,%%edi,2),%%eax        # pSrc[pos_int] (sample 0)\n\t"
                    "movswl 2(%0,%%edi,2),%%ecx       # pSrc[pos_int] (sample 0+1)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "cvtsi2ss %%ecx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movd   %%mm4,%%edi               # sample position of sample 1\n\t"
                    "movswl (%0,%%edi,2),%%eax        # pSrc[pos_int] (sample 1)\n\t"
                    "movswl 2(%0,%%edi,2),%%ecx       # pSrc[pos_int] (sample 1+1)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "cvtsi2ss %%ecx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movd   %%mm5,%%edi               # sample position of sample 2\n\t"
                    "psrlq  $32,%%mm5                 # mm5 >> 32\n\t"
                    "movswl (%0,%%edi,2),%%eax        # pSrc[pos_int] (sample 2)\n\t"
                    "movswl 2(%0,%%edi,2),%%ecx       # pSrc[pos_int] (sample 2+1)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "cvtsi2ss %%ecx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movd   %%mm5,%%edi               # sample position of sample 2\n\t"
                    "movswl (%0,%%edi,2),%%eax        # pSrc[pos_int] (sample 3)\n\t"
                    "movswl 2(%0,%%edi,2),%%ecx       # pSrc[pos_int] (sample 3+1)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "cvtsi2ss %%ecx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x1b, %%xmm2, %%xmm2  # swap to correct order\n\t"
                    "shufps    $0x1b, %%xmm3, %%xmm3  # swap to correct order\n\t"
                    : /* no output */
                    : "S" (pSrc) /* %0 - sample read position  */
                    : "%eax", "%ecx", /*"%edx",*/ "%edi",
                      "%xmm2", /* holds pSrc[int_pos]   of the 4 samples at the end */
                      "%xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */
                      "mm4",  /* holds integer position of sample 0-1 at the end */
                      "mm5",  /* holds integer position of sample 2-3 at the end */
                      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
                );
                /* linear interpolation of the 4 samples simultaneously */
                /* (uses xmm0, xmm2 and xmm3 exactly as the two statements above left them) */
                __asm__ __volatile__ (
                    "subps %%xmm2,%%xmm3   # xmm3 = pSrc[pos_int+1] - pSrc[pos_int]\n\t"
                    "mulps %%xmm0,%%xmm3   # xmm3 = pos_fract * (pSrc[pos_int+1] - pSrc[pos_int])\n\t"
                    "addps %%xmm3,%%xmm2   # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+1] - pSrc[pos_int]))\n\t"
                    : /* no output */
                    : /* no input */
                    : "%xmm2" /* holds linear interpolated sample point (of all 4 samples) at the end */
                );
            }

            // TODO: no support for cubic interpolation yet
            /**
             * Calculates 4 linearly interpolated stereo sample points at once
             * using MMX/SSE inline assembly and advances the playback position
             * by 4 * Pitch (accumulated by 4 successive single precision
             * additions).
             *
             * The work is split into three asm statements which hand their
             * intermediate results to each other in registers:
             *  1. xmm0 = fractional part of the 4 sample positions,
             *     mm4 / mm5 = integer part of the positions 0-1 / 2-3,
             *     and the position behind 'Pos' is updated;
             *  2. for each of the 4 positions the 4 consecutive signed 16 bit
             *     values starting at byte offset 4 * pos_int are converted to
             *     float: xmm2 = current left, xmm3 = current right,
             *     xmm4 = next left, xmm5 = next right;
             *  3. xmm2 = xmm2 + xmm0 * (xmm4 - xmm2) (left channel) and
             *     xmm3 = xmm3 + xmm0 * (xmm5 - xmm3) (right channel).
             *
             * Nothing is returned through C++: the 4 interpolated sample
             * points are left in register xmm2 (left channel) and xmm3
             * (right channel), lowest lane = first point. No EMMS is issued
             * here although MMX registers are used.
             *
             * NOTE(review): the compiler is not told that register contents
             * must survive from one asm statement to the next, the first
             * statement overwrites xmm2 and the last one xmm4 / xmm5 without
             * listing them as clobbered, and the store through 'Pos' is not
             * announced by a "memory" clobber. The "st" clobbers presumably
             * account for the MMX registers sharing state with the x87 stack
             * - confirm.
             *
             * @param pSrc  - sample data to read from (left and right values alternate)
             * @param Pos   - points to the playback position, which is accessed
             *                as single precision float and updated in place
             * @param Pitch - position increment between two sample points
             */
            inline static void Interpolate4StepsStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
                /* calculate playback position of each of the 4 samples by adding the associated pitch */
                __asm__ __volatile__ (
                    "movss    (%0),%%xmm0             # sample position of sample[0] -> xmm0[0]\n\t"
                    "movss    %1,%%xmm1               # copy pitch -> xmm1[0]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[1]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[2]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[3]\n\t"
                    "movss    %%xmm0,%%xmm2           # xmm0[0] -> xmm2[0]\n\t"
                    "addss    %%xmm1,%%xmm2           # calculate initial sample position for the next 4-sample cycle\n\t"
                    "movss    %%xmm2,(%0)             # update 'Pos'\n\t"
                    "shufps   $0x1b,%%xmm0,%%xmm0     # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t"
                    "cvttps2pi %%xmm0,%%mm4           # int(xmm0[0-1]) -> mm4\n\t"
                    "shufps   $0xe4,%%xmm0,%%xmm1     # xmm0[2-3] -> xmm1[2-3]\n\t"
                    "shufps   $0x0e,%%xmm1,%%xmm1     # xmm1[2-3] -> xmm1[0-1]\n\t"
                    "cvttps2pi %%xmm1,%%mm5           # int(xmm1[0-1]) -> mm5\n\t"
                    "cvtpi2ps %%mm5,%%xmm1            # double(mm5) -> xmm1[0-1]\n\t"
                    "shufps   $0x44,%%xmm1,%%xmm1     # shift lower 2 FPs up to the upper 2 cells\n\t"
                    "cvtpi2ps %%mm4,%%xmm1            # double(mm4) -> xmm1[0-1]\n\t"
                    "subps    %%xmm1,%%xmm0           # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t"
                    :
                    : "r" (Pos),  /* %0 */
                      "m" (Pitch) /* %1 */
                    : "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */
                      "%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */
                      "mm4",  /* holds integer position of sample 0-1 at the end */
                      "mm5",  /* holds integer position of sample 2-3 at the end */
                      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
                );

                /* get sample values of pSrc[pos_int], pSrc[pos_int+1], pSrc[pos_int+2] and pSrc[pos_int+3] of the 4 samples */
                /* (the final shufps $0x1b of xmm2 and xmm3 reverses the lane order just like the ones of */
                /*  xmm4 and xmm5, even though their assembler comments read "shift up") */
                __asm__ __volatile__ (
                    "xorl   %%eax,%%eax               # clear eax\n\t"
                    "xorl   %%edx,%%edx               # clear edx\n\t"
                    "movd   %%mm4,%%edi               # sample position of sample 0\n\t"
                    "psrlq  $32,%%mm4                 # mm4 >> 32\n\t"
                    "movswl (%0,%%edi,4),%%eax        # pSrc[pos_int] (sample 0)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "movswl 2(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 0+1)\n\t"
                    "cvtsi2ss %%edx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movswl 4(%0,%%edi,4),%%eax       # pSrc[pos_int] (sample 0+2)\n\t"
                    "cvtsi2ss %%eax, %%xmm4           # pSrc[pos_int] -> xmm4[0]\n\t"
                    "shufps    $0x93, %%xmm4, %%xmm4  # shift up\n\t"
                    "movswl 6(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 0+3)\n\t"
                    "cvtsi2ss %%edx, %%xmm5           # pSrc[pos_int] -> xmm5[0]\n\t"
                    "movd   %%mm4,%%edi               # sample position of sample 1\n\t"
                    "shufps    $0x93, %%xmm5, %%xmm5  # shift up\n\t"
                    "movswl (%0,%%edi,4),%%eax        # pSrc[pos_int] (sample 1)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "movswl 2(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 1+1)\n\t"
                    "cvtsi2ss %%edx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movswl 4(%0,%%edi,4),%%eax       # pSrc[pos_int] (sample 1+2)\n\t"
                    "cvtsi2ss %%eax, %%xmm4           # pSrc[pos_int] -> xmm4[0]\n\t"
                    "shufps    $0x93, %%xmm4, %%xmm4  # shift up\n\t"
                    "movswl 6(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 1+3)\n\t"
                    "cvtsi2ss %%edx, %%xmm5           # pSrc[pos_int] -> xmm5[0]\n\t"
                    "movd   %%mm5,%%edi               # sample position of sample 2\n\t"
                    "shufps    $0x93, %%xmm5, %%xmm5  # shift up\n\t"
                    "psrlq  $32,%%mm5                 # mm5 >> 32\n\t"
                    "movswl (%0,%%edi,4),%%eax        # pSrc[pos_int] (sample 2)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "movswl 2(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 2+1)\n\t"
                    "cvtsi2ss %%edx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movswl 4(%0,%%edi,4),%%eax       # pSrc[pos_int] (sample 2+2)\n\t"
                    "cvtsi2ss %%eax, %%xmm4           # pSrc[pos_int] -> xmm4[0]\n\t"
                    "shufps    $0x93, %%xmm4, %%xmm4  # shift up\n\t"
                    "movswl 6(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 2+3)\n\t"
                    "cvtsi2ss %%edx, %%xmm5           # pSrc[pos_int] -> xmm5[0]\n\t"
                    "movd   %%mm5,%%edi               # sample position of sample 3\n\t"
                    "shufps    $0x93, %%xmm5, %%xmm5  # shift up\n\t"
                    "movswl (%0,%%edi,4),%%eax        # pSrc[pos_int] (sample 3)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "shufps    $0x1b, %%xmm2, %%xmm2  # shift up\n\t"
                    "movswl 2(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 3+1)\n\t"
                    "cvtsi2ss %%edx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x1b, %%xmm3, %%xmm3  # shift up\n\t"
                    "movswl 4(%0,%%edi,4),%%eax       # pSrc[pos_int] (sample 3+2)\n\t"
                    "cvtsi2ss %%eax, %%xmm4           # pSrc[pos_int] -> xmm4[0]\n\t"
                    "shufps    $0x1b, %%xmm4, %%xmm4  # swap to correct order\n\t"
                    "movswl 6(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 3+3)\n\t"
                    "cvtsi2ss %%edx, %%xmm5           # pSrc[pos_int] -> xmm5[0]\n\t"
                    "shufps    $0x1b, %%xmm5, %%xmm5  # swap to correct order\n\t"
                    : /* no output */
                    : "S" (pSrc) /* %0 - sample read position  */
                    : "%eax", "%edx", "%edi",
                      "xmm2", /* holds pSrc[int_pos]   of the 4 samples at the end */
                      "xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */
                      "xmm4", /* holds pSrc[int_pos+2] of the 4 samples at the end */
                      "xmm5", /* holds pSrc[int_pos+3] of the 4 samples at the end */
                      "mm4",  /* holds integer position of sample 0-1 at the end */
                      "mm5",  /* holds integer position of sample 2-3 at the end */
                      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
                );
                /* linear interpolation of the 4 samples (left & right channel) simultaneously */
                /* (uses xmm0 and xmm2 - xmm5 exactly as the two statements above left them) */
                __asm__ __volatile__ (
                    "subps %%xmm2,%%xmm4   # xmm4 = pSrc[pos_int+2] - pSrc[pos_int] (left channel)\n\t"
                    "mulps %%xmm0,%%xmm4   # xmm4 = pos_fract * (pSrc[pos_int+2] - pSrc[pos_int]) (left channel)\n\t"
                    "addps %%xmm4,%%xmm2   # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+2] - pSrc[pos_int])) (left channel)\n\t"
                    "subps %%xmm3,%%xmm5   # xmm5 = pSrc[pos_int+3] - pSrc[pos_int+1] (right channel)\n\t"
                    "mulps %%xmm0,%%xmm5   # xmm5 = pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1]) (right channel)\n\t"
                    "addps %%xmm5,%%xmm3   # xmm3 = pSrc[pos_int+1] + (pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1])) (right channel)\n\t"
                    : /* no output */
                    : /* no input */
                    : "%xmm2", /* holds linear interpolated sample of left  channel (of all 4 samples) at the end */
                      "%xmm3"  /* holds linear interpolated sample of right channel (of all 4 samples) at the end */
                );
            }
    };

} // namespace LinuxSampler

#endif // __LS_RESAMPLER_H__
1	schoenebeck	320	/***************************************************************************
2			* *
3			* LinuxSampler - modular, streaming capable sampler *
4			* *
5			* Copyright (C) 2003, 2004 by Benno Senoner and Christian Schoenebeck *
6			* *
7			* This program is free software; you can redistribute it and/or modify *
8			* it under the terms of the GNU General Public License as published by *
9			* the Free Software Foundation; either version 2 of the License, or *
10			* (at your option) any later version. *
11			* *
12			* This program is distributed in the hope that it will be useful, *
13			* but WITHOUT ANY WARRANTY; without even the implied warranty of *
14			* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
15			* GNU General Public License for more details. *
16			* *
17			* You should have received a copy of the GNU General Public License *
18			* along with this program; if not, write to the Free Software *
19			* Foundation, Inc., 59 Temple Place, Suite 330, Boston, *
20			* MA 02111-1307 USA *
21			***************************************************************************/
22
23			#ifndef __LS_RESAMPLER_H__
24			#define __LS_RESAMPLER_H__
25
26			#include "../../common/global.h"
27
28			// TODO: cubic interpolation is not yet supported by the MMX/SSE(1) version though
29			#ifndef USE_LINEAR_INTERPOLATION
30			# define USE_LINEAR_INTERPOLATION 1 ///< set to 0 if you prefer cubic interpolation (slower, better quality)
31			#endif
32
33			namespace LinuxSampler {
34
35			struct stereo_sample_t {
36			float left;
37			float right;
38			};
39
40			template<bool INTERPOLATE>
41			class Resampler {
42			public:
43			inline static float GetNextSampleMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
44			if (INTERPOLATE) return Interpolate1StepMonoCPP(pSrc, Pos, Pitch);
45			else { // no pitch, so no interpolation necessary
46			int pos_int = (int) *Pos;
47			*Pos += 1.0;
48			return pSrc [pos_int];
49			}
50			}
51
52			inline static stereo_sample_t GetNextSampleStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
53			if (INTERPOLATE) return Interpolate1StepStereoCPP(pSrc, Pos, Pitch);
54			else { // no pitch, so no interpolation necessary
55			int pos_int = (int) *Pos;
56			pos_int <<= 1;
57			*Pos += 1.0;
58			stereo_sample_t samplePoint;
59			samplePoint.left = pSrc[pos_int];
60			samplePoint.right = pSrc[pos_int+1];
61			return samplePoint;
62			}
63			}
64
65			inline static void GetNext4SamplesMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
66			if (INTERPOLATE) Interpolate4StepsMonoMMXSSE(pSrc, Pos, Pitch);
67			else { // no pitch, so no interpolation necessary
68			const float __4f = 4.0f;
69			__asm__ __volatile__ (
70			"movss (%1), %%xmm5 # load Pos\n\t"
71			"cvtss2si %%xmm5, %%edi # int(Pos)\n\t"
72			"addss %2, %%xmm5 # Pos += 4.0f\n\t"
73			"movswl (%0,%%edi,2), %%eax # load sample 0\n\t"
74			"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
75			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
76			"movswl 2(%0,%%edi,2), %%edx # load sample 1\n\t"
77			"cvtsi2ss %%edx, %%xmm2 # convert to float\n\t"
78			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
79			"movss %%xmm5, (%1) # update Pos\n\t"
80			"movswl 4(%0,%%edi,2), %%eax # load sample 2\n\t"
81			"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
82			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
83			"movswl 6(%0,%%edi,2), %%edx # load sample 3\n\t"
84			"cvtsi2ss %%edx, %%xmm2 # convert to float\n\t"
85			"shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
86			:: "r" (pSrc), "r" (Pos), "m" (__4f)
87			: "%eax", "%edx", "%edi"
88			);
89			}
90			}
91
92			inline static void GetNext4SamplesStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
93			if (INTERPOLATE) {
94			Interpolate4StepsStereoMMXSSE(pSrc, Pos, Pitch);
95			//EMMS;
96			} else { // no pitch, so no interpolation necessary
97			const float __4f = 4.0f;
98			__asm__ __volatile__ (
99			"movss (%1), %%xmm5 # load Pos\n\t"
100			"cvtss2si %%xmm5, %%edi # int(Pos)\n\t"
101			"addss %2, %%xmm5 # Pos += 4.0f\n\t"
102			"movswl (%0, %%edi,4), %%eax # load sample 0 (left)\n\t"
103			"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
104			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
105			"movss %%xmm5, (%1) # update Pos\n\t"
106			"movswl 2(%0, %%edi,4), %%edx # load sample 0 (left)\n\t"
107			"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
108			"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
109			"movswl 4(%0, %%edi,4), %%eax # load sample 1 (left)\n\t"
110			"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
111			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
112			"movswl 6(%0, %%edi,4), %%edx # load sample 1 (right)\n\t"
113			"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
114			"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
115			"movswl 8(%0, %%edi,4), %%eax # load sample 2 (left)\n\t"
116			"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
117			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
118			"movswl 10(%0, %%edi,4), %%edx # load sample 2 (right)\n\t"
119			"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
120			"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
121			"movswl 12(%0, %%edi,4), %%eax # load sample 3 (left)\n\t"
122			"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
123			"shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
124			"movswl 14(%0, %%edi,4), %%edx # load sample 3 (right)\n\t"
125			"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
126			"shufps $0x1b, %%xmm3, %%xmm3 # swap to correct order\n\t"
127			:: "r" (pSrc), "r" (Pos), "m" (__4f)
128			: "%eax", "%edx", "%edi"
129			);
130			}
131			}
132
133			protected:
134
135			inline static float Interpolate1StepMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
136			int pos_int = (int) *Pos; // integer position
137			float pos_fract = *Pos - pos_int; // fractional part of position
138
139			#if USE_LINEAR_INTERPOLATION
140			float samplePoint = pSrc[pos_int] + pos_fract * (pSrc[pos_int+1] - pSrc[pos_int]);
141			#else // polynomial interpolation
142			float xm1 = pSrc[pos_int];
143			float x0 = pSrc[pos_int+1];
144			float x1 = pSrc[pos_int+2];
145			float x2 = pSrc[pos_int+3];
146			float a = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
147			float b = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
148			float c = (x1 - xm1) * 0.5f;
149			float samplePoint = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;
150			#endif // USE_LINEAR_INTERPOLATION
151
152			*Pos += Pitch;
153			return samplePoint;
154			}
155
156			inline static stereo_sample_t Interpolate1StepStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
157			int pos_int = (int) *Pos; // integer position
158			float pos_fract = *Pos - pos_int; // fractional part of position
159			pos_int <<= 1;
160
161			stereo_sample_t samplePoint;
162
163			#if USE_LINEAR_INTERPOLATION
164			// left channel
165			samplePoint.left = pSrc[pos_int] + pos_fract * (pSrc[pos_int+2] - pSrc[pos_int]);
166			// right channel
167			samplePoint.right = pSrc[pos_int+1] + pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1]);
168			#else // polynomial interpolation
169			// calculate left channel
170			float xm1 = pSrc[pos_int];
171			float x0 = pSrc[pos_int+2];
172			float x1 = pSrc[pos_int+4];
173			float x2 = pSrc[pos_int+6];
174			float a = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
175			float b = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
176			float c = (x1 - xm1) * 0.5f;
177			samplePoint.left = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;
178
179			//calculate right channel
180			xm1 = pSrc[pos_int+1];
181			x0 = pSrc[pos_int+3];
182			x1 = pSrc[pos_int+5];
183			x2 = pSrc[pos_int+7];
184			a = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
185			b = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
186			c = (x1 - xm1) * 0.5f;
187			samplePoint.right = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;
188			#endif // USE_LINEAR_INTERPOLATION
189
190			*Pos += Pitch;
191			return samplePoint;
192			}
193
194			// TODO: no support for cubic interpolation yet
195			inline static void Interpolate4StepsMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
196			/* calculate playback position of each of the 4 samples by adding the associated pitch */
197			__asm__ __volatile__ (
198			"movss (%0),%%xmm0 # sample position of sample[0] -> xmm0[0]\n\t"
199			"movss %1,%%xmm1 # copy pitch -> xmm1[0]\n\t"
200			"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
201			"addss %%xmm1,%%xmm0 # calculate sample position of sample[1]\n\t"
202			"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
203			"addss %%xmm1,%%xmm0 # calculate sample position of sample[2]\n\t"
204			"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
205			"addss %%xmm1,%%xmm0 # calculate sample position of sample[3]\n\t"
206			"movss %%xmm0,%%xmm2 # xmm0[0] -> xmm2[0]\n\t"
207			"addss %%xmm1,%%xmm2 # calculate initial sample position for the next 4-sample cycle\n\t"
208			"movss %%xmm2,(%0) # update 'Pos'\n\t"
209			"shufps $0x1b,%%xmm0,%%xmm0 # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t"
210			"cvttps2pi %%xmm0,%%mm4 # int(xmm0[0-1]) -> mm4\n\t"
211			"shufps $0xe4,%%xmm0,%%xmm1 # xmm0[2-3] -> xmm1[2-3]\n\t"
212			"shufps $0x0e,%%xmm1,%%xmm1 # xmm1[2-3] -> xmm1[0-1]\n\t"
213			"cvttps2pi %%xmm1,%%mm5 # int(xmm1[0-1]) -> mm5\n\t"
214			"cvtpi2ps %%mm5,%%xmm1 # double(mm5) -> xmm1[0-1]\n\t"
215			"shufps $0x44,%%xmm1,%%xmm1 # shift lower 2 FPs up to the upper 2 cells\n\t"
216			"cvtpi2ps %%mm4,%%xmm1 # double(mm4) -> xmm1[0-1]\n\t"
217			"subps %%xmm1,%%xmm0 # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t"
218			:
219			: "r" (Pos), /* %0 */
220			"m" (Pitch) /* %1 */
221			: "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */
222			"%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */
223			"mm4", /* holds integer position of sample 0-1 at the end */
224			"mm5", /* holds integer position of sample 2-3 at the end */
225			"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
226			);
227			/* get sample values of pSrc[pos_int] and pSrc[pos_int+1] of the 4 samples */
228			__asm__ __volatile__ (
229			"movd %%mm4,%%edi # sample position of sample 0\n\t"
230			"psrlq $32,%%mm4 # mm4 >> 32\n\t"
231			"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 0)\n\t"
232			"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 0+1)\n\t"
233			"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
234			"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
235			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
236			"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
237			"movd %%mm4,%%edi # sample position of sample 1\n\t"
238			"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 1)\n\t"
239			"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 1+1)\n\t"
240			"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
241			"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
242			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
243			"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
244			"movd %%mm5,%%edi # sample position of sample 2\n\t"
245			"psrlq $32,%%mm5 # mm5 >> 32\n\t"
246			"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 2)\n\t"
247			"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 2+1)\n\t"
248			"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
249			"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
250			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
251			"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
252			"movd %%mm5,%%edi # sample position of sample 2\n\t"
253			"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 3)\n\t"
254			"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 3+1)\n\t"
255			"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
256			"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
257			"shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
258			"shufps $0x1b, %%xmm3, %%xmm3 # swap to correct order\n\t"
259			: /* no output */
260			: "S" (pSrc) /* %0 - sample read position */
261			: "%eax", "%ecx", /"%edx",/ "%edi",
262			"%xmm2", /* holds pSrc[int_pos] of the 4 samples at the end */
263			"%xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */
264			"mm4", /* holds integer position of sample 0-1 at the end */
265			"mm5", /* holds integer position of sample 2-3 at the end */
266			"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
267			);
268			/* linear interpolation of the 4 samples simultaniously */
269			__asm__ __volatile__ (
270			"subps %%xmm2,%%xmm3 # xmm3 = pSrc[pos_int+1] - pSrc[pos_int]\n\t"
271			"mulps %%xmm0,%%xmm3 # xmm3 = pos_fract * (pSrc[pos_int+1] - pSrc[pos_int])\n\t"
272			"addps %%xmm3,%%xmm2 # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+1] - pSrc[pos_int]))\n\t"
273			: /* no output */
274			: /* no input */
275			: "%xmm2" /* holds linear interpolated sample point (of all 4 samples) at the end */
276			);
277			}
278
279			// TODO: no support for cubic interpolation yet
280			inline static void Interpolate4StepsStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
281			/* calculate playback position of each of the 4 samples by adding the associated pitch */
282			__asm__ __volatile__ (
283			"movss (%0),%%xmm0 # sample position of sample[0] -> xmm0[0]\n\t"
284			"movss %1,%%xmm1 # copy pitch -> xmm1[0]\n\t"
285			"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
286			"addss %%xmm1,%%xmm0 # calculate sample position of sample[1]\n\t"
287			"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
288			"addss %%xmm1,%%xmm0 # calculate sample position of sample[2]\n\t"
289			"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
290			"addss %%xmm1,%%xmm0 # calculate sample position of sample[3]\n\t"
291			"movss %%xmm0,%%xmm2 # xmm0[0] -> xmm2[0]\n\t"
292			"addss %%xmm1,%%xmm2 # calculate initial sample position for the next 4-sample cycle\n\t"
293			"movss %%xmm2,(%0) # update 'Pos'\n\t"
294			"shufps $0x1b,%%xmm0,%%xmm0 # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t"
295			"cvttps2pi %%xmm0,%%mm4 # int(xmm0[0-1]) -> mm4\n\t"
296			"shufps $0xe4,%%xmm0,%%xmm1 # xmm0[2-3] -> xmm1[2-3]\n\t"
297			"shufps $0x0e,%%xmm1,%%xmm1 # xmm1[2-3] -> xmm1[0-1]\n\t"
298			"cvttps2pi %%xmm1,%%mm5 # int(xmm1[0-1]) -> mm5\n\t"
299			"cvtpi2ps %%mm5,%%xmm1 # double(mm5) -> xmm1[0-1]\n\t"
300			"shufps $0x44,%%xmm1,%%xmm1 # shift lower 2 FPs up to the upper 2 cells\n\t"
301			"cvtpi2ps %%mm4,%%xmm1 # double(mm4) -> xmm1[0-1]\n\t"
302			"subps %%xmm1,%%xmm0 # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t"
303			:
304			: "r" (Pos), /* %0 */
305			"m" (Pitch) /* %1 */
306			: "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */
307			"%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */
308			"mm4", /* holds integer position of sample 0-1 at the end */
309			"mm5", /* holds integer position of sample 2-3 at the end */
310			"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
311			);
312
313			/* get sample values of pSrc[pos_int], pSrc[pos_int+1], pSrc[pos_int+2] and pSrc[pos_int+3] of the 4 samples */
314			__asm__ __volatile__ (
315			"xorl %%eax,%%eax # clear eax\n\t"
316			"xorl %%edx,%%edx # clear edx\n\t"
317			"movd %%mm4,%%edi # sample position of sample 0\n\t"
318			"psrlq $32,%%mm4 # mm4 >> 32\n\t"
319			"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 0)\n\t"
320			"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
321			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
322			"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 0+1)\n\t"
323			"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
324			"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
325			"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 0+2)\n\t"
326			"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t"
327			"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t"
328			"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 0+3)\n\t"
329			"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t"
330			"movd %%mm4,%%edi # sample position of sample 1\n\t"
331			"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t"
332			"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 1)\n\t"
333			"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
334			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
335			"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 1+1)\n\t"
336			"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
337			"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
338			"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 1+2)\n\t"
339			"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t"
340			"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t"
341			"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 1+3)\n\t"
342			"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t"
343			"movd %%mm5,%%edi # sample position of sample 2\n\t"
344			"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t"
345			"psrlq $32,%%mm5 # mm5 >> 32\n\t"
346			"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 2)\n\t"
347			"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
348			"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
349			"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 2+1)\n\t"
350			"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
351			"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
352			"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 2+2)\n\t"
353			"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t"
354			"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t"
355			"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 2+3)\n\t"
356			"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t"
357			"movd %%mm5,%%edi # sample position of sample 3\n\t"
358			"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t"
359			"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 3)\n\t"
360			"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
361			"shufps $0x1b, %%xmm2, %%xmm2 # shift up\n\t"
362			"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 3+1)\n\t"
363			"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
364			"shufps $0x1b, %%xmm3, %%xmm3 # shift up\n\t"
365			"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 3+2)\n\t"
366			"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t"
367			"shufps $0x1b, %%xmm4, %%xmm4 # swap to correct order\n\t"
368			"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 3+3)\n\t"
369			"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t"
370			"shufps $0x1b, %%xmm5, %%xmm5 # swap to correct order\n\t"
371			: /* no output */
372			: "S" (pSrc) /* %0 - sample read position */
373			: "%eax", "%edx", "%edi",
374			"xmm2", /* holds pSrc[int_pos] of the 4 samples at the end */
375			"xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */
376			"xmm4", /* holds pSrc[int_pos+2] of the 4 samples at the end */
377			"xmm5", /* holds pSrc[int_pos+3] of the 4 samples at the end */
378			"mm4", /* holds integer position of sample 0-1 at the end */
379			"mm5", /* holds integer position of sample 2-3 at the end */
380			"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
381			);
382			/* linear interpolation of the 4 samples (left & right channel) simultaniously */
383			__asm__ __volatile__ (
384			"subps %%xmm2,%%xmm4 # xmm4 = pSrc[pos_int+2] - pSrc[pos_int] (left channel)\n\t"
385			"mulps %%xmm0,%%xmm4 # xmm4 = pos_fract * (pSrc[pos_int+2] - pSrc[pos_int]) (left channel)\n\t"
386			"addps %%xmm4,%%xmm2 # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+2] - pSrc[pos_int])) (left channel)\n\t"
387			"subps %%xmm3,%%xmm5 # xmm5 = pSrc[pos_int+3] - pSrc[pos_int+1] (right channel)\n\t"
388			"mulps %%xmm0,%%xmm5 # xmm5 = pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1]) (right channel)\n\t"
389			"addps %%xmm5,%%xmm3 # xmm3 = pSrc[pos_int+1] + (pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1])) (right channel)\n\t"
390			: /* no output */
391			: /* no input */
392			: "%xmm2", /* holds linear interpolated sample of left channel (of all 4 samples) at the end */
393			"%xmm3" /* holds linear interpolated sample of right channel (of all 4 samples) at the end */
394			);
395			}
396			};
397
398			} // namespace LinuxSampler
399
400			#endif // __LS_RESAMPLER_H__