engines/common/Resampler.h

/***************************************************************************
 *                                                                         *
 *   LinuxSampler - modular, streaming capable sampler                     *
 *                                                                         *
 *   Copyright (C) 2003, 2004 by Benno Senoner and Christian Schoenebeck   *
 *   Copyright (C) 2005 - 2007 Christian Schoenebeck                       *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the Free Software           *
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston,                 *
 *   MA  02111-1307  USA                                                   *
 ***************************************************************************/

// Note: the assembly code is currently disabled, as it doesn't fit into
// the new synthesis core introduced by LS 0.4.0

#ifndef __LS_RESAMPLER_H__
#define __LS_RESAMPLER_H__

#include "../../common/global.h"

// TODO: cubic interpolation is not yet supported by the MMX/SSE(1) version though
// TODO: cubic interpolation is not supported for 24 bit samples
#ifndef USE_LINEAR_INTERPOLATION
# define USE_LINEAR_INTERPOLATION   1  ///< set to 0 if you prefer cubic interpolation (slower, better quality)
#endif

namespace LinuxSampler {

    /** @brief One stereo sample point
     *
     * Bundles the signal values of a single sample point for both
     * channels of a stereo signal.
     */
    struct stereo_sample_t {
        float left;  ///< signal value of the left channel
        float right; ///< signal value of the right channel
    };

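    /** @brief Resampler Template
     *
     * This template provides pure C++ implementations of linear and cubic
     * interpolation for pitching a mono or stereo input signal. The
     * MMX/SSE assembly variants are still contained in this file, but they
     * are currently compiled out (see the "#if 0" blocks below).
     *
     * Template parameters: INTERPOLATE selects whether the GetNextSample*()
     * methods interpolate between sample points or simply step through
     * them one by one; BITDEPTH24 makes getSample() decode packed 3 byte
     * samples instead of indexing the data as an array of sample_t.
     */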
    template<bool INTERPOLATE,bool BITDEPTH24>
    class Resampler {
        public:
            /**
             * Returns the next mono sample point and advances the playback
             * position.
             *
             * If INTERPOLATE is set, the work is delegated to
             * Interpolate1StepMonoCPP(), which advances @a Pos by @a Pitch.
             * Otherwise the sample point at the truncated integer position
             * is returned unmodified and @a Pos is advanced by exactly one
             * sample point.
             *
             * @param pSrc  - sample data to read from
             * @param Pos   - in/out: current playback position (in sample points)
             * @param Pitch - position increment per call (only used when
             *                interpolating)
             * @returns signal value at the current playback position
             */
            inline static float GetNextSampleMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
                if (INTERPOLATE) return Interpolate1StepMonoCPP(pSrc, Pos, Pitch);
                else { // no pitch, so no interpolation necessary
                    int pos_int = (int) *Pos;
                    *Pos += 1.0;
                    // read through getSample() so that packed 24 bit data
                    // (BITDEPTH24) gets decoded instead of being misread
                    // as an array of sample_t
                    return getSample(pSrc, pos_int);
                }
            }

            /**
             * Returns the next stereo sample point and advances the playback
             * position.
             *
             * The source is read as interleaved data: the left channel is
             * taken from index 2*pos, the right channel from index 2*pos+1.
             * If INTERPOLATE is set, the work is delegated to
             * Interpolate1StepStereoCPP(), which advances @a Pos by
             * @a Pitch. Otherwise the sample point at the truncated integer
             * position is returned unmodified and @a Pos is advanced by
             * exactly one sample point.
             *
             * @param pSrc  - interleaved sample data to read from
             * @param Pos   - in/out: current playback position (in sample points)
             * @param Pitch - position increment per call (only used when
             *                interpolating)
             * @returns left and right signal value at the current position
             */
            inline static stereo_sample_t GetNextSampleStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
                if (INTERPOLATE) return Interpolate1StepStereoCPP(pSrc, Pos, Pitch);
                else { // no pitch, so no interpolation necessary
                    int pos_int = (int) *Pos;
                    pos_int <<= 1; // two values (left, right) per sample point
                    *Pos += 1.0;
                    stereo_sample_t samplePoint;
                    // read through getSample() so that packed 24 bit data
                    // (BITDEPTH24) gets decoded instead of being misread
                    // as an array of sample_t
                    samplePoint.left  = getSample(pSrc, pos_int);
                    samplePoint.right = getSample(pSrc, pos_int + 1);
                    return samplePoint;
                }
            }

#if 0 // CONFIG_ASM && ARCH_X86
            /**
             * Fetches the next 4 mono sample points at once (currently
             * compiled out by the surrounding "#if 0").
             *
             * If INTERPOLATE is set, the work is delegated to
             * Interpolate4StepsMonoMMXSSE(). Otherwise 4 consecutive 16 bit
             * values starting at the integer playback position are loaded,
             * converted to float and collected in register xmm2, and the
             * playback position is advanced by 4.0.
             *
             * There is no C++ level result: the asm statement has no output
             * operands, so the caller has to pick the samples up from xmm2.
             *
             * NOTE(review): @a Pos is read and written as a single precision
             * float here, whereas the C++ variants take a double* - confirm
             * what callers pass before re-enabling this code.
             * NOTE(review): xmm2 and xmm5 are modified, but only eax, edx
             * and edi are listed as clobbers.
             * NOTE(review): cvtss2si converts using the current rounding
             * mode, while the C++ variant truncates with an (int) cast -
             * confirm whether cvttss2si was intended.
             *
             * @param pSrc  - sample data to read from (16 bit values)
             * @param Pos   - in/out: pointer to the current playback position
             * @param Pitch - only forwarded to the interpolating variant
             */
            inline static void GetNext4SamplesMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
                if (INTERPOLATE) Interpolate4StepsMonoMMXSSE(pSrc, Pos, Pitch);
                else { // no pitch, so no interpolation necessary
                    const float __4f = 4.0f;
                    __asm__ __volatile__ (
                        "movss    (%1), %%xmm5           # load Pos\n\t"
                        "cvtss2si %%xmm5, %%edi          # int(Pos)\n\t"
                        "addss    %2, %%xmm5             # Pos += 4.0f\n\t"
                        "movswl   (%0,%%edi,2), %%eax    # load sample 0\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movswl   2(%0,%%edi,2), %%edx   # load sample 1\n\t"
                        "cvtsi2ss  %%edx, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movss     %%xmm5, (%1)          # update Pos\n\t"
                        "movswl   4(%0,%%edi,2), %%eax   # load sample 2\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movswl   6(%0,%%edi,2), %%edx   # load sample 3\n\t"
                        "cvtsi2ss  %%edx, %%xmm2         # convert to float\n\t"
                        "shufps    $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
                        :: "r" (pSrc), "r" (Pos), "m" (__4f)
                        :  "%eax", "%edx", "%edi"
                    );
                }
            }

            /**
             * Fetches the next 4 stereo sample points at once (currently
             * compiled out by the surrounding "#if 0").
             *
             * If INTERPOLATE is set, the work is delegated to
             * Interpolate4StepsStereoMMXSSE(). Otherwise 4 consecutive
             * interleaved pairs of 16 bit values starting at the integer
             * playback position are loaded and converted to float; the
             * values at even offsets are collected in xmm2 and the values
             * at odd offsets in xmm3. The playback position is advanced
             * by 4.0.
             *
             * There is no C++ level result: the asm statement has no output
             * operands, so the caller has to pick the samples up from
             * xmm2 / xmm3.
             *
             * NOTE(review): the assembler comment of the load at offset 2
             * says "sample 0 (left)"; judging from the following lines
             * this is presumably the right channel of sample 0.
             * NOTE(review): @a Pos is read and written as a single precision
             * float here, whereas the C++ variants take a double* - confirm
             * what callers pass before re-enabling this code.
             * NOTE(review): xmm2, xmm3 and xmm5 are modified, but only eax,
             * edx and edi are listed as clobbers.
             *
             * @param pSrc  - interleaved sample data to read from (16 bit values)
             * @param Pos   - in/out: pointer to the current playback position
             * @param Pitch - only forwarded to the interpolating variant
             */
            inline static void GetNext4SamplesStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
                if (INTERPOLATE) {
                    Interpolate4StepsStereoMMXSSE(pSrc, Pos, Pitch);
                    //EMMS;
                } else { // no pitch, so no interpolation necessary
                    const float __4f = 4.0f;
                    __asm__ __volatile__ (
                        "movss    (%1), %%xmm5           # load Pos\n\t"
                        "cvtss2si %%xmm5, %%edi          # int(Pos)\n\t"
                        "addss    %2, %%xmm5             # Pos += 4.0f\n\t"
                        "movswl    (%0, %%edi,4), %%eax  # load sample 0 (left)\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movss     %%xmm5, (%1)          # update Pos\n\t"
                        "movswl   2(%0, %%edi,4), %%edx  # load sample 0 (left)\n\t"
                        "cvtsi2ss  %%edx, %%xmm3         # convert to float\n\t"
                        "shufps    $0x93, %%xmm3, %%xmm3 # shift up\n\t"
                        "movswl   4(%0, %%edi,4), %%eax  # load sample 1 (left)\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movswl   6(%0, %%edi,4), %%edx  # load sample 1 (right)\n\t"
                        "cvtsi2ss  %%edx, %%xmm3         # convert to float\n\t"
                        "shufps    $0x93, %%xmm3, %%xmm3 # shift up\n\t"
                        "movswl   8(%0, %%edi,4), %%eax  # load sample 2 (left)\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x93, %%xmm2, %%xmm2 # shift up\n\t"
                        "movswl  10(%0, %%edi,4), %%edx  # load sample 2 (right)\n\t"
                        "cvtsi2ss  %%edx, %%xmm3         # convert to float\n\t"
                        "shufps    $0x93, %%xmm3, %%xmm3 # shift up\n\t"
                        "movswl  12(%0, %%edi,4), %%eax  # load sample 3 (left)\n\t"
                        "cvtsi2ss  %%eax, %%xmm2         # convert to float\n\t"
                        "shufps    $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
                        "movswl  14(%0, %%edi,4), %%edx  # load sample 3 (right)\n\t"
                        "cvtsi2ss  %%edx, %%xmm3         # convert to float\n\t"
                        "shufps    $0x1b, %%xmm3, %%xmm3 # swap to correct order\n\t"
                        :: "r" (pSrc), "r" (Pos), "m" (__4f)
                        :  "%eax", "%edx", "%edi"
                    );
                }
            }
#endif // CONFIG_ASM && ARCH_X86

        protected:

            /**
             * Reads one sample value from @a src and returns it as integer.
             *
             * With BITDEPTH24 the data is treated as a packed stream of
             * 3 byte samples: the byte at the lowest address ends up in
             * bits 8..15, the next one in bits 16..23 and the last one in
             * bits 24..31 of the result. The 24 bit value is thus left
             * aligned in a 32 bit word (the lowest 8 bits are always zero)
             * and its sign is carried by the most significant bit.
             * Without BITDEPTH24 the value is simply fetched as src[pos].
             *
             * @param src - sample data
             * @param pos - index of the sample value (not a byte offset)
             * @returns sample value
             */
            static int getSample(sample_t* src, int pos) {
                if (BITDEPTH24) {
                    pos *= 3;
                    unsigned char* p = (unsigned char*)src;
                    // assemble the word with unsigned arithmetic: the bytes
                    // would otherwise be promoted to (signed) int and the
                    // shift by 24 would overflow whenever the top byte
                    // is >= 0x80
                    const unsigned int word =
                        (unsigned int) p[pos]     <<  8 |
                        (unsigned int) p[pos + 1] << 16 |
                        (unsigned int) p[pos + 2] << 24;
                    return (int) word;
                } else {
                    return src[pos];
                }
            }

            /**
             * Calculates one interpolated mono sample point at the current
             * playback position and advances the position by @a Pitch.
             *
             * With USE_LINEAR_INTERPOLATION the result is interpolated
             * linearly between the sample points pos_int and pos_int+1,
             * both read through getSample(). Otherwise a cubic polynomial
             * is evaluated over pSrc[pos_int] .. pSrc[pos_int+3]; that
             * branch indexes @a pSrc directly and thus does not take
             * BITDEPTH24 into account.
             *
             * No bounds checking is done here, so presumably the caller has
             * to guarantee that the sample points beyond pos_int mentioned
             * above are readable.
             *
             * @param pSrc  - sample data to read from
             * @param Pos   - in/out: current playback position (in sample points)
             * @param Pitch - value added to @a Pos after the sample point
             *                has been calculated
             * @returns interpolated signal value
             */
            inline static float Interpolate1StepMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
                int   pos_int   = (int) *Pos;     // integer position
                float pos_fract = *Pos - pos_int; // fractional part of position

                #if USE_LINEAR_INTERPOLATION
                    // Convert to float before subtracting: 24 bit samples
                    // are left aligned in 32 bit, so the difference of two
                    // of them may exceed the range of int. Both operands
                    // and their difference are exactly representable as
                    // float for 16 bit and for left aligned 24 bit data.
                    float x1 = getSample(pSrc, pos_int);
                    float x2 = getSample(pSrc, pos_int + 1);
                    float samplePoint  = (x1 + pos_fract * (x2 - x1));
                #else // polynomial interpolation
                    float xm1 = pSrc[pos_int];
                    float x0  = pSrc[pos_int+1];
                    float x1  = pSrc[pos_int+2];
                    float x2  = pSrc[pos_int+3];
                    float a   = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
                    float b   = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
                    float c   = (x1 - xm1) * 0.5f;
                    float samplePoint =  (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;
                #endif // USE_LINEAR_INTERPOLATION

                *Pos += Pitch;
                return samplePoint;
            }

            /**
             * Calculates one interpolated stereo sample point at the current
             * playback position and advances the position by @a Pitch.
             *
             * The source is read as interleaved data (left channel at even,
             * right channel at odd indices), hence the integer position is
             * doubled before it is used as index. With
             * USE_LINEAR_INTERPOLATION each channel is interpolated linearly
             * between two neighbouring sample points read through
             * getSample(). Otherwise a cubic polynomial is evaluated per
             * channel over four sample points (indices pos_int ..
             * pos_int+7); that branch indexes @a pSrc directly and thus
             * does not take BITDEPTH24 into account.
             *
             * No bounds checking is done here, so presumably the caller has
             * to guarantee that the sample points beyond pos_int mentioned
             * above are readable.
             *
             * @param pSrc  - interleaved sample data to read from
             * @param Pos   - in/out: current playback position (in sample points)
             * @param Pitch - value added to @a Pos after the sample point
             *                has been calculated
             * @returns interpolated signal value of left and right channel
             */
            inline static stereo_sample_t Interpolate1StepStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
                int   pos_int   = (int) *Pos;  // integer position
                float pos_fract = *Pos - pos_int;     // fractional part of position
                pos_int <<= 1; // two values (left, right) per sample point

                stereo_sample_t samplePoint;

                #if USE_LINEAR_INTERPOLATION
                    // Convert to float before subtracting: 24 bit samples
                    // are left aligned in 32 bit, so the difference of two
                    // of them may exceed the range of int. Both operands
                    // and their difference are exactly representable as
                    // float for 16 bit and for left aligned 24 bit data.

                    // left channel
                    float x1 = getSample(pSrc, pos_int);
                    float x2 = getSample(pSrc, pos_int + 2);
                    samplePoint.left  = (x1 + pos_fract * (x2 - x1));
                    // right channel
                    x1 = getSample(pSrc, pos_int + 1);
                    x2 = getSample(pSrc, pos_int + 3);
                    samplePoint.right = (x1 + pos_fract * (x2 - x1));
                #else // polynomial interpolation
                    // calculate left channel
                    float xm1 = pSrc[pos_int];
                    float x0  = pSrc[pos_int+2];
                    float x1  = pSrc[pos_int+4];
                    float x2  = pSrc[pos_int+6];
                    float a   = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
                    float b   = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
                    float c   = (x1 - xm1) * 0.5f;
                    samplePoint.left = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;

                    //calculate right channel
                    xm1 = pSrc[pos_int+1];
                    x0  = pSrc[pos_int+3];
                    x1  = pSrc[pos_int+5];
                    x2  = pSrc[pos_int+7];
                    a   = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
                    b   = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
                    c   = (x1 - xm1) * 0.5f;
                    samplePoint.right =  (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;
                #endif // USE_LINEAR_INTERPOLATION

                *Pos += Pitch;
                return samplePoint;
            }

#if 0 // CONFIG_ASM && ARCH_X86
            /**
             * Linear interpolation of 4 consecutive mono sample points at
             * once (currently compiled out by the surrounding "#if 0").
             *
             * The work is split into three asm statements:
             *  -# the playback positions of the 4 sample points are derived
             *     by repeatedly adding @a Pitch, the position for the next
             *     cycle is written back to @a Pos and the positions are
             *     split into integer part (mm4, mm5) and fractional part
             *     (xmm0)
             *  -# pSrc[pos_int] and pSrc[pos_int+1] of each of the 4 sample
             *     points are loaded as 16 bit values and converted to float
             *     (xmm2 and xmm3)
             *  -# the interpolated values are calculated and left in xmm2
             *
             * There is no C++ level result: none of the asm statements has
             * an output operand, so the caller has to pick the samples up
             * from xmm2.
             *
             * NOTE(review): intermediate values are handed from one asm
             * statement to the next in registers (xmm0, xmm2, xmm3, mm4,
             * mm5), which presumably relies on the compiler not touching
             * those registers in between - confirm before re-enabling.
             * NOTE(review): @a Pos is read and written as a single precision
             * float here, whereas the C++ variants take a double*.
             * NOTE(review): the second "movd %%mm5,%%edi" carries the
             * assembler comment "sample 2", but by position it fetches the
             * index of sample 3.
             *
             * TODO: no support for cubic interpolation yet
             *
             * @param pSrc  - sample data to read from (16 bit values)
             * @param Pos   - in/out: pointer to the current playback position
             * @param Pitch - position increment between two sample points
             */
            inline static void Interpolate4StepsMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
                /* calculate playback position of each of the 4 samples by adding the associated pitch */
                __asm__ __volatile__ (
                    "movss    (%0),%%xmm0             # sample position of sample[0] -> xmm0[0]\n\t"
                    "movss    %1,%%xmm1               # copy pitch -> xmm1[0]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[1]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[2]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[3]\n\t"
                    "movss    %%xmm0,%%xmm2           # xmm0[0] -> xmm2[0]\n\t"
                    "addss    %%xmm1,%%xmm2           # calculate initial sample position for the next 4-sample cycle\n\t"
                    "movss    %%xmm2,(%0)             # update 'Pos'\n\t"
                    "shufps   $0x1b,%%xmm0,%%xmm0     # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t"
                    "cvttps2pi %%xmm0,%%mm4           # int(xmm0[0-1]) -> mm4\n\t"
                    "shufps   $0xe4,%%xmm0,%%xmm1     # xmm0[2-3] -> xmm1[2-3]\n\t"
                    "shufps   $0x0e,%%xmm1,%%xmm1     # xmm1[2-3] -> xmm1[0-1]\n\t"
                    "cvttps2pi %%xmm1,%%mm5           # int(xmm1[0-1]) -> mm5\n\t"
                    "cvtpi2ps %%mm5,%%xmm1            # double(mm5) -> xmm1[0-1]\n\t"
                    "shufps   $0x44,%%xmm1,%%xmm1     # shift lower 2 FPs up to the upper 2 cells\n\t"
                    "cvtpi2ps %%mm4,%%xmm1            # double(mm4) -> xmm1[0-1]\n\t"
                    "subps    %%xmm1,%%xmm0           # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t"
                    :
                    : "r" (Pos),  /* %0 */
                      "m" (Pitch) /* %1 */
                    : "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */
                      "%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */
                      "mm4",  /* holds integer position of sample 0-1 at the end */
                      "mm5",  /* holds integer position of sample 2-3 at the end */
                      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
                );
                /* get sample values of pSrc[pos_int] and pSrc[pos_int+1] of the 4 samples */
                __asm__ __volatile__ (
                    "movd   %%mm4,%%edi               # sample position of sample 0\n\t"
                    "psrlq  $32,%%mm4                 # mm4 >> 32\n\t"
                    "movswl (%0,%%edi,2),%%eax        # pSrc[pos_int] (sample 0)\n\t"
                    "movswl 2(%0,%%edi,2),%%ecx       # pSrc[pos_int] (sample 0+1)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "cvtsi2ss %%ecx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movd   %%mm4,%%edi               # sample position of sample 1\n\t"
                    "movswl (%0,%%edi,2),%%eax        # pSrc[pos_int] (sample 1)\n\t"
                    "movswl 2(%0,%%edi,2),%%ecx       # pSrc[pos_int] (sample 1+1)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "cvtsi2ss %%ecx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movd   %%mm5,%%edi               # sample position of sample 2\n\t"
                    "psrlq  $32,%%mm5                 # mm5 >> 32\n\t"
                    "movswl (%0,%%edi,2),%%eax        # pSrc[pos_int] (sample 2)\n\t"
                    "movswl 2(%0,%%edi,2),%%ecx       # pSrc[pos_int] (sample 2+1)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "cvtsi2ss %%ecx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movd   %%mm5,%%edi               # sample position of sample 2\n\t"
                    "movswl (%0,%%edi,2),%%eax        # pSrc[pos_int] (sample 3)\n\t"
                    "movswl 2(%0,%%edi,2),%%ecx       # pSrc[pos_int] (sample 3+1)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "cvtsi2ss %%ecx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x1b, %%xmm2, %%xmm2  # swap to correct order\n\t"
                    "shufps    $0x1b, %%xmm3, %%xmm3  # swap to correct order\n\t"
                    : /* no output */
                    : "S" (pSrc) /* %0 - sample read position  */
                    : "%eax", "%ecx", /*"%edx",*/ "%edi",
                      "%xmm2", /* holds pSrc[int_pos]   of the 4 samples at the end */
                      "%xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */
                      "mm4",  /* holds integer position of sample 0-1 at the end */
                      "mm5",  /* holds integer position of sample 2-3 at the end */
                      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
                );
                /* linear interpolation of the 4 samples simultaneously */
                __asm__ __volatile__ (
                    "subps %%xmm2,%%xmm3   # xmm3 = pSrc[pos_int+1] - pSrc[pos_int]\n\t"
                    "mulps %%xmm0,%%xmm3   # xmm3 = pos_fract * (pSrc[pos_int+1] - pSrc[pos_int])\n\t"
                    "addps %%xmm3,%%xmm2   # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+1] - pSrc[pos_int]))\n\t"
                    : /* no output */
                    : /* no input */
                    : "%xmm2" /* holds linear interpolated sample point (of all 4 samples) at the end */
                );
            }

            /**
             * Linear interpolation of 4 consecutive stereo sample points at
             * once (currently compiled out by the surrounding "#if 0").
             *
             * The work is split into three asm statements:
             *  -# the playback positions of the 4 sample points are derived
             *     by repeatedly adding @a Pitch, the position for the next
             *     cycle is written back to @a Pos and the positions are
             *     split into integer part (mm4, mm5) and fractional part
             *     (xmm0)
             *  -# for each of the 4 sample points the 4 interleaved 16 bit
             *     values starting at its integer position are loaded and
             *     converted to float (xmm2 .. xmm5)
             *  -# the interpolated values are calculated; the left channel
             *     is left in xmm2, the right channel in xmm3
             *
             * There is no C++ level result: none of the asm statements has
             * an output operand, so the caller has to pick the samples up
             * from xmm2 / xmm3.
             *
             * NOTE(review): intermediate values are handed from one asm
             * statement to the next in registers (xmm0, xmm2 .. xmm5, mm4,
             * mm5), which presumably relies on the compiler not touching
             * those registers in between - confirm before re-enabling.
             * NOTE(review): @a Pos is read and written as a single precision
             * float here, whereas the C++ variants take a double*.
             *
             * TODO: no support for cubic interpolation yet
             *
             * @param pSrc  - interleaved sample data to read from (16 bit values)
             * @param Pos   - in/out: pointer to the current playback position
             * @param Pitch - position increment between two sample points
             */
            inline static void Interpolate4StepsStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
                /* calculate playback position of each of the 4 samples by adding the associated pitch */
                __asm__ __volatile__ (
                    "movss    (%0),%%xmm0             # sample position of sample[0] -> xmm0[0]\n\t"
                    "movss    %1,%%xmm1               # copy pitch -> xmm1[0]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[1]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[2]\n\t"
                    "shufps   $0x90,%%xmm0,%%xmm0     # shift up, but keep xmm0[0]\n\t"
                    "addss    %%xmm1,%%xmm0           # calculate sample position of sample[3]\n\t"
                    "movss    %%xmm0,%%xmm2           # xmm0[0] -> xmm2[0]\n\t"
                    "addss    %%xmm1,%%xmm2           # calculate initial sample position for the next 4-sample cycle\n\t"
                    "movss    %%xmm2,(%0)             # update 'Pos'\n\t"
                    "shufps   $0x1b,%%xmm0,%%xmm0     # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t"
                    "cvttps2pi %%xmm0,%%mm4           # int(xmm0[0-1]) -> mm4\n\t"
                    "shufps   $0xe4,%%xmm0,%%xmm1     # xmm0[2-3] -> xmm1[2-3]\n\t"
                    "shufps   $0x0e,%%xmm1,%%xmm1     # xmm1[2-3] -> xmm1[0-1]\n\t"
                    "cvttps2pi %%xmm1,%%mm5           # int(xmm1[0-1]) -> mm5\n\t"
                    "cvtpi2ps %%mm5,%%xmm1            # double(mm5) -> xmm1[0-1]\n\t"
                    "shufps   $0x44,%%xmm1,%%xmm1     # shift lower 2 FPs up to the upper 2 cells\n\t"
                    "cvtpi2ps %%mm4,%%xmm1            # double(mm4) -> xmm1[0-1]\n\t"
                    "subps    %%xmm1,%%xmm0           # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t"
                    :
                    : "r" (Pos),  /* %0 */
                      "m" (Pitch) /* %1 */
                    : "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */
                      "%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */
                      "mm4",  /* holds integer position of sample 0-1 at the end */
                      "mm5",  /* holds integer position of sample 2-3 at the end */
                      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
                );

                /* get sample values of pSrc[pos_int], pSrc[pos_int+1], pSrc[pos_int+2] and pSrc[pos_int+3] of the 4 samples */
                __asm__ __volatile__ (
                    "xorl   %%eax,%%eax               # clear eax\n\t"
                    "xorl   %%edx,%%edx               # clear edx\n\t"
                    "movd   %%mm4,%%edi               # sample position of sample 0\n\t"
                    "psrlq  $32,%%mm4                 # mm4 >> 32\n\t"
                    "movswl (%0,%%edi,4),%%eax        # pSrc[pos_int] (sample 0)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "movswl 2(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 0+1)\n\t"
                    "cvtsi2ss %%edx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movswl 4(%0,%%edi,4),%%eax       # pSrc[pos_int] (sample 0+2)\n\t"
                    "cvtsi2ss %%eax, %%xmm4           # pSrc[pos_int] -> xmm4[0]\n\t"
                    "shufps    $0x93, %%xmm4, %%xmm4  # shift up\n\t"
                    "movswl 6(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 0+3)\n\t"
                    "cvtsi2ss %%edx, %%xmm5           # pSrc[pos_int] -> xmm5[0]\n\t"
                    "movd   %%mm4,%%edi               # sample position of sample 1\n\t"
                    "shufps    $0x93, %%xmm5, %%xmm5  # shift up\n\t"
                    "movswl (%0,%%edi,4),%%eax        # pSrc[pos_int] (sample 1)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "movswl 2(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 1+1)\n\t"
                    "cvtsi2ss %%edx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movswl 4(%0,%%edi,4),%%eax       # pSrc[pos_int] (sample 1+2)\n\t"
                    "cvtsi2ss %%eax, %%xmm4           # pSrc[pos_int] -> xmm4[0]\n\t"
                    "shufps    $0x93, %%xmm4, %%xmm4  # shift up\n\t"
                    "movswl 6(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 1+3)\n\t"
                    "cvtsi2ss %%edx, %%xmm5           # pSrc[pos_int] -> xmm5[0]\n\t"
                    "movd   %%mm5,%%edi               # sample position of sample 2\n\t"
                    "shufps    $0x93, %%xmm5, %%xmm5  # shift up\n\t"
                    "psrlq  $32,%%mm5                 # mm5 >> 32\n\t"
                    "movswl (%0,%%edi,4),%%eax        # pSrc[pos_int] (sample 2)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "shufps    $0x93, %%xmm2, %%xmm2  # shift up\n\t"
                    "movswl 2(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 2+1)\n\t"
                    "cvtsi2ss %%edx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x93, %%xmm3, %%xmm3  # shift up\n\t"
                    "movswl 4(%0,%%edi,4),%%eax       # pSrc[pos_int] (sample 2+2)\n\t"
                    "cvtsi2ss %%eax, %%xmm4           # pSrc[pos_int] -> xmm4[0]\n\t"
                    "shufps    $0x93, %%xmm4, %%xmm4  # shift up\n\t"
                    "movswl 6(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 2+3)\n\t"
                    "cvtsi2ss %%edx, %%xmm5           # pSrc[pos_int] -> xmm5[0]\n\t"
                    "movd   %%mm5,%%edi               # sample position of sample 3\n\t"
                    "shufps    $0x93, %%xmm5, %%xmm5  # shift up\n\t"
                    "movswl (%0,%%edi,4),%%eax        # pSrc[pos_int] (sample 3)\n\t"
                    "cvtsi2ss %%eax, %%xmm2           # pSrc[pos_int] -> xmm2[0]\n\t"
                    "shufps    $0x1b, %%xmm2, %%xmm2  # shift up\n\t"
                    "movswl 2(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 3+1)\n\t"
                    "cvtsi2ss %%edx, %%xmm3           # pSrc[pos_int] -> xmm3[0]\n\t"
                    "shufps    $0x1b, %%xmm3, %%xmm3  # shift up\n\t"
                    "movswl 4(%0,%%edi,4),%%eax       # pSrc[pos_int] (sample 3+2)\n\t"
                    "cvtsi2ss %%eax, %%xmm4           # pSrc[pos_int] -> xmm4[0]\n\t"
                    "shufps    $0x1b, %%xmm4, %%xmm4  # swap to correct order\n\t"
                    "movswl 6(%0,%%edi,4),%%edx       # pSrc[pos_int] (sample 3+3)\n\t"
                    "cvtsi2ss %%edx, %%xmm5           # pSrc[pos_int] -> xmm5[0]\n\t"
                    "shufps    $0x1b, %%xmm5, %%xmm5  # swap to correct order\n\t"
                    : /* no output */
                    : "S" (pSrc) /* %0 - sample read position  */
                    : "%eax", "%edx", "%edi",
                      "xmm2", /* holds pSrc[int_pos]   of the 4 samples at the end */
                      "xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */
                      "xmm4", /* holds pSrc[int_pos+2] of the 4 samples at the end */
                      "xmm5", /* holds pSrc[int_pos+3] of the 4 samples at the end */
                      "mm4",  /* holds integer position of sample 0-1 at the end */
                      "mm5",  /* holds integer position of sample 2-3 at the end */
                      "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
                );
                /* linear interpolation of the 4 samples (left & right channel) simultaneously */
                __asm__ __volatile__ (
                    "subps %%xmm2,%%xmm4   # xmm4 = pSrc[pos_int+2] - pSrc[pos_int] (left channel)\n\t"
                    "mulps %%xmm0,%%xmm4   # xmm4 = pos_fract * (pSrc[pos_int+2] - pSrc[pos_int]) (left channel)\n\t"
                    "addps %%xmm4,%%xmm2   # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+2] - pSrc[pos_int])) (left channel)\n\t"
                    "subps %%xmm3,%%xmm5   # xmm5 = pSrc[pos_int+3] - pSrc[pos_int+1] (right channel)\n\t"
                    "mulps %%xmm0,%%xmm5   # xmm5 = pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1]) (right channel)\n\t"
                    "addps %%xmm5,%%xmm3   # xmm3 = pSrc[pos_int+1] + (pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1])) (right channel)\n\t"
                    : /* no output */
                    : /* no input */
                    : "%xmm2", /* holds linear interpolated sample of left  channel (of all 4 samples) at the end */
                      "%xmm3"  /* holds linear interpolated sample of right channel (of all 4 samples) at the end */
                );
            }
#endif // CONFIG_ASM && ARCH_X86
    };

} // namespace LinuxSampler

#endif // __LS_RESAMPLER_H__
1	/***************************************************************************
2	* *
3	* LinuxSampler - modular, streaming capable sampler *
4	* *
5	* Copyright (C) 2003, 2004 by Benno Senoner and Christian Schoenebeck *
6	* Copyright (C) 2005 - 2007 Christian Schoenebeck *
7	* *
8	* This program is free software; you can redistribute it and/or modify *
9	* it under the terms of the GNU General Public License as published by *
10	* the Free Software Foundation; either version 2 of the License, or *
11	* (at your option) any later version. *
12	* *
13	* This program is distributed in the hope that it will be useful, *
14	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
16	* GNU General Public License for more details. *
17	* *
18	* You should have received a copy of the GNU General Public License *
19	* along with this program; if not, write to the Free Software *
20	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, *
21	* MA 02111-1307 USA *
22	***************************************************************************/
23
24	// Note: the assembly code is currently disabled, as it doesn't fit into
25	// the new synthesis core introduced by LS 0.4.0
26
27	#ifndef __LS_RESAMPLER_H__
28	#define __LS_RESAMPLER_H__
29
30	#include "../../common/global.h"
31
32	// TODO: cubic interpolation is not yet supported by the MMX/SSE(1) version though
33	// TODO: cubic interpolation is not supported for 24 bit samples
34	#ifndef USE_LINEAR_INTERPOLATION
35	# define USE_LINEAR_INTERPOLATION 1 ///< set to 0 if you prefer cubic interpolation (slower, better quality)
36	#endif
37
38	namespace LinuxSampler {
39
/** @brief Stereo sample point
 *
 * Holds the signal value of a single sample point for both channels of a
 * stereo signal.
 */
struct stereo_sample_t {
    float left;  ///< signal value of the left channel
    float right; ///< signal value of the right channel
};
49
50	/** @brief Resampler Template
51	*
52	* This template provides pure C++ and MMX/SSE assembly implementations
53	* for linear and cubic interpolation for pitching a mono or stereo
54	* input signal.
55	*/
56	template<bool INTERPOLATE,bool BITDEPTH24>
57	class Resampler {
58	public:
59	inline static float GetNextSampleMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
60	if (INTERPOLATE) return Interpolate1StepMonoCPP(pSrc, Pos, Pitch);
61	else { // no pitch, so no interpolation necessary
62	int pos_int = (int) *Pos;
63	*Pos += 1.0;
64	return pSrc [pos_int];
65	}
66	}
67
68	inline static stereo_sample_t GetNextSampleStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
69	if (INTERPOLATE) return Interpolate1StepStereoCPP(pSrc, Pos, Pitch);
70	else { // no pitch, so no interpolation necessary
71	int pos_int = (int) *Pos;
72	pos_int <<= 1;
73	*Pos += 1.0;
74	stereo_sample_t samplePoint;
75	samplePoint.left = pSrc[pos_int];
76	samplePoint.right = pSrc[pos_int+1];
77	return samplePoint;
78	}
79	}
80
81	#if 0 // CONFIG_ASM && ARCH_X86
82	inline static void GetNext4SamplesMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
83	if (INTERPOLATE) Interpolate4StepsMonoMMXSSE(pSrc, Pos, Pitch);
84	else { // no pitch, so no interpolation necessary
85	const float __4f = 4.0f;
86	__asm__ __volatile__ (
87	"movss (%1), %%xmm5 # load Pos\n\t"
88	"cvtss2si %%xmm5, %%edi # int(Pos)\n\t"
89	"addss %2, %%xmm5 # Pos += 4.0f\n\t"
90	"movswl (%0,%%edi,2), %%eax # load sample 0\n\t"
91	"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
92	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
93	"movswl 2(%0,%%edi,2), %%edx # load sample 1\n\t"
94	"cvtsi2ss %%edx, %%xmm2 # convert to float\n\t"
95	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
96	"movss %%xmm5, (%1) # update Pos\n\t"
97	"movswl 4(%0,%%edi,2), %%eax # load sample 2\n\t"
98	"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
99	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
100	"movswl 6(%0,%%edi,2), %%edx # load sample 3\n\t"
101	"cvtsi2ss %%edx, %%xmm2 # convert to float\n\t"
102	"shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
103	:: "r" (pSrc), "r" (Pos), "m" (__4f)
104	: "%eax", "%edx", "%edi"
105	);
106	}
107	}
108
109	inline static void GetNext4SamplesStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
110	if (INTERPOLATE) {
111	Interpolate4StepsStereoMMXSSE(pSrc, Pos, Pitch);
112	//EMMS;
113	} else { // no pitch, so no interpolation necessary
114	const float __4f = 4.0f;
115	__asm__ __volatile__ (
116	"movss (%1), %%xmm5 # load Pos\n\t"
117	"cvtss2si %%xmm5, %%edi # int(Pos)\n\t"
118	"addss %2, %%xmm5 # Pos += 4.0f\n\t"
119	"movswl (%0, %%edi,4), %%eax # load sample 0 (left)\n\t"
120	"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
121	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
122	"movss %%xmm5, (%1) # update Pos\n\t"
123	"movswl 2(%0, %%edi,4), %%edx # load sample 0 (left)\n\t"
124	"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
125	"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
126	"movswl 4(%0, %%edi,4), %%eax # load sample 1 (left)\n\t"
127	"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
128	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
129	"movswl 6(%0, %%edi,4), %%edx # load sample 1 (right)\n\t"
130	"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
131	"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
132	"movswl 8(%0, %%edi,4), %%eax # load sample 2 (left)\n\t"
133	"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
134	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
135	"movswl 10(%0, %%edi,4), %%edx # load sample 2 (right)\n\t"
136	"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
137	"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
138	"movswl 12(%0, %%edi,4), %%eax # load sample 3 (left)\n\t"
139	"cvtsi2ss %%eax, %%xmm2 # convert to float\n\t"
140	"shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
141	"movswl 14(%0, %%edi,4), %%edx # load sample 3 (right)\n\t"
142	"cvtsi2ss %%edx, %%xmm3 # convert to float\n\t"
143	"shufps $0x1b, %%xmm3, %%xmm3 # swap to correct order\n\t"
144	:: "r" (pSrc), "r" (Pos), "m" (__4f)
145	: "%eax", "%edx", "%edi"
146	);
147	}
148	}
149	#endif // CONFIG_ASM && ARCH_X86
150
151	protected:
152
153	static int getSample(sample_t* src, int pos) {
154	if (BITDEPTH24) {
155	pos *= 3;
156	unsigned char* p = (unsigned char*)src;
157	return p[pos] << 8 \| p[pos + 1] << 16 \| p[pos + 2] << 24;
158	} else {
159	return src[pos];
160	}
161	}
162
163	inline static float Interpolate1StepMonoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
164	int pos_int = (int) *Pos; // integer position
165	float pos_fract = *Pos - pos_int; // fractional part of position
166
167	#if USE_LINEAR_INTERPOLATION
168	int x1 = getSample(pSrc, pos_int);
169	int x2 = getSample(pSrc, pos_int + 1);
170	float samplePoint = (x1 + pos_fract * (x2 - x1));
171	#else // polynomial interpolation
172	float xm1 = pSrc[pos_int];
173	float x0 = pSrc[pos_int+1];
174	float x1 = pSrc[pos_int+2];
175	float x2 = pSrc[pos_int+3];
176	float a = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
177	float b = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
178	float c = (x1 - xm1) * 0.5f;
179	float samplePoint = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;
180	#endif // USE_LINEAR_INTERPOLATION
181
182	*Pos += Pitch;
183	return samplePoint;
184	}
185
186	inline static stereo_sample_t Interpolate1StepStereoCPP(sample_t* pSrc, double* Pos, float& Pitch) {
187	int pos_int = (int) *Pos; // integer position
188	float pos_fract = *Pos - pos_int; // fractional part of position
189	pos_int <<= 1;
190
191	stereo_sample_t samplePoint;
192
193	#if USE_LINEAR_INTERPOLATION
194	// left channel
195	int x1 = getSample(pSrc, pos_int);
196	int x2 = getSample(pSrc, pos_int + 2);
197	samplePoint.left = (x1 + pos_fract * (x2 - x1));
198	// right channel
199	x1 = getSample(pSrc, pos_int + 1);
200	x2 = getSample(pSrc, pos_int + 3);
201	samplePoint.right = (x1 + pos_fract * (x2 - x1));
202	#else // polynomial interpolation
203	// calculate left channel
204	float xm1 = pSrc[pos_int];
205	float x0 = pSrc[pos_int+2];
206	float x1 = pSrc[pos_int+4];
207	float x2 = pSrc[pos_int+6];
208	float a = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
209	float b = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
210	float c = (x1 - xm1) * 0.5f;
211	samplePoint.left = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;
212
213	//calculate right channel
214	xm1 = pSrc[pos_int+1];
215	x0 = pSrc[pos_int+3];
216	x1 = pSrc[pos_int+5];
217	x2 = pSrc[pos_int+7];
218	a = (3.0f * (x0 - x1) - xm1 + x2) * 0.5f;
219	b = 2.0f * x1 + xm1 - (5.0f * x0 + x2) * 0.5f;
220	c = (x1 - xm1) * 0.5f;
221	samplePoint.right = (((a * pos_fract) + b) * pos_fract + c) * pos_fract + x0;
222	#endif // USE_LINEAR_INTERPOLATION
223
224	*Pos += Pitch;
225	return samplePoint;
226	}
227
228	#if 0 // CONFIG_ASM && ARCH_X86
229	// TODO: no support for cubic interpolation yet
230	inline static void Interpolate4StepsMonoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
231	/* calculate playback position of each of the 4 samples by adding the associated pitch */
232	__asm__ __volatile__ (
233	"movss (%0),%%xmm0 # sample position of sample[0] -> xmm0[0]\n\t"
234	"movss %1,%%xmm1 # copy pitch -> xmm1[0]\n\t"
235	"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
236	"addss %%xmm1,%%xmm0 # calculate sample position of sample[1]\n\t"
237	"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
238	"addss %%xmm1,%%xmm0 # calculate sample position of sample[2]\n\t"
239	"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
240	"addss %%xmm1,%%xmm0 # calculate sample position of sample[3]\n\t"
241	"movss %%xmm0,%%xmm2 # xmm0[0] -> xmm2[0]\n\t"
242	"addss %%xmm1,%%xmm2 # calculate initial sample position for the next 4-sample cycle\n\t"
243	"movss %%xmm2,(%0) # update 'Pos'\n\t"
244	"shufps $0x1b,%%xmm0,%%xmm0 # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t"
245	"cvttps2pi %%xmm0,%%mm4 # int(xmm0[0-1]) -> mm4\n\t"
246	"shufps $0xe4,%%xmm0,%%xmm1 # xmm0[2-3] -> xmm1[2-3]\n\t"
247	"shufps $0x0e,%%xmm1,%%xmm1 # xmm1[2-3] -> xmm1[0-1]\n\t"
248	"cvttps2pi %%xmm1,%%mm5 # int(xmm1[0-1]) -> mm5\n\t"
249	"cvtpi2ps %%mm5,%%xmm1 # double(mm5) -> xmm1[0-1]\n\t"
250	"shufps $0x44,%%xmm1,%%xmm1 # shift lower 2 FPs up to the upper 2 cells\n\t"
251	"cvtpi2ps %%mm4,%%xmm1 # double(mm4) -> xmm1[0-1]\n\t"
252	"subps %%xmm1,%%xmm0 # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t"
253	:
254	: "r" (Pos), /* %0 */
255	"m" (Pitch) /* %1 */
256	: "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */
257	"%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */
258	"mm4", /* holds integer position of sample 0-1 at the end */
259	"mm5", /* holds integer position of sample 2-3 at the end */
260	"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
261	);
262	/* get sample values of pSrc[pos_int] and pSrc[pos_int+1] of the 4 samples */
263	__asm__ __volatile__ (
264	"movd %%mm4,%%edi # sample position of sample 0\n\t"
265	"psrlq $32,%%mm4 # mm4 >> 32\n\t"
266	"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 0)\n\t"
267	"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 0+1)\n\t"
268	"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
269	"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
270	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
271	"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
272	"movd %%mm4,%%edi # sample position of sample 1\n\t"
273	"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 1)\n\t"
274	"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 1+1)\n\t"
275	"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
276	"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
277	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
278	"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
279	"movd %%mm5,%%edi # sample position of sample 2\n\t"
280	"psrlq $32,%%mm5 # mm5 >> 32\n\t"
281	"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 2)\n\t"
282	"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 2+1)\n\t"
283	"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
284	"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
285	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
286	"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
287	"movd %%mm5,%%edi # sample position of sample 2\n\t"
288	"movswl (%0,%%edi,2),%%eax # pSrc[pos_int] (sample 3)\n\t"
289	"movswl 2(%0,%%edi,2),%%ecx # pSrc[pos_int] (sample 3+1)\n\t"
290	"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
291	"cvtsi2ss %%ecx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
292	"shufps $0x1b, %%xmm2, %%xmm2 # swap to correct order\n\t"
293	"shufps $0x1b, %%xmm3, %%xmm3 # swap to correct order\n\t"
294	: /* no output */
295	: "S" (pSrc) /* %0 - sample read position */
296	: "%eax", "%ecx", /"%edx",/ "%edi",
297	"%xmm2", /* holds pSrc[int_pos] of the 4 samples at the end */
298	"%xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */
299	"mm4", /* holds integer position of sample 0-1 at the end */
300	"mm5", /* holds integer position of sample 2-3 at the end */
301	"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
302	);
303	/* linear interpolation of the 4 samples simultaniously */
304	__asm__ __volatile__ (
305	"subps %%xmm2,%%xmm3 # xmm3 = pSrc[pos_int+1] - pSrc[pos_int]\n\t"
306	"mulps %%xmm0,%%xmm3 # xmm3 = pos_fract * (pSrc[pos_int+1] - pSrc[pos_int])\n\t"
307	"addps %%xmm3,%%xmm2 # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+1] - pSrc[pos_int]))\n\t"
308	: /* no output */
309	: /* no input */
310	: "%xmm2" /* holds linear interpolated sample point (of all 4 samples) at the end */
311	);
312	}
313
314	// TODO: no support for cubic interpolation yet
315	inline static void Interpolate4StepsStereoMMXSSE(sample_t* pSrc, void* Pos, float& Pitch) {
316	/* calculate playback position of each of the 4 samples by adding the associated pitch */
317	__asm__ __volatile__ (
318	"movss (%0),%%xmm0 # sample position of sample[0] -> xmm0[0]\n\t"
319	"movss %1,%%xmm1 # copy pitch -> xmm1[0]\n\t"
320	"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
321	"addss %%xmm1,%%xmm0 # calculate sample position of sample[1]\n\t"
322	"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
323	"addss %%xmm1,%%xmm0 # calculate sample position of sample[2]\n\t"
324	"shufps $0x90,%%xmm0,%%xmm0 # shift up, but keep xmm0[0]\n\t"
325	"addss %%xmm1,%%xmm0 # calculate sample position of sample[3]\n\t"
326	"movss %%xmm0,%%xmm2 # xmm0[0] -> xmm2[0]\n\t"
327	"addss %%xmm1,%%xmm2 # calculate initial sample position for the next 4-sample cycle\n\t"
328	"movss %%xmm2,(%0) # update 'Pos'\n\t"
329	"shufps $0x1b,%%xmm0,%%xmm0 # swap, so that xmm0[0]=sample pos 0, xmm0[1]=sample pos 1,...\n\t"
330	"cvttps2pi %%xmm0,%%mm4 # int(xmm0[0-1]) -> mm4\n\t"
331	"shufps $0xe4,%%xmm0,%%xmm1 # xmm0[2-3] -> xmm1[2-3]\n\t"
332	"shufps $0x0e,%%xmm1,%%xmm1 # xmm1[2-3] -> xmm1[0-1]\n\t"
333	"cvttps2pi %%xmm1,%%mm5 # int(xmm1[0-1]) -> mm5\n\t"
334	"cvtpi2ps %%mm5,%%xmm1 # double(mm5) -> xmm1[0-1]\n\t"
335	"shufps $0x44,%%xmm1,%%xmm1 # shift lower 2 FPs up to the upper 2 cells\n\t"
336	"cvtpi2ps %%mm4,%%xmm1 # double(mm4) -> xmm1[0-1]\n\t"
337	"subps %%xmm1,%%xmm0 # xmm0[1-3] = xmm0[1-3] - xmm1[1-3]\n\t"
338	:
339	: "r" (Pos), /* %0 */
340	"m" (Pitch) /* %1 */
341	: "%xmm0", /* holds fractional position (0.0 <= x < 1.0) of sample 0-3 at the end */
342	"%xmm1", /* holds integer position (back converted to SPFP) of sample 0-3 at the end */
343	"mm4", /* holds integer position of sample 0-1 at the end */
344	"mm5", /* holds integer position of sample 2-3 at the end */
345	"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
346	);
347
348	/* get sample values of pSrc[pos_int], pSrc[pos_int+1], pSrc[pos_int+2] and pSrc[pos_int+3] of the 4 samples */
349	__asm__ __volatile__ (
350	"xorl %%eax,%%eax # clear eax\n\t"
351	"xorl %%edx,%%edx # clear edx\n\t"
352	"movd %%mm4,%%edi # sample position of sample 0\n\t"
353	"psrlq $32,%%mm4 # mm4 >> 32\n\t"
354	"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 0)\n\t"
355	"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
356	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
357	"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 0+1)\n\t"
358	"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
359	"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
360	"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 0+2)\n\t"
361	"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t"
362	"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t"
363	"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 0+3)\n\t"
364	"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t"
365	"movd %%mm4,%%edi # sample position of sample 1\n\t"
366	"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t"
367	"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 1)\n\t"
368	"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
369	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
370	"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 1+1)\n\t"
371	"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
372	"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
373	"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 1+2)\n\t"
374	"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t"
375	"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t"
376	"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 1+3)\n\t"
377	"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t"
378	"movd %%mm5,%%edi # sample position of sample 2\n\t"
379	"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t"
380	"psrlq $32,%%mm5 # mm5 >> 32\n\t"
381	"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 2)\n\t"
382	"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
383	"shufps $0x93, %%xmm2, %%xmm2 # shift up\n\t"
384	"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 2+1)\n\t"
385	"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
386	"shufps $0x93, %%xmm3, %%xmm3 # shift up\n\t"
387	"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 2+2)\n\t"
388	"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t"
389	"shufps $0x93, %%xmm4, %%xmm4 # shift up\n\t"
390	"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 2+3)\n\t"
391	"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t"
392	"movd %%mm5,%%edi # sample position of sample 3\n\t"
393	"shufps $0x93, %%xmm5, %%xmm5 # shift up\n\t"
394	"movswl (%0,%%edi,4),%%eax # pSrc[pos_int] (sample 3)\n\t"
395	"cvtsi2ss %%eax, %%xmm2 # pSrc[pos_int] -> xmm2[0]\n\t"
396	"shufps $0x1b, %%xmm2, %%xmm2 # shift up\n\t"
397	"movswl 2(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 3+1)\n\t"
398	"cvtsi2ss %%edx, %%xmm3 # pSrc[pos_int] -> xmm3[0]\n\t"
399	"shufps $0x1b, %%xmm3, %%xmm3 # shift up\n\t"
400	"movswl 4(%0,%%edi,4),%%eax # pSrc[pos_int] (sample 3+2)\n\t"
401	"cvtsi2ss %%eax, %%xmm4 # pSrc[pos_int] -> xmm4[0]\n\t"
402	"shufps $0x1b, %%xmm4, %%xmm4 # swap to correct order\n\t"
403	"movswl 6(%0,%%edi,4),%%edx # pSrc[pos_int] (sample 3+3)\n\t"
404	"cvtsi2ss %%edx, %%xmm5 # pSrc[pos_int] -> xmm5[0]\n\t"
405	"shufps $0x1b, %%xmm5, %%xmm5 # swap to correct order\n\t"
406	: /* no output */
407	: "S" (pSrc) /* %0 - sample read position */
408	: "%eax", "%edx", "%edi",
409	"xmm2", /* holds pSrc[int_pos] of the 4 samples at the end */
410	"xmm3", /* holds pSrc[int_pos+1] of the 4 samples at the end */
411	"xmm4", /* holds pSrc[int_pos+2] of the 4 samples at the end */
412	"xmm5", /* holds pSrc[int_pos+3] of the 4 samples at the end */
413	"mm4", /* holds integer position of sample 0-1 at the end */
414	"mm5", /* holds integer position of sample 2-3 at the end */
415	"st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
416	);
417	/* linear interpolation of the 4 samples (left & right channel) simultaniously */
418	__asm__ __volatile__ (
419	"subps %%xmm2,%%xmm4 # xmm4 = pSrc[pos_int+2] - pSrc[pos_int] (left channel)\n\t"
420	"mulps %%xmm0,%%xmm4 # xmm4 = pos_fract * (pSrc[pos_int+2] - pSrc[pos_int]) (left channel)\n\t"
421	"addps %%xmm4,%%xmm2 # xmm2 = pSrc[pos_int] + (pos_fract * (pSrc[pos_int+2] - pSrc[pos_int])) (left channel)\n\t"
422	"subps %%xmm3,%%xmm5 # xmm5 = pSrc[pos_int+3] - pSrc[pos_int+1] (right channel)\n\t"
423	"mulps %%xmm0,%%xmm5 # xmm5 = pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1]) (right channel)\n\t"
424	"addps %%xmm5,%%xmm3 # xmm3 = pSrc[pos_int+1] + (pos_fract * (pSrc[pos_int+3] - pSrc[pos_int+1])) (right channel)\n\t"
425	: /* no output */
426	: /* no input */
427	: "%xmm2", /* holds linear interpolated sample of left channel (of all 4 samples) at the end */
428	"%xmm3" /* holds linear interpolated sample of right channel (of all 4 samples) at the end */
429	);
430	}
431	#endif // CONFIG_ASM && ARCH_X86
432	};
433
434	} // namespace LinuxSampler
435
436	#endif // __LS_RESAMPLER_H__