--- linuxsampler/trunk/src/engines/gig/Synthesizer.h	2005/03/04 22:54:11	424
+++ linuxsampler/trunk/src/engines/gig/Synthesizer.h	2006/05/25 14:39:45	866
@@ -29,316 +29,248 @@
 #include "../common/Resampler.h"
 #include "../common/BiquadFilter.h"
 #include "Filter.h"
-#include "Voice.h"
+#include "SynthesisParam.h"
 
-#define SYNTHESIS_MODE_SET_CONSTPITCH(iMode,bVal)       if (bVal) iMode |= 0x01; else iMode &= ~0x01   /* (un)set mode bit 0 */
-#define SYNTHESIS_MODE_SET_LOOP(iMode,bVal)             if (bVal) iMode |= 0x02; else iMode &= ~0x02   /* (un)set mode bit 1 */
-#define SYNTHESIS_MODE_SET_INTERPOLATE(iMode,bVal)      if (bVal) iMode |= 0x04; else iMode &= ~0x04   /* (un)set mode bit 2 */
-#define SYNTHESIS_MODE_SET_FILTER(iMode,bVal)           if (bVal) iMode |= 0x08; else iMode &= ~0x08   /* (un)set mode bit 3 */
-#define SYNTHESIS_MODE_SET_CHANNELS(iMode,bVal)         if (bVal) iMode |= 0x10; else iMode &= ~0x10   /* (un)set mode bit 4 */
-#define SYNTHESIS_MODE_SET_IMPLEMENTATION(iMode,bVal)   if (bVal) iMode |= 0x20; else iMode &= ~0x20   /* (un)set mode bit 5 */
-#define SYNTHESIS_MODE_SET_PROFILING(iMode,bVal)   	if (bVal) iMode |= 0x40; else iMode &= ~0x40   /* (un)set mode bit 6 */
-
-#define SYNTHESIS_MODE_GET_CONSTPITCH(iMode)            iMode & 0x01
-#define SYNTHESIS_MODE_GET_LOOP(iMode)                  iMode & 0x02
-#define SYNTHESIS_MODE_GET_INTERPOLATE(iMode)           iMode & 0x04
-#define SYNTHESIS_MODE_GET_FILTER(iMode)                iMode & 0x08
-#define SYNTHESIS_MODE_GET_CHANNELS(iMode)              iMode & 0x10
-#define SYNTHESIS_MODE_GET_IMPLEMENTATION(iMode)        iMode & 0x20
-
-// that's usually gig::Voice of course, but we make it a macro so we can
-// include this code for our synthesis benchmark which uses fake data
-// structures
-#ifndef VOICE
-# define VOICE Voice
-#endif // VOICE
+#define SYNTHESIS_MODE_SET_INTERPOLATE(iMode,bVal)      if (bVal) iMode |= 0x01; else iMode &= ~0x01   /* (un)set mode bit 0 */
+#define SYNTHESIS_MODE_SET_FILTER(iMode,bVal)           if (bVal) iMode |= 0x02; else iMode &= ~0x02   /* (un)set mode bit 1 */
+#define SYNTHESIS_MODE_SET_LOOP(iMode,bVal)             if (bVal) iMode |= 0x04; else iMode &= ~0x04   /* (un)set mode bit 2 */
+#define SYNTHESIS_MODE_SET_CHANNELS(iMode,bVal)         if (bVal) iMode |= 0x08; else iMode &= ~0x08   /* (un)set mode bit 3 */
+#define SYNTHESIS_MODE_SET_IMPLEMENTATION(iMode,bVal)   if (bVal) iMode |= 0x10; else iMode &= ~0x10   /* (un)set mode bit 4 */
+#define SYNTHESIS_MODE_SET_PROFILING(iMode,bVal)        if (bVal) iMode |= 0x20; else iMode &= ~0x20   /* (un)set mode bit 5 */
+
+#define SYNTHESIS_MODE_GET_INTERPOLATE(iMode)           iMode & 0x01
+#define SYNTHESIS_MODE_GET_FILTER(iMode)                iMode & 0x02
+#define SYNTHESIS_MODE_GET_LOOP(iMode)                  iMode & 0x04
+#define SYNTHESIS_MODE_GET_CHANNELS(iMode)              iMode & 0x08
+#define SYNTHESIS_MODE_GET_IMPLEMENTATION(iMode)        iMode & 0x10
 
 namespace LinuxSampler { namespace gig {
 
-    typedef void SynthesizeFragment_Fn(VOICE&, uint, sample_t*, uint);
+    typedef void SynthesizeFragment_Fn(SynthesisParam* pFinalParam, Loop* pLoop);
 
     void* GetSynthesisFunction(const int SynthesisMode);
-    void RunSynthesisFunction(const int SynthesisMode, VOICE& voice, uint Samples, sample_t* pSrc, uint Skip);
+    void RunSynthesisFunction(const int SynthesisMode, SynthesisParam* pFinalParam, Loop* pLoop);
 
     enum channels_t {
         MONO,
         STEREO
     };
 
-    template<implementation_t IMPLEMENTATION, channels_t CHANNELS, bool USEFILTER, bool INTERPOLATE, bool DOLOOP, bool CONSTPITCH>
-    class Synthesizer : public __RTMath<IMPLEMENTATION>, public LinuxSampler::Resampler<INTERPOLATE> {
-        public:
-            template<typename VOICE_T>
-            inline static void SynthesizeFragment(VOICE_T& Voice, uint Samples, sample_t* pSrc, uint i) {
-                const float panLeft  = Mul(Voice.PanLeft,  Voice.pEngineChannel->GlobalPanLeft);
-                const float panRight = Mul(Voice.PanRight, Voice.pEngineChannel->GlobalPanRight);
-                if (IMPLEMENTATION == ASM_X86_MMX_SSE) {
-                    float fPos = (float) Voice.Pos;
-                    SynthesizeFragment(Voice, Samples, pSrc, i, Voice.pSample->LoopPlayCount,
-                                       Voice.pSample->LoopStart,
-                                       Voice.pSample->LoopEnd,
-                                       Voice.pSample->LoopSize,
-                                       Voice.LoopCyclesLeft,
-                                       (void *)&fPos,
-                                       Voice.PitchBase,
-                                       Voice.PitchBend,
-                                       &panLeft, &panRight);
-                    #if  ARCH_X86
-                    if (INTERPOLATE) EMMS;
-                    #endif
-                    Voice.Pos = (double) fPos;
-                } else {
-                    SynthesizeFragment(Voice, Samples, pSrc, i, Voice.pSample->LoopPlayCount,
-                                       Voice.pSample->LoopStart,
-                                       Voice.pSample->LoopEnd,
-                                       Voice.pSample->LoopSize,
-                                       Voice.LoopCyclesLeft,
-                                       (void *)&Voice.Pos,
-                                       Voice.PitchBase,
-                                       Voice.PitchBend,
-                                       &panLeft, &panRight);
-                }
-            }
+    /** @brief Main Synthesis algorithms for the gig::Engine
+     *
+     * Implementation of the main synthesis algorithms of the Gigasampler
+     * format capable sampler engine. This means resampling / interpolation
+     * for pitching the audio signal, looping, filter and amplification.
+     */
+    template<channels_t CHANNELS, bool DOLOOP, bool USEFILTER, bool INTERPOLATE>
+    class Synthesizer : public __RTMath<CPP>, public LinuxSampler::Resampler<INTERPOLATE> {
+
+            // declarations of derived functions (see "Name lookup,
+            // templates, and accessing members of base classes" in
+            // the gcc manual for an explanation of why this is
+            // needed).
+            //using LinuxSampler::Resampler<INTERPOLATE>::GetNextSampleMonoCPP;
+            //using LinuxSampler::Resampler<INTERPOLATE>::GetNextSampleStereoCPP;
+            using LinuxSampler::Resampler<INTERPOLATE>::Interpolate1StepMonoCPP;
+            using LinuxSampler::Resampler<INTERPOLATE>::Interpolate1StepStereoCPP;
 
+        public:
         //protected:
 
-            template<typename VOICE_T>
-            inline static void SynthesizeFragment(VOICE_T& Voice, uint Samples, sample_t* pSrc, uint& i, uint& LoopPlayCount, uint LoopStart, uint LoopEnd, uint LoopSize, uint& LoopCyclesLeft, void* Pos, float& PitchBase, float& PitchBend, const float* PanLeft, const float* PanRight) {
-                const float loopEnd = Float(LoopEnd);
-                const float PBbyPB = Mul(PitchBase, PitchBend);
-                const float f_LoopStart = Float(LoopStart);
-                const float f_LoopSize = Float(LoopSize);
+            static void SynthesizeSubFragment(SynthesisParam* pFinalParam, Loop* pLoop) {
                 if (DOLOOP) {
-                    if (LoopPlayCount) {
+                    const float fLoopEnd   = Float(pLoop->uiEnd);
+                    const float fLoopStart = Float(pLoop->uiStart);
+                    const float fLoopSize  = Float(pLoop->uiSize);
+                    if (pLoop->uiTotalCycles) {
                         // render loop (loop count limited)
-                        while (i < Samples && LoopCyclesLeft) {
-                            if (CONSTPITCH) {
-                                const uint processEnd = Min(Samples, i + DiffToLoopEnd(loopEnd,Pos, PBbyPB) + 1); //TODO: instead of +1 we could also round up
-                                while (i < processEnd) Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);
-                            }
-                            else Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);
-                            if (WrapLoop(f_LoopStart, f_LoopSize, loopEnd, Pos)) LoopCyclesLeft--;
+                        for (; pFinalParam->uiToGo > 0 && pLoop->uiCyclesLeft; pLoop->uiCyclesLeft -= WrapLoop(fLoopStart, fLoopSize, fLoopEnd, &pFinalParam->dPos)) {
+                            const uint uiToGo = Min(pFinalParam->uiToGo, DiffToLoopEnd(fLoopEnd, &pFinalParam->dPos, pFinalParam->fFinalPitch) + 1); //TODO: instead of +1 we could also round up
+                            SynthesizeSubSubFragment(pFinalParam, uiToGo);
                         }
                         // render on without loop
-                        while (i < Samples) Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);
-                    }
-                    else { // render loop (endless loop)
-                        while (i < Samples) {
-                            if (CONSTPITCH) {
-                                const uint processEnd = Min(Samples, i + DiffToLoopEnd(loopEnd, Pos, PBbyPB) + 1); //TODO: instead of +1 we could also round up
-                                while (i < processEnd) Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);
-                            }
-                            else Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);
-                            WrapLoop(f_LoopStart, f_LoopSize, loopEnd, Pos);
+                        SynthesizeSubSubFragment(pFinalParam, pFinalParam->uiToGo);
+                    } else { // render loop (endless loop)
+                        for (; pFinalParam->uiToGo > 0; WrapLoop(fLoopStart, fLoopSize, fLoopEnd, &pFinalParam->dPos)) {
+                            const uint uiToGo = Min(pFinalParam->uiToGo, DiffToLoopEnd(fLoopEnd, &pFinalParam->dPos, pFinalParam->fFinalPitch) + 1); //TODO: instead of +1 we could also round up
+                            SynthesizeSubSubFragment(pFinalParam, uiToGo);
                         }
                     }
+                } else { // no looping
+                    SynthesizeSubSubFragment(pFinalParam, pFinalParam->uiToGo);
                 }
-                else { // no looping
-                    while (i < Samples) { Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);}
-                }
-            }
-
-            template<typename VOICE_T>
-            inline static void Synthesize(VOICE_T& Voice, void* Pos, sample_t* pSrc, uint& i, const float* PanLeft, const float* PanRight) {
-                Synthesize(pSrc, Pos,
-                           Voice.pEngine->pSynthesisParameters[Event::destination_vco][i],
-                           Voice.pEngineChannel->pOutputLeft,
-                           Voice.pEngineChannel->pOutputRight,
-                           i,
-                           Voice.pEngine->pSynthesisParameters[Event::destination_vca],
-                           PanLeft,
-                           PanRight,
-                           Voice.FilterLeft,
-                           Voice.FilterRight,
-                           Voice.pEngine->pBasicFilterParameters[i],
-                           Voice.pEngine->pMainFilterParameters[i]);
             }
 
+            /**
+             * Returns the difference to the sample's loop end.
+             */
             inline static int DiffToLoopEnd(const float& LoopEnd, const void* Pos, const float& Pitch) {
-                switch (IMPLEMENTATION) {
-                    // pure C++ implementation (thus platform independent)
-                    case CPP: {
-                        return uint((LoopEnd - *((double *)Pos)) / Pitch);
-                    }
-                    #if ARCH_X86
-                    case ASM_X86_MMX_SSE: {
-                        int result;
-                        __asm__ __volatile__ (
-                            "movss    (%1), %%xmm0  #read loopend\n\t"
-                            "subss    (%2), %%xmm0  #sub  pos\n\t"
-                            "divss    (%3), %%xmm0  #div  by pitch\n\t"
-                            "cvtss2si %%xmm0, %0    #convert to int\n\t"
-                            : "=r" (result)   /* %0 */
-                            : "r" (&LoopEnd), /* %1 */
-                              "r" (Pos),      /* %2 */
-                              "r" (&Pitch)    /* %3 */
-                        );
-                        return result;
-                    }
-                    #endif // ARCH_X86
-                }
+                return uint((LoopEnd - *((double *)Pos)) / Pitch);
             }
 
+#if 0
+            //TODO: this method is not in use yet, it's intended to be used for pitch=x.0f where we could use integer instead of float as playback position variable
+            inline static int WrapLoop(const int& LoopStart, const int& LoopSize, const int& LoopEnd, int& Pos) {
+                //TODO: we can easily eliminate the branch here
+                if (Pos < LoopEnd) return 0;
+                Pos = (Pos - LoopEnd) % LoopSize + LoopStart;
+                return 1;
+            }
+#endif
+
+            /**
+             * This method handles looping of the RAM playback part of the
+             * sample, thus repositioning the playback position once the
+             * loop limit was reached. Note: looping of the disk streaming
+             * part is handled by libgig (ReadAndLoop() method which will
+             * be called by the DiskThread).
+             */
             inline static int WrapLoop(const float& LoopStart, const float& LoopSize, const float& LoopEnd, void* vPos) {
-                switch (IMPLEMENTATION) {
-                    // pure C++ implementation (thus platform independent)
-                    case CPP: {
-                        double * Pos = (double *)vPos;
-                        if (*Pos < LoopEnd) return 0;
-                        *Pos = fmod(*Pos - LoopEnd, LoopSize) + LoopStart;
-                        return 1;
-                    }
-                    #if ARCH_X86
-                    case ASM_X86_MMX_SSE: {
-                        int result = 0;
-                        __asm__ __volatile__ (
-                            "movss  (%2), %%xmm0          # load LoopEnd\n\t"
-                            "movss  (%1), %%xmm1          # load Pos\n\t"
-                            "comiss %%xmm0, %%xmm1      # LoopEnd <> Pos\n\t"
-                            "jb     1f                  # jump if no work needs to be done\n\t"
-                            "movss    (%3), %%xmm2        # load LoopSize\n\t"
-                            "subss    %%xmm0, %%xmm1    # Pos - LoopEnd\n\t"
-                            //now the fmodf
-                            "movss    %%xmm1, %%xmm3    # xmm3 = (Pos - LoopEnd)\n\t"
-                            "divss    %%xmm2, %%xmm1    # (Pos - LoopEnd) / LoopSize\n\t"
-                            "cvttss2si %%xmm1, %%eax    # convert to int\n\t"
-                            "cvtsi2ss  %%eax, %%xmm1    # convert back to float\n\t"
-                            "movss    (%4), %%xmm0      # load LoopStart\n\t"
-                            "mulss    %%xmm2, %%xmm1    # LoopSize * int((Pos-LoopEnd)/LoopSize)\n\t"
-                            "subss    %%xmm1, %%xmm3    # xmm2 = fmodf(Pos - LoopEnd, LoopSize)\n\t"
-                            //done with fmodf
-                            "addss    %%xmm0, %%xmm3      # add LoopStart\n\t"
-                            "movss    %%xmm3, (%1)        # update Pos\n\t"
-                            "movl    $1, (%0)             # result = 1\n\t"
-                            ".balign 16 \n\t"
-                            "1:\n\t"
-                            :: "r" (&result),   /* %0 */
-                              "r"  (vPos),      /* %1 */
-                              "r"  (&LoopEnd),  /* %2 */
-                              "r"  (&LoopSize), /* %3 */
-                              "r"  (&LoopStart) /* %4 */
-                        );
-                        return result;
-                    }
-                    #endif // ARCH_X86
-                }
-            }
-
-            inline static void Synthesize(sample_t* pSrc, void* Pos, float& Pitch, float* pOutL, float* pOutR, uint& i, float* Volume, const float* PanL, const float* PanR, Filter& FilterL, Filter& FilterR, biquad_param_t& bqBase, biquad_param_t& bqMain) {
-                switch (IMPLEMENTATION) {
-                    // pure C++ implementation (thus platform independent)
-                    case CPP: {
-                        switch (CHANNELS) {
-                            case MONO: {
-                                float samplePoint = GetNextSampleMonoCPP(pSrc, (double *)Pos, Pitch);
-                                if (USEFILTER) samplePoint = FilterL.Apply(&bqBase, &bqMain, samplePoint);
-                                pOutL[i] += samplePoint * Volume[i] * *PanL;
-                                pOutR[i] += samplePoint * Volume[i] * *PanR;
-                                i++;
-                                break;
+                double * Pos = (double *)vPos;
+                if (*Pos < LoopEnd) return 0;
+                *Pos = fmod(*Pos - LoopEnd, LoopSize) + LoopStart;
+                return 1;
+            }
+
+            static void SynthesizeSubSubFragment(SynthesisParam* pFinalParam, uint uiToGo) {
+                float fVolumeL = pFinalParam->fFinalVolumeLeft;
+                float fVolumeR = pFinalParam->fFinalVolumeRight;
+                sample_t* pSrc = pFinalParam->pSrc;
+                float* pOutL   = pFinalParam->pOutLeft;
+                float* pOutR   = pFinalParam->pOutRight;
+#ifdef CONFIG_INTERPOLATE_VOLUME
+                float fDeltaL  = pFinalParam->fFinalVolumeDeltaLeft;
+                float fDeltaR  = pFinalParam->fFinalVolumeDeltaRight;
+#endif
+                switch (CHANNELS) {
+                    case MONO: {
+                        float samplePoint;
+                        if (INTERPOLATE) {
+                            double dPos    = pFinalParam->dPos;
+                            float fPitch   = pFinalParam->fFinalPitch;
+                            if (USEFILTER) {
+                                Filter filterL = pFinalParam->filterLeft;
+                                for (int i = 0; i < uiToGo; ++i) {
+                                    samplePoint = Interpolate1StepMonoCPP(pSrc, &dPos, fPitch);
+                                    samplePoint = filterL.Apply(samplePoint);
+#ifdef CONFIG_INTERPOLATE_VOLUME
+                                    fVolumeL += fDeltaL;
+                                    fVolumeR += fDeltaR;
+#endif
+                                    pOutL[i] += samplePoint * fVolumeL;
+                                    pOutR[i] += samplePoint * fVolumeR;
+                                }
+                            } else { // no filter needed
+                                for (int i = 0; i < uiToGo; ++i) {
+                                    samplePoint = Interpolate1StepMonoCPP(pSrc, &dPos, fPitch);
+#ifdef CONFIG_INTERPOLATE_VOLUME
+                                    fVolumeL += fDeltaL;
+                                    fVolumeR += fDeltaR;
+#endif
+                                    pOutL[i] += samplePoint * fVolumeL;
+                                    pOutR[i] += samplePoint * fVolumeR;
+                                }
                             }
-                            case STEREO: {
-                                stereo_sample_t samplePoint = GetNextSampleStereoCPP(pSrc, (double *)Pos, Pitch);
-                                if (USEFILTER) {
-                                    samplePoint.left  = FilterL.Apply(&bqBase, &bqMain, samplePoint.left);
-                                    samplePoint.right = FilterR.Apply(&bqBase, &bqMain, samplePoint.right);
+                            pFinalParam->dPos = dPos;
+                        } else { // no interpolation
+                            int pos_offset = (int) pFinalParam->dPos;
+                            if (USEFILTER) {
+                                Filter filterL = pFinalParam->filterLeft;
+                                for (int i = 0; i < uiToGo; ++i) {
+                                    samplePoint = pSrc[i + pos_offset];
+                                    samplePoint = filterL.Apply(samplePoint);
+#ifdef CONFIG_INTERPOLATE_VOLUME
+                                    fVolumeL += fDeltaL;
+                                    fVolumeR += fDeltaR;
+#endif
+                                    pOutL[i] += samplePoint * fVolumeL;
+                                    pOutR[i] += samplePoint * fVolumeR;
+                                }
+                            } else { // no filter needed
+                                for (int i = 0; i < uiToGo; ++i) {
+                                    samplePoint = pSrc[i + pos_offset];
+#ifdef CONFIG_INTERPOLATE_VOLUME
+                                    fVolumeL += fDeltaL;
+                                    fVolumeR += fDeltaR;
+#endif
+                                    pOutL[i] += samplePoint * fVolumeL;
+                                    pOutR[i] += samplePoint * fVolumeR;
                                 }
-                                pOutL[i] += samplePoint.left  * Volume[i] * *PanL;
-                                pOutR[i] += samplePoint.right * Volume[i] * *PanR;
-                                i++;
-                                break;
                             }
+                            pFinalParam->dPos += uiToGo;
                         }
                         break;
                     }
-                    #if ARCH_X86
-                    // Assembly optimization using the MMX & SSE(1) instruction set (thus only for x86)
-                    case ASM_X86_MMX_SSE: {
-                        const int ii = i & 0xfffffffc;
-                        i += 4;
-                        switch (CHANNELS) {
-                            case MONO: {
-                                GetNext4SamplesMonoMMXSSE(pSrc, (float *)Pos, Pitch); // outputs samples in xmm2
-                                if (USEFILTER) {
-                                    /* prepare filter input */
-                                    __asm__ __volatile__ (
-                                        "movaps %xmm2,%xmm0"
-                                    );
-                                    FilterL.Apply4StepsSSE(&bqBase, &bqMain); // xmm0 input, xmm7 output
-                                    __asm__ __volatile__ (
-                                        "movaps %xmm7,%xmm2       # mono filter result -> xmm2"
-                                    );
+                    case STEREO: {
+                        stereo_sample_t samplePoint;
+                        if (INTERPOLATE) {
+                            double dPos    = pFinalParam->dPos;
+                            float fPitch   = pFinalParam->fFinalPitch;
+                            if (USEFILTER) {
+                                Filter filterL = pFinalParam->filterLeft;
+                                Filter filterR = pFinalParam->filterRight;
+                                for (int i = 0; i < uiToGo; ++i) {
+                                    samplePoint = Interpolate1StepStereoCPP(pSrc, &dPos, fPitch);
+                                    samplePoint.left  = filterL.Apply(samplePoint.left);
+                                    samplePoint.right = filterR.Apply(samplePoint.right);
+#ifdef CONFIG_INTERPOLATE_VOLUME
+                                    fVolumeL += fDeltaL;
+                                    fVolumeR += fDeltaR;
+#endif
+                                    pOutL[i] += samplePoint.left  * fVolumeL;
+                                    pOutR[i] += samplePoint.right * fVolumeR;
+                                }
+                            } else { // no filter needed
+                                for (int i = 0; i < uiToGo; ++i) {
+                                    samplePoint = Interpolate1StepStereoCPP(pSrc, &dPos, fPitch);
+#ifdef CONFIG_INTERPOLATE_VOLUME
+                                    fVolumeL += fDeltaL;
+                                    fVolumeR += fDeltaR;
+#endif
+                                    pOutL[i] += samplePoint.left  * fVolumeL;
+                                    pOutR[i] += samplePoint.right * fVolumeR;
                                 }
-                                /* apply panorama and volume factors */
-                                __asm__ __volatile__ (
-                                    "movss    (%1),%%xmm0             # load pan left\n\t"
-                                    "movss    (%2),%%xmm1             # load pan right\n\t"
-                                    "movaps   (%0),%%xmm4             # load vca\n\t"
-                                    "shufps   $0x00,%%xmm0,%%xmm0     # copy pan left to the other 3 cells\n\t"
-                                    "shufps   $0x00,%%xmm1,%%xmm1     # copy pan right to the other 3 cells\n\t"
-                                    "mulps    %%xmm2,%%xmm0           # left  = sample * pan_left\n\t"
-                                    "mulps    %%xmm2,%%xmm1           # right = sample * pan_right\n\t"
-                                    "mulps    %%xmm4,%%xmm0           # left  = vca * (sample * pan_left)\n\t"
-                                    "mulps    %%xmm4,%%xmm1           # right = vca * (sample * pan_right)\n\t"
-                                    : /* no output */
-                                    : "r" (&Volume[ii]), /* %0 */
-                                      "r" (PanL),   /* %1 */
-                                      "r" (PanR)    /* %2 */
-                                    : "xmm0", /* holds final left  sample (for the 4 samples) at the end */
-                                      "xmm1"  /* holds final right sample (for the 4 samples) at the end */
-                                );
-                                break;
                             }
-                            case STEREO: {
-                                GetNext4SamplesStereoMMXSSE(pSrc, (float *)Pos, Pitch); // outputs samples in xmm2 (left channel) and xmm3 (right channel)
-                                if (USEFILTER) {
-                                    __asm__ __volatile__ (
-                                        "movaps %xmm2,%xmm0     # prepare left channel for filter\n\t"
-                                        "movaps %xmm3,%xmm1     # save right channel not to get overwritten by filter algorithms\n\t"
-                                    );
-                                    FilterL.Apply4StepsSSE(&bqBase, &bqMain); // xmm0 input, xmm7 output
-                                    __asm__ __volatile__ (
-                                        "movaps %xmm1,%xmm0     # prepare right channel for filter\n\t"
-                                        "movaps %xmm7,%xmm1     # save filter output for left channel\n\t"
-                                    );
-                                    FilterR.Apply4StepsSSE(&bqBase, &bqMain); // xmm0 input, xmm7 output
-                                    __asm__ __volatile__ (
-                                        "movaps %xmm1,%xmm2     # result left channel -> xmm2\n\t"
-                                        "movaps %xmm7,%xmm3     # result right channel -> xmm3\n\t"
-                                    );
+                            pFinalParam->dPos = dPos;
+                        } else { // no interpolation
+                            int pos_offset = ((int) pFinalParam->dPos) << 1;
+                            if (USEFILTER) {
+                                Filter filterL = pFinalParam->filterLeft;
+                                Filter filterR = pFinalParam->filterRight;
+                                for (int i = 0, ii = 0; i < uiToGo; ++i, ii+=2) {
+                                    samplePoint.left  = pSrc[ii + pos_offset];
+                                    samplePoint.right = pSrc[ii + pos_offset + 1];
+                                    samplePoint.left  = filterL.Apply(samplePoint.left);
+                                    samplePoint.right = filterR.Apply(samplePoint.right);
+#ifdef CONFIG_INTERPOLATE_VOLUME
+                                    fVolumeL += fDeltaL;
+                                    fVolumeR += fDeltaR;
+#endif
+                                    pOutL[i] += samplePoint.left  * fVolumeL;
+                                    pOutR[i] += samplePoint.right * fVolumeR;
+                                }
+                            } else { // no filter needed
+                                for (int i = 0, ii = 0; i < uiToGo; ++i, ii+=2) {
+                                    samplePoint.left  = pSrc[ii + pos_offset];
+                                    samplePoint.right = pSrc[ii + pos_offset + 1];
+#ifdef CONFIG_INTERPOLATE_VOLUME
+                                    fVolumeL += fDeltaL;
+                                    fVolumeR += fDeltaR;
+#endif
+                                    pOutL[i] += samplePoint.left  * fVolumeL;
+                                    pOutR[i] += samplePoint.right * fVolumeR;
                                 }
-                                /* apply panorama and volume factors */
-                                __asm__ __volatile__ (
-                                    "movss    (%1),%%xmm0             # load pan left\n\t"
-                                    "movss    (%2),%%xmm1             # load pan right\n\t"
-                                    "movaps   (%0),%%xmm4             # load vca\n\t"
-                                    "shufps   $0x00,%%xmm0,%%xmm0     # copy pan left to the other 3 cells\n\t"
-                                    "shufps   $0x00,%%xmm1,%%xmm1     # copy pan right to the other 3 cells\n\t"
-                                    "mulps    %%xmm2,%%xmm0           # left  = sample_left  * pan_left\n\t"
-                                    "mulps    %%xmm3,%%xmm1           # right = sample_right * pan_right\n\t"
-                                    "mulps    %%xmm4,%%xmm0           # left  = vca * (sample_left  * pan_left)\n\t"
-                                    "mulps    %%xmm4,%%xmm1           # right = vca * (sample_right * pan_right)\n\t"
-                                    : /* no output */
-                                    : "r" (&Volume[ii]), /* %0 */
-                                      "r" (PanL),   /* %1 */
-                                      "r" (PanR)    /* %2 */
-                                );
-                                break;
                             }
+                            pFinalParam->dPos += uiToGo;
                         }
-                        /* mix the 4 samples to the output channels */
-                        __asm__ __volatile__ (
-                            "addps  (%0),%%xmm0       # mix calculated sample(s) to output left\n\t"
-                            "movaps %%xmm0,(%0)       # output to left channel\n\t"
-                            "addps  (%1),%%xmm1       # mix calculated sample(s) to output right\n\t"
-                            "movaps %%xmm1,(%1)       # output to right channel\n\t"
-                            : /* no output */
-                            : "r" (&pOutL[ii]), /* %0 - must be 16 byte aligned ! */
-                              "r" (&pOutR[ii])  /* %1 - must be 16 byte aligned ! */
-                        );
+                        break;
                     }
-                    #endif // ARCH_X86
                 }
+                pFinalParam->fFinalVolumeLeft = fVolumeL;
+                pFinalParam->fFinalVolumeRight = fVolumeR;
+                pFinalParam->pOutRight += uiToGo;
+                pFinalParam->pOutLeft  += uiToGo;
+                pFinalParam->uiToGo    -= uiToGo;
             }
     };