engines/gig/Synthesizer.h

/***************************************************************************
 *                                                                         *
 *   LinuxSampler - modular, streaming capable sampler                     *
 *                                                                         *
 *   Copyright (C) 2003, 2004 by Benno Senoner and Christian Schoenebeck   *
 *   Copyright (C) 2005 Christian Schoenebeck                              *
 *                                                                         *
 *   This program is free software; you can redistribute it and/or modify  *
 *   it under the terms of the GNU General Public License as published by  *
 *   the Free Software Foundation; either version 2 of the License, or     *
 *   (at your option) any later version.                                   *
 *                                                                         *
 *   This program is distributed in the hope that it will be useful,       *
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
 *   GNU General Public License for more details.                          *
 *                                                                         *
 *   You should have received a copy of the GNU General Public License     *
 *   along with this program; if not, write to the Free Software           *
 *   Foundation, Inc., 59 Temple Place, Suite 330, Boston,                 *
 *   MA  02111-1307  USA                                                   *
 ***************************************************************************/

#ifndef __LS_GIG_SYNTHESIZER_H__
#define __LS_GIG_SYNTHESIZER_H__

#include "../../common/global.h"
#include "../../common/RTMath.h"
#include "../common/Resampler.h"
#include "../common/BiquadFilter.h"
#include "Filter.h"
#include "Voice.h"


/*
 * Helpers for the synthesis mode bit field (an integer lvalue).
 *
 * SYNTHESIS_MODE_SET_xxx(iMode, bVal) sets the respective bit of iMode if
 * bVal is true and clears it otherwise. The macros expand to a single
 * statement (do { } while (0)), so they can be used safely as the body of
 * an unbraced if / else, and the iMode argument is parenthesized so that
 * arbitrary lvalue expressions can be passed.
 */
#define SYNTHESIS_MODE_SET_INTERPOLATE(iMode,bVal)      do { if (bVal) (iMode) |= 0x01; else (iMode) &= ~0x01; } while (0)   /* (un)set mode bit 0 */
#define SYNTHESIS_MODE_SET_FILTER(iMode,bVal)           do { if (bVal) (iMode) |= 0x02; else (iMode) &= ~0x02; } while (0)   /* (un)set mode bit 1 */
#define SYNTHESIS_MODE_SET_LOOP(iMode,bVal)             do { if (bVal) (iMode) |= 0x04; else (iMode) &= ~0x04; } while (0)   /* (un)set mode bit 2 */
#define SYNTHESIS_MODE_SET_CHANNELS(iMode,bVal)         do { if (bVal) (iMode) |= 0x08; else (iMode) &= ~0x08; } while (0)   /* (un)set mode bit 3 */
#define SYNTHESIS_MODE_SET_IMPLEMENTATION(iMode,bVal)   do { if (bVal) (iMode) |= 0x10; else (iMode) &= ~0x10; } while (0)   /* (un)set mode bit 4 */
#define SYNTHESIS_MODE_SET_PROFILING(iMode,bVal)        do { if (bVal) (iMode) |= 0x20; else (iMode) &= ~0x20; } while (0)   /* (un)set mode bit 5 */

/*
 * SYNTHESIS_MODE_GET_xxx(iMode) evaluates to the masked bit, i.e. to the
 * mask value if the bit is set and to 0 otherwise. The expansion is fully
 * parenthesized so the result can be combined with other operators
 * (e.g. !GET(m) or GET(m) == 0) without precedence surprises.
 */
#define SYNTHESIS_MODE_GET_INTERPOLATE(iMode)           ((iMode) & 0x01)
#define SYNTHESIS_MODE_GET_FILTER(iMode)                ((iMode) & 0x02)
#define SYNTHESIS_MODE_GET_LOOP(iMode)                  ((iMode) & 0x04)
#define SYNTHESIS_MODE_GET_CHANNELS(iMode)              ((iMode) & 0x08)
#define SYNTHESIS_MODE_GET_IMPLEMENTATION(iMode)        ((iMode) & 0x10)

// VOICE is the voice type the synthesis functions declared below operate
// on. That is usually gig::Voice of course, but it is kept as a macro so
// this code can also be included by our synthesis benchmark, which uses
// fake data structures: a definition of VOICE made before this point takes
// precedence, otherwise it defaults to Voice.
#ifndef VOICE
# define VOICE Voice
#endif // VOICE

namespace LinuxSampler { namespace gig {

    /**
     * Function type with the same parameter list as the four-argument
     * Synthesizer::SynthesizeSubFragment() method: voice, sample count,
     * sample source data and an unsigned index / offset.
     */
    typedef void SynthesizeFragment_Fn(VOICE&, uint, sample_t*, uint);

    /**
     * Declared here, defined elsewhere. Returns an untyped pointer for
     * the given synthesis mode.
     * NOTE(review): presumably @a SynthesisMode is the bit field built
     * with the SYNTHESIS_MODE_SET_* macros and the result points to a
     * SynthesizeFragment_Fn - confirm against the definition.
     */
    void* GetSynthesisFunction(const int SynthesisMode);

    /**
     * Declared here, defined elsewhere. Takes a synthesis mode plus the
     * same kind of arguments as SynthesizeFragment_Fn.
     * NOTE(review): presumably dispatches to the synthesis function
     * selected by @a SynthesisMode, with @a Skip being the output index
     * to start at - confirm against the definition.
     */
    void RunSynthesisFunction(const int SynthesisMode, VOICE& voice, uint Samples, sample_t* pSrc, uint Skip);

    /**
     * Channel layout selector, used as the CHANNELS template argument of
     * the Synthesizer class to choose between the mono and the stereo
     * rendering code.
     */
    enum channels_t {
        MONO   = 0,
        STEREO = 1
    };

    /** @brief Main Synthesis algorithms for the gig::Engine
     *
     * Implementation of the main synthesis algorithms of the Gigasampler
     * format capable sampler engine. This means resampling / interpolation
     * for pitching the audio signal, looping, filter and amplification.
     *
     * All methods are static; the behaviour is selected entirely at
     * compile time through the template arguments:
     *
     * - IMPLEMENTATION: selects between the pure C++ code path (CPP) and
     *   the MMX/SSE assembly code path (ASM_X86_MMX_SSE); it is also
     *   forwarded to the __RTMath base class.
     * - CHANNELS: MONO or STEREO rendering code.
     * - DOLOOP: whether the loop handling code is used.
     * - USEFILTER: whether the voice's filter(s) are applied.
     * - INTERPOLATE: forwarded to the Resampler base class.
     */
    template<implementation_t IMPLEMENTATION, channels_t CHANNELS, bool DOLOOP, bool USEFILTER, bool INTERPOLATE>
    class Synthesizer : public __RTMath<IMPLEMENTATION>, public LinuxSampler::Resampler<INTERPOLATE> {

            // declarations of derived functions (see "Name lookup,
            // templates, and accessing members of base classes" in
            // the gcc manual for an explanation of why this is
            // needed). Both base classes depend on template arguments,
            // so their members have to be named explicitly to be usable
            // unqualified below.
            using __RTMath<IMPLEMENTATION>::Mul;
            using __RTMath<IMPLEMENTATION>::Float;
            using LinuxSampler::Resampler<INTERPOLATE>::GetNextSampleMonoCPP;
            using LinuxSampler::Resampler<INTERPOLATE>::GetNextSampleStereoCPP;
#if CONFIG_ASM && ARCH_X86
            // the MMX/SSE resampler variants are only pulled in for
            // assembly-enabled x86 builds
            using LinuxSampler::Resampler<INTERPOLATE>::GetNext4SamplesMonoMMXSSE;
            using LinuxSampler::Resampler<INTERPOLATE>::GetNext4SamplesStereoMMXSSE;
#endif

        public:
            /**
             * Toplevel method of this class: renders the given voice into
             * the output buffers of its engine channel, from output index
             * @a i up to (excluding) @a Samples.
             *
             * The voice's final volume and the voice / engine channel pan
             * factors are folded into one coefficient per output channel
             * and the actual rendering is delegated to the extended
             * SynthesizeSubFragment() overload. The C++ implementation
             * works directly on Voice.Pos; the MMX/SSE implementation is
             * handed a single precision copy of the playback position,
             * which is written back to Voice.Pos afterwards.
             *
             * @param Voice    voice to render
             * @param Samples  output index at which rendering stops
             * @param pSrc     sample source data
             * @param i        output index at which rendering starts
             */
            template<typename VOICE_T>
            inline static void SynthesizeSubFragment(VOICE_T& Voice, uint Samples, sample_t* pSrc, uint i) {
                // combined volume / panorama coefficient per output channel
                const float gainLeft  = Mul(Voice.fFinalVolume, Mul(Voice.PanLeft,  Voice.pEngineChannel->GlobalPanLeft));
                const float gainRight = Mul(Voice.fFinalVolume, Mul(Voice.PanRight, Voice.pEngineChannel->GlobalPanRight));

                if (IMPLEMENTATION != ASM_X86_MMX_SSE) {
                    // C++ implementation: operate on the voice's playback
                    // position in place
                    SynthesizeSubFragment(Voice, Samples, pSrc, i,
                                          Voice.pSample->LoopPlayCount,
                                          Voice.pSample->LoopStart,
                                          Voice.pSample->LoopEnd,
                                          Voice.pSample->LoopSize,
                                          Voice.LoopCyclesLeft,
                                          (void *)&Voice.Pos,
                                          &Voice.fFinalPitch,
                                          &gainLeft, &gainRight);
                    return;
                }

                // MMX/SSE implementation: works on a single precision
                // playback position, synchronized with the voice before
                // and after rendering
                float posSingle = (float) Voice.Pos;
                SynthesizeSubFragment(Voice, Samples, pSrc, i,
                                      Voice.pSample->LoopPlayCount,
                                      Voice.pSample->LoopStart,
                                      Voice.pSample->LoopEnd,
                                      Voice.pSample->LoopSize,
                                      Voice.LoopCyclesLeft,
                                      (void *)&posSingle,
                                      &Voice.fFinalPitch,
                                      &gainLeft, &gainRight);
                #if CONFIG_ASM && ARCH_X86
                if (INTERPOLATE) EMMS;
                #endif
                Voice.Pos = (double) posSingle;
            }

        //protected:

            /**
             * Render audio for the current fragment for the given voice.
             * Will be called by the toplevel SynthesizeFragment() method.
             */   
            template<typename VOICE_T>
            inline static void SynthesizeSubFragment(VOICE_T& Voice, uint Samples, sample_t* pSrc, uint& i, uint& LoopPlayCount, uint LoopStart, uint LoopEnd, uint LoopSize, uint& LoopCyclesLeft, void* Pos, const float* Pitch, const float* PanLeft, const float* PanRight) {
                const float loopEnd = Float(LoopEnd);
                const float f_LoopStart = Float(LoopStart);
                const float f_LoopSize = Float(LoopSize);
                if (DOLOOP) {
                    if (LoopPlayCount) {
                        // render loop (loop count limited)
                        while (i < Samples && LoopCyclesLeft) {
                            const uint processEnd = Min(Samples, i + DiffToLoopEnd(loopEnd,Pos, *Pitch) + 1); //TODO: instead of +1 we could also round up
                            while (i < processEnd) Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);
                            LoopCyclesLeft -= WrapLoop(f_LoopStart, f_LoopSize, loopEnd, Pos);
                        }
                        // render on without loop
                        while (i < Samples) Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);
                    }
                    else { // render loop (endless loop)
                        while (i < Samples) {
                            const uint processEnd = Min(Samples, i + DiffToLoopEnd(loopEnd, Pos, *Pitch) + 1); //TODO: instead of +1 we could also round up
                            while (i < processEnd) Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);
                            WrapLoop(f_LoopStart, f_LoopSize, loopEnd, Pos);
                        }
                    }
                }
                else { // no looping
                    while (i < Samples) { Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight); }
                }
            }

            /**
             * Atomically render a piece for the voice. For the C++
             * implementation this means rendering exactly one sample
             * point, whereas for the MMX/SSE implementation this means
             * rendering 4 sample points.
             *
             * Convenience overload: takes the pitch, the output buffers
             * (of the voice's engine channel) and the two filters from
             * @a Voice and forwards everything to the low level
             * Synthesize() overload.
             */
            template<typename VOICE_T>
            inline static void Synthesize(VOICE_T& Voice, void* Pos, sample_t* pSrc, uint& i, const float* PanLeft, const float* PanRight) {
                Synthesize(pSrc, Pos, Voice.fFinalPitch,
                           Voice.pEngineChannel->pOutputLeft, Voice.pEngineChannel->pOutputRight,
                           i, PanLeft, PanRight,
                           Voice.FilterLeft, Voice.FilterRight);
            }

            /**
             * Returns the difference to the sample's loop end, i.e.
             * (LoopEnd - current position) / Pitch, converted to an
             * integer.
             *
             * @param LoopEnd  loop end position
             * @param Pos      current playback position; read as a
             *                 single precision float by the MMX/SSE
             *                 implementation and as a double by the C++
             *                 implementation
             * @param Pitch    current pitch (divisor)
             * @returns the integer difference; negative if the position
             *          already is beyond the loop end (assuming a positive
             *          pitch)
             */
            inline static int DiffToLoopEnd(const float& LoopEnd, const void* Pos, const float& Pitch) {
                switch (IMPLEMENTATION) {
                    #if CONFIG_ASM && ARCH_X86
                    // NOTE(review): cvtss2si converts according to the
                    // current SSE rounding mode whereas the C++ path below
                    // truncates, and xmm0 is used as scratch register
                    // without being listed as clobbered - confirm both are
                    // intended.
                    case ASM_X86_MMX_SSE: {
                        int result;
                        __asm__ __volatile__ (
                            "movss    (%1), %%xmm0  #read loopend\n\t"
                            "subss    (%2), %%xmm0  #sub  pos\n\t"
                            "divss    (%3), %%xmm0  #div  by pitch\n\t"
                            "cvtss2si %%xmm0, %0    #convert to int\n\t"
                            : "=r" (result)   /* %0 */
                            : "r" (&LoopEnd), /* %1 */
                              "r" (Pos),      /* %2 */
                              "r" (&Pitch)    /* %3 */
                        );
                        return result;
                    }
                    #endif // CONFIG_ASM && ARCH_X86
                    // pure C++ implementation (thus platform independent)
                    default: {
                        const double pos = *static_cast<const double*>(Pos);
                        // Convert straight to the (signed) return type:
                        // the difference is negative once the position
                        // has passed the loop end, and converting a
                        // negative floating point value to an unsigned
                        // type is undefined behaviour.
                        return int((LoopEnd - pos) / Pitch);
                    }
                }
            }

            //TODO: this method is not in use yet, it's intended to be used for pitch=x.0f where we could use integer instead of float as playback position variable
            /**
             * Integer variant of the loop wrapping: if @a Pos has reached
             * or passed @a LoopEnd, it is moved back into the loop (the
             * overshoot modulo @a LoopSize, relative to @a LoopStart).
             *
             * @returns 1 if the position was wrapped, 0 if it was left
             *          untouched
             */
            inline static int WrapLoop(const int& LoopStart, const int& LoopSize, const int& LoopEnd, int& Pos) {
                switch (IMPLEMENTATION) {
                    // pure C++ implementation (thus platform independent)
                    default: { //TODO: we can easily eliminate the branch here
                        if (Pos >= LoopEnd) {
                            const int overshoot = Pos - LoopEnd;
                            Pos = overshoot % LoopSize + LoopStart;
                            return 1;
                        }
                        return 0;
                    }
                }
            }

            /**
             * This method handles looping of the RAM playback part of the
             * sample, thus repositioning the playback position once the
             * loop limit was reached. Note: looping of the disk streaming
             * part is handled by libgig (ReadAndLoop() method which will
             * be called by the DiskThread).
             *
             * If the position is below @a LoopEnd nothing is changed.
             * Otherwise the position is set to
             * fmod(position - LoopEnd, LoopSize) + LoopStart.
             *
             * @param LoopStart  loop start position
             * @param LoopSize   loop length
             * @param LoopEnd    loop end position
             * @param vPos       playback position to update; accessed as
             *                   single precision float by the MMX/SSE
             *                   implementation and as double by the C++
             *                   implementation
             * @returns 1 if the position was wrapped, 0 otherwise
             */
            inline static int WrapLoop(const float& LoopStart, const float& LoopSize, const float& LoopEnd, void* vPos) {
                switch (IMPLEMENTATION) {
                    #if CONFIG_ASM && ARCH_X86
                    case ASM_X86_MMX_SSE: {
                        int result = 0;
                        // NOTE(review): the operand list below declares
                        // inputs only (no outputs, no clobbers), yet the
                        // code
                        //  - overwrites %2 (the register holding &LoopEnd)
                        //    as integer scratch in the cvttss2si /
                        //    cvtsi2ss pair,
                        //  - stores to memory through %0 (result) and %1
                        //    (the position),
                        //  - uses xmm0 - xmm3 as scratch registers.
                        // Confirm that the compiler cannot miscompile this
                        // (e.g. assume 'result' still is 0).
                        // NOTE(review): the embedded "xmm2 = fmodf" remark
                        // is off: that subss leaves the remainder in xmm3,
                        // which is what gets LoopStart added and is stored.
                        __asm__ __volatile__ (
                            "movss  (%2), %%xmm0          # load LoopEnd\n\t"
                            "movss  (%1), %%xmm1          # load Pos\n\t"
                            "comiss %%xmm0, %%xmm1      # LoopEnd <> Pos\n\t"
                            "jb     1f                  # jump if no work needs to be done\n\t"
                            "movss    (%3), %%xmm2        # load LoopSize\n\t"
                            "subss    %%xmm0, %%xmm1    # Pos - LoopEnd\n\t"
                            //now the fmodf
                            "movss    %%xmm1, %%xmm3    # xmm3 = (Pos - LoopEnd)\n\t"
                            "divss    %%xmm2, %%xmm1    # (Pos - LoopEnd) / LoopSize\n\t"
                            "cvttss2si %%xmm1, %2    # convert to int\n\t"
                            "cvtsi2ss  %2, %%xmm1    # convert back to float\n\t"
                            "movss    (%4), %%xmm0      # load LoopStart\n\t"
                            "mulss    %%xmm2, %%xmm1    # LoopSize * int((Pos-LoopEnd)/LoopSize)\n\t"
                            "subss    %%xmm1, %%xmm3    # xmm2 = fmodf(Pos - LoopEnd, LoopSize)\n\t"
                            //done with fmodf
                            "addss    %%xmm0, %%xmm3      # add LoopStart\n\t"
                            "movss    %%xmm3, (%1)        # update Pos\n\t"
                            "movl    $1, (%0)             # result = 1\n\t"
                            ".balign 16 \n\t"
                            "1:\n\t"
                            :: "r" (&result),   /* %0 */
                              "r"  (vPos),      /* %1 */
                              "r"  (&LoopEnd),  /* %2 */
                              "r"  (&LoopSize), /* %3 */
                              "r"  (&LoopStart) /* %4 */
                        );
                        return result;
                    }
                    #endif // CONFIG_ASM && ARCH_X86
                    // pure C++ implementation (thus platform independent)
                    default: {
                        // the C++ implementation keeps the position as double
                        double * Pos = (double *)vPos;
                        // loop end not reached yet: nothing to do
                        if (*Pos < LoopEnd) return 0;
                        // carry the overshoot beyond the loop end over
                        // into the loop
                        *Pos = fmod(*Pos - LoopEnd, LoopSize) + LoopStart;
                        return 1;
                    }
                }
            }

            /**
             * Atomicly render a piece for the voice. For the C++
             * implementation this means rendering exactly one sample
             * point, whereas for the MMX/SSE implementation this means
             * rendering 4 sample points.
             *
             * The rendered sample points are mixed (added) to the output
             * buffers, they do not replace the buffers' content.
             *
             * @param pSrc     sample source data, passed to the resampler
             * @param Pos      playback position, passed to the resampler
             *                 as double* (C++) or float* (MMX/SSE)
             * @param Pitch    pitch, passed to the resampler
             * @param pOutL    left output buffer
             * @param pOutR    right output buffer
             * @param i        output index; advanced by 1 (C++) or by 4
             *                 (MMX/SSE)
             * @param PanL     points to the factor applied to the left
             *                 output
             * @param PanR     points to the factor applied to the right
             *                 output
             * @param FilterL  filter used if USEFILTER is set (mono signal
             *                 or left channel)
             * @param FilterR  filter used for the right channel if
             *                 USEFILTER is set (STEREO only)
             *
             * NOTE(review): neither switch has a default case, so for a
             * template argument combination without a matching case (e.g.
             * ASM_X86_MMX_SSE in a build without CONFIG_ASM && ARCH_X86)
             * nothing is rendered and @a i is not advanced.
             */
            inline static void Synthesize(sample_t* pSrc, void* Pos, float& Pitch, float* pOutL, float* pOutR, uint& i, const float* PanL, const float* PanR, Filter& FilterL, Filter& FilterR) {
                switch (IMPLEMENTATION) {
                    // pure C++ implementation (thus platform independent)
                    case CPP: {
                        switch (CHANNELS) {
                            case MONO: {
                                float samplePoint = GetNextSampleMonoCPP(pSrc, (double *)Pos, Pitch);
                                // mono signal only runs through the left filter
                                if (USEFILTER) samplePoint = FilterL.Apply(samplePoint);
                                // the same sample point goes to both outputs,
                                // each weighted with its own factor
                                pOutL[i] += samplePoint * *PanL;
                                pOutR[i] += samplePoint * *PanR;
                                i++;
                                break;
                            }
                            case STEREO: {
                                stereo_sample_t samplePoint = GetNextSampleStereoCPP(pSrc, (double *)Pos, Pitch);
                                if (USEFILTER) {
                                    samplePoint.left  = FilterL.Apply(samplePoint.left);
                                    samplePoint.right = FilterR.Apply(samplePoint.right);
                                }
                                pOutL[i] += samplePoint.left  * *PanL;
                                pOutR[i] += samplePoint.right * *PanR;
                                i++;
                                break;
                            }
                        }
                        break;
                    }
                    #if CONFIG_ASM && ARCH_X86
                    // Assembly optimization using the MMX & SSE(1) instruction set (thus only for x86)
                    //
                    // NOTE(review): this code path hands values from one
                    // asm statement to the next (and to / from the called
                    // resampler and filter methods) in fixed xmm
                    // registers, so the order of the statements must not
                    // be changed. It also refers to bqBase, bqMain and
                    // Volume, none of which is a parameter or a local
                    // variable of this method or declared in the visible
                    // part of this file - confirm where they are supposed
                    // to come from.
                    case ASM_X86_MMX_SSE: {
                        // 4 sample points are processed at once: ii is the
                        // current index rounded down to a multiple of 4
                        const int ii = i & 0xfffffffc;
                        i += 4;
                        switch (CHANNELS) {
                            case MONO: {
                                GetNext4SamplesMonoMMXSSE(pSrc, (float *)Pos, Pitch); // outputs samples in xmm2
                                if (USEFILTER) {
                                    /* prepare filter input */
                                    __asm__ __volatile__ (
                                        "movaps %xmm2,%xmm0"
                                    );
                                    FilterL.Apply4StepsSSE(&bqBase, &bqMain); // xmm0 input, xmm7 output
                                    __asm__ __volatile__ (
                                        "movaps %xmm7,%xmm2       # mono filter result -> xmm2"
                                    );
                                }
                                /* apply panorama and volume factors */
                                __asm__ __volatile__ (
                                    "movss    (%1),%%xmm0             # load pan left\n\t"
                                    "movss    (%2),%%xmm1             # load pan right\n\t"
                                    "movaps   (%0),%%xmm4             # load vca\n\t"
                                    "shufps   $0x00,%%xmm0,%%xmm0     # copy pan left to the other 3 cells\n\t"
                                    "shufps   $0x00,%%xmm1,%%xmm1     # copy pan right to the other 3 cells\n\t"
                                    "mulps    %%xmm2,%%xmm0           # left  = sample * pan_left\n\t"
                                    "mulps    %%xmm2,%%xmm1           # right = sample * pan_right\n\t"
                                    "mulps    %%xmm4,%%xmm0           # left  = vca * (sample * pan_left)\n\t"
                                    "mulps    %%xmm4,%%xmm1           # right = vca * (sample * pan_right)\n\t"
                                    : /* no output */
                                    : "r" (&Volume[ii]), /* %0 */
                                      "r" (PanL),   /* %1 */
                                      "r" (PanR)    /* %2 */
                                    : "xmm0", /* holds final left  sample (for the 4 samples) at the end */
                                      "xmm1"  /* holds final right sample (for the 4 samples) at the end */
                                );
                                break;
                            }
                            case STEREO: {
                                GetNext4SamplesStereoMMXSSE(pSrc, (float *)Pos, Pitch); // outputs samples in xmm2 (left channel) and xmm3 (right channel)
                                if (USEFILTER) {
                                    __asm__ __volatile__ (
                                        "movaps %xmm2,%xmm0     # prepare left channel for filter\n\t"
                                        "movaps %xmm3,%xmm1     # save right channel not to get overwritten by filter algorithms\n\t"
                                    );
                                    FilterL.Apply4StepsSSE(&bqBase, &bqMain); // xmm0 input, xmm7 output
                                    __asm__ __volatile__ (
                                        "movaps %xmm1,%xmm0     # prepare right channel for filter\n\t"
                                        "movaps %xmm7,%xmm1     # save filter output for left channel\n\t"
                                    );
                                    FilterR.Apply4StepsSSE(&bqBase, &bqMain); // xmm0 input, xmm7 output
                                    __asm__ __volatile__ (
                                        "movaps %xmm1,%xmm2     # result left channel -> xmm2\n\t"
                                        "movaps %xmm7,%xmm3     # result right channel -> xmm3\n\t"
                                    );
                                }
                                /* apply panorama and volume factors */
                                // NOTE(review): unlike the MONO case, this
                                // statement does not list xmm0 / xmm1 as
                                // clobbered - confirm whether that
                                // difference is intended.
                                __asm__ __volatile__ (
                                    "movss    (%1),%%xmm0             # load pan left\n\t"
                                    "movss    (%2),%%xmm1             # load pan right\n\t"
                                    "movaps   (%0),%%xmm4             # load vca\n\t"
                                    "shufps   $0x00,%%xmm0,%%xmm0     # copy pan left to the other 3 cells\n\t"
                                    "shufps   $0x00,%%xmm1,%%xmm1     # copy pan right to the other 3 cells\n\t"
                                    "mulps    %%xmm2,%%xmm0           # left  = sample_left  * pan_left\n\t"
                                    "mulps    %%xmm3,%%xmm1           # right = sample_right * pan_right\n\t"
                                    "mulps    %%xmm4,%%xmm0           # left  = vca * (sample_left  * pan_left)\n\t"
                                    "mulps    %%xmm4,%%xmm1           # right = vca * (sample_right * pan_right)\n\t"
                                    : /* no output */
                                    : "r" (&Volume[ii]), /* %0 */
                                      "r" (PanL),   /* %1 */
                                      "r" (PanR)    /* %2 */
                                );
                                break;
                            }
                        }
                        /* mix the 4 samples to the output channels */
                        // expects the final left / right sample points in
                        // xmm0 / xmm1, as left there by the statements above
                        __asm__ __volatile__ (
                            "addps  (%0),%%xmm0       # mix calculated sample(s) to output left\n\t"
                            "movaps %%xmm0,(%0)       # output to left channel\n\t"
                            "addps  (%1),%%xmm1       # mix calculated sample(s) to output right\n\t"
                            "movaps %%xmm1,(%1)       # output to right channel\n\t"
                            : /* no output */
                            : "r" (&pOutL[ii]), /* %0 - must be 16 byte aligned ! */
                              "r" (&pOutR[ii])  /* %1 - must be 16 byte aligned ! */
                        );
                    }
                    #endif // CONFIG_ASM && ARCH_X86
                }
            }
    };

}} // namespace LinuxSampler::gig

#endif // __LS_GIG_SYNTHESIZER_H__
1	/***************************************************************************
2	* *
3	* LinuxSampler - modular, streaming capable sampler *
4	* *
5	* Copyright (C) 2003, 2004 by Benno Senoner and Christian Schoenebeck *
6	* Copyright (C) 2005 Christian Schoenebeck *
7	* *
8	* This program is free software; you can redistribute it and/or modify *
9	* it under the terms of the GNU General Public License as published by *
10	* the Free Software Foundation; either version 2 of the License, or *
11	* (at your option) any later version. *
12	* *
13	* This program is distributed in the hope that it will be useful, *
14	* but WITHOUT ANY WARRANTY; without even the implied warranty of *
15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
16	* GNU General Public License for more details. *
17	* *
18	* You should have received a copy of the GNU General Public License *
19	* along with this program; if not, write to the Free Software *
20	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, *
21	* MA 02111-1307 USA *
22	***************************************************************************/
23
24	#ifndef __LS_GIG_SYNTHESIZER_H__
25	#define __LS_GIG_SYNTHESIZER_H__
26
27	#include "../../common/global.h"
28	#include "../../common/RTMath.h"
29	#include "../common/Resampler.h"
30	#include "../common/BiquadFilter.h"
31	#include "Filter.h"
32	#include "Voice.h"
33
34
35	#define SYNTHESIS_MODE_SET_INTERPOLATE(iMode,bVal) if (bVal) iMode \|= 0x01; else iMode &= ~0x01 /* (un)set mode bit 0 */
36	#define SYNTHESIS_MODE_SET_FILTER(iMode,bVal) if (bVal) iMode \|= 0x02; else iMode &= ~0x02 /* (un)set mode bit 1 */
37	#define SYNTHESIS_MODE_SET_LOOP(iMode,bVal) if (bVal) iMode \|= 0x04; else iMode &= ~0x04 /* (un)set mode bit 2 */
38	#define SYNTHESIS_MODE_SET_CHANNELS(iMode,bVal) if (bVal) iMode \|= 0x08; else iMode &= ~0x08 /* (un)set mode bit 3 */
39	#define SYNTHESIS_MODE_SET_IMPLEMENTATION(iMode,bVal) if (bVal) iMode \|= 0x10; else iMode &= ~0x10 /* (un)set mode bit 4 */
40	#define SYNTHESIS_MODE_SET_PROFILING(iMode,bVal) if (bVal) iMode \|= 0x20; else iMode &= ~0x20 /* (un)set mode bit 5 */
41
42	#define SYNTHESIS_MODE_GET_INTERPOLATE(iMode) iMode & 0x01
43	#define SYNTHESIS_MODE_GET_FILTER(iMode) iMode & 0x02
44	#define SYNTHESIS_MODE_GET_LOOP(iMode) iMode & 0x04
45	#define SYNTHESIS_MODE_GET_CHANNELS(iMode) iMode & 0x08
46	#define SYNTHESIS_MODE_GET_IMPLEMENTATION(iMode) iMode & 0x10
47
48	// that's usually gig::Voice of course, but we make it a macro so we can
49	// include this code for our synthesis benchmark which uses fake data
50	// structures
51	#ifndef VOICE
52	# define VOICE Voice
53	#endif // VOICE
54
55	namespace LinuxSampler { namespace gig {
56
57	typedef void SynthesizeFragment_Fn(VOICE&, uint, sample_t*, uint);
58
59	void* GetSynthesisFunction(const int SynthesisMode);
60	void RunSynthesisFunction(const int SynthesisMode, VOICE& voice, uint Samples, sample_t* pSrc, uint Skip);
61
62	enum channels_t {
63	MONO,
64	STEREO
65	};
66
67	/** @brief Main Synthesis algorithms for the gig::Engine
68	*
69	* Implementation of the main synthesis algorithms of the Gigasampler
70	* format capable sampler engine. This means resampling / interpolation
71	* for pitching the audio signal, looping, filter and amplification.
72	*/
73	template<implementation_t IMPLEMENTATION, channels_t CHANNELS, bool DOLOOP, bool USEFILTER, bool INTERPOLATE>
74	class Synthesizer : public __RTMath<IMPLEMENTATION>, public LinuxSampler::Resampler<INTERPOLATE> {
75
76	// declarations of derived functions (see "Name lookup,
77	// templates, and accessing members of base classes" in
78	// the gcc manual for an explanation of why this is
79	// needed).
80	using __RTMath<IMPLEMENTATION>::Mul;
81	using __RTMath<IMPLEMENTATION>::Float;
82	using LinuxSampler::Resampler<INTERPOLATE>::GetNextSampleMonoCPP;
83	using LinuxSampler::Resampler<INTERPOLATE>::GetNextSampleStereoCPP;
84	#if CONFIG_ASM && ARCH_X86
85	using LinuxSampler::Resampler<INTERPOLATE>::GetNext4SamplesMonoMMXSSE;
86	using LinuxSampler::Resampler<INTERPOLATE>::GetNext4SamplesStereoMMXSSE;
87	#endif
88
89	public:
90	/**
91	* Render audio for the current fragment for the given voice.
92	* This is the toplevel method of this class.
93	*/
94	template<typename VOICE_T>
95	inline static void SynthesizeSubFragment(VOICE_T& Voice, uint Samples, sample_t* pSrc, uint i) {
96	const float panLeft = Mul(Voice.fFinalVolume, Mul(Voice.PanLeft, Voice.pEngineChannel->GlobalPanLeft));
97	const float panRight = Mul(Voice.fFinalVolume, Mul(Voice.PanRight, Voice.pEngineChannel->GlobalPanRight));
98	if (IMPLEMENTATION == ASM_X86_MMX_SSE) {
99	float fPos = (float) Voice.Pos;
100	SynthesizeSubFragment(Voice, Samples, pSrc, i, Voice.pSample->LoopPlayCount,
101	Voice.pSample->LoopStart,
102	Voice.pSample->LoopEnd,
103	Voice.pSample->LoopSize,
104	Voice.LoopCyclesLeft,
105	(void *)&fPos,
106	&Voice.fFinalPitch,
107	&panLeft, &panRight);
108	#if CONFIG_ASM && ARCH_X86
109	if (INTERPOLATE) EMMS;
110	#endif
111	Voice.Pos = (double) fPos;
112	} else {
113	SynthesizeSubFragment(Voice, Samples, pSrc, i, Voice.pSample->LoopPlayCount,
114	Voice.pSample->LoopStart,
115	Voice.pSample->LoopEnd,
116	Voice.pSample->LoopSize,
117	Voice.LoopCyclesLeft,
118	(void *)&Voice.Pos,
119	&Voice.fFinalPitch,
120	&panLeft, &panRight);
121	}
122	}
123
124	//protected:
125
126	/**
127	* Render audio for the current fragment for the given voice.
128	* Will be called by the toplevel SynthesizeFragment() method.
129	*/
130	template<typename VOICE_T>
131	inline static void SynthesizeSubFragment(VOICE_T& Voice, uint Samples, sample_t* pSrc, uint& i, uint& LoopPlayCount, uint LoopStart, uint LoopEnd, uint LoopSize, uint& LoopCyclesLeft, void* Pos, const float* Pitch, const float* PanLeft, const float* PanRight) {
132	const float loopEnd = Float(LoopEnd);
133	const float f_LoopStart = Float(LoopStart);
134	const float f_LoopSize = Float(LoopSize);
135	if (DOLOOP) {
136	if (LoopPlayCount) {
137	// render loop (loop count limited)
138	while (i < Samples && LoopCyclesLeft) {
139	const uint processEnd = Min(Samples, i + DiffToLoopEnd(loopEnd,Pos, *Pitch) + 1); //TODO: instead of +1 we could also round up
140	while (i < processEnd) Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);
141	LoopCyclesLeft -= WrapLoop(f_LoopStart, f_LoopSize, loopEnd, Pos);
142	}
143	// render on without loop
144	while (i < Samples) Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);
145	}
146	else { // render loop (endless loop)
147	while (i < Samples) {
148	const uint processEnd = Min(Samples, i + DiffToLoopEnd(loopEnd, Pos, *Pitch) + 1); //TODO: instead of +1 we could also round up
149	while (i < processEnd) Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight);
150	WrapLoop(f_LoopStart, f_LoopSize, loopEnd, Pos);
151	}
152	}
153	}
154	else { // no looping
155	while (i < Samples) { Synthesize(Voice, Pos, pSrc, i, PanLeft, PanRight); }
156	}
157	}
158
159	/**
160	* Atomicly render a piece for the voice. For the C++
161	* implementation this means rendering exactly one sample
162	* point, whereas for the MMX/SSE implementation this means
163	* rendering 4 sample points.
164	*/
165	template<typename VOICE_T>
166	inline static void Synthesize(VOICE_T& Voice, void* Pos, sample_t* pSrc, uint& i, const float* PanLeft, const float* PanRight) {
167	Synthesize(pSrc, Pos,
168	Voice.fFinalPitch,
169	Voice.pEngineChannel->pOutputLeft,
170	Voice.pEngineChannel->pOutputRight,
171	i,
172	PanLeft,
173	PanRight,
174	Voice.FilterLeft,
175	Voice.FilterRight);
176	}
177
178	/**
179	* Returns the difference to the sample's loop end.
180	*/
181	inline static int DiffToLoopEnd(const float& LoopEnd, const void* Pos, const float& Pitch) {
182	switch (IMPLEMENTATION) {
183	#if CONFIG_ASM && ARCH_X86
184	case ASM_X86_MMX_SSE: {
185	int result;
186	__asm__ __volatile__ (
187	"movss (%1), %%xmm0 #read loopend\n\t"
188	"subss (%2), %%xmm0 #sub pos\n\t"
189	"divss (%3), %%xmm0 #div by pitch\n\t"
190	"cvtss2si %%xmm0, %0 #convert to int\n\t"
191	: "=r" (result) /* %0 */
192	: "r" (&LoopEnd), /* %1 */
193	"r" (Pos), /* %2 */
194	"r" (&Pitch) /* %3 */
195	);
196	return result;
197	}
198	#endif // CONFIG_ASM && ARCH_X86
199	// pure C++ implementation (thus platform independent)
200	default: {
201	return uint((LoopEnd - ((double )Pos)) / Pitch);
202	}
203	}
204	}
205
206	//TODO: this method is not in use yet, it's intended to be used for pitch=x.0f where we could use integer instead of float as playback position variable
207	inline static int WrapLoop(const int& LoopStart, const int& LoopSize, const int& LoopEnd, int& Pos) {
208	switch (IMPLEMENTATION) {
209	// pure C++ implementation (thus platform independent)
210	default: { //TODO: we can easily eliminate the branch here
211	if (Pos < LoopEnd) return 0;
212	Pos = (Pos - LoopEnd) % LoopSize + LoopStart;
213	return 1;
214	}
215	}
216	}
217
218	/**
219	* This method handles looping of the RAM playback part of the
220	* sample, thus repositioning the playback position once the
221	* loop limit was reached. Note: looping of the disk streaming
222	* part is handled by libgig (ReadAndLoop() method which will
223	* be called by the DiskThread).
224	*/
225	inline static int WrapLoop(const float& LoopStart, const float& LoopSize, const float& LoopEnd, void* vPos) {
226	switch (IMPLEMENTATION) {
227	#if CONFIG_ASM && ARCH_X86
228	case ASM_X86_MMX_SSE: {
229	int result = 0;
230	__asm__ __volatile__ (
231	"movss (%2), %%xmm0 # load LoopEnd\n\t"
232	"movss (%1), %%xmm1 # load Pos\n\t"
233	"comiss %%xmm0, %%xmm1 # LoopEnd <> Pos\n\t"
234	"jb 1f # jump if no work needs to be done\n\t"
235	"movss (%3), %%xmm2 # load LoopSize\n\t"
236	"subss %%xmm0, %%xmm1 # Pos - LoopEnd\n\t"
237	//now the fmodf
238	"movss %%xmm1, %%xmm3 # xmm3 = (Pos - LoopEnd)\n\t"
239	"divss %%xmm2, %%xmm1 # (Pos - LoopEnd) / LoopSize\n\t"
240	"cvttss2si %%xmm1, %2 # convert to int\n\t"
241	"cvtsi2ss %2, %%xmm1 # convert back to float\n\t"
242	"movss (%4), %%xmm0 # load LoopStart\n\t"
243	"mulss %%xmm2, %%xmm1 # LoopSize * int((Pos-LoopEnd)/LoopSize)\n\t"
244	"subss %%xmm1, %%xmm3 # xmm2 = fmodf(Pos - LoopEnd, LoopSize)\n\t"
245	//done with fmodf
246	"addss %%xmm0, %%xmm3 # add LoopStart\n\t"
247	"movss %%xmm3, (%1) # update Pos\n\t"
248	"movl $1, (%0) # result = 1\n\t"
249	".balign 16 \n\t"
250	"1:\n\t"
251	:: "r" (&result), /* %0 */
252	"r" (vPos), /* %1 */
253	"r" (&LoopEnd), /* %2 */
254	"r" (&LoopSize), /* %3 */
255	"r" (&LoopStart) /* %4 */
256	);
257	return result;
258	}
259	#endif // CONFIG_ASM && ARCH_X86
260	// pure C++ implementation (thus platform independent)
261	default: {
262	double * Pos = (double *)vPos;
263	if (*Pos < LoopEnd) return 0;
264	Pos = fmod(Pos - LoopEnd, LoopSize) + LoopStart;
265	return 1;
266	}
267	}
268	}
269
270	/**
271	* Atomicly render a piece for the voice. For the C++
272	* implementation this means rendering exactly one sample
273	* point, whereas for the MMX/SSE implementation this means
274	* rendering 4 sample points.
275	*/
276	inline static void Synthesize(sample_t* pSrc, void* Pos, float& Pitch, float* pOutL, float* pOutR, uint& i, const float* PanL, const float* PanR, Filter& FilterL, Filter& FilterR) {
277	switch (IMPLEMENTATION) {
278	// pure C++ implementation (thus platform independent)
279	case CPP: {
280	switch (CHANNELS) {
281	case MONO: {
282	float samplePoint = GetNextSampleMonoCPP(pSrc, (double *)Pos, Pitch);
283	if (USEFILTER) samplePoint = FilterL.Apply(samplePoint);
284	pOutL[i] += samplePoint * *PanL;
285	pOutR[i] += samplePoint * *PanR;
286	i++;
287	break;
288	}
289	case STEREO: {
290	stereo_sample_t samplePoint = GetNextSampleStereoCPP(pSrc, (double *)Pos, Pitch);
291	if (USEFILTER) {
292	samplePoint.left = FilterL.Apply(samplePoint.left);
293	samplePoint.right = FilterR.Apply(samplePoint.right);
294	}
295	pOutL[i] += samplePoint.left * *PanL;
296	pOutR[i] += samplePoint.right * *PanR;
297	i++;
298	break;
299	}
300	}
301	break;
302	}
303	#if CONFIG_ASM && ARCH_X86
304	// Assembly optimization using the MMX & SSE(1) instruction set (thus only for x86)
305	case ASM_X86_MMX_SSE: {
306	const int ii = i & 0xfffffffc;
307	i += 4;
308	switch (CHANNELS) {
309	case MONO: {
310	GetNext4SamplesMonoMMXSSE(pSrc, (float *)Pos, Pitch); // outputs samples in xmm2
311	if (USEFILTER) {
312	/* prepare filter input */
313	__asm__ __volatile__ (
314	"movaps %xmm2,%xmm0"
315	);
316	FilterL.Apply4StepsSSE(&bqBase, &bqMain); // xmm0 input, xmm7 output
317	__asm__ __volatile__ (
318	"movaps %xmm7,%xmm2 # mono filter result -> xmm2"
319	);
320	}
321	/* apply panorama and volume factors */
322	__asm__ __volatile__ (
323	"movss (%1),%%xmm0 # load pan left\n\t"
324	"movss (%2),%%xmm1 # load pan right\n\t"
325	"movaps (%0),%%xmm4 # load vca\n\t"
326	"shufps $0x00,%%xmm0,%%xmm0 # copy pan left to the other 3 cells\n\t"
327	"shufps $0x00,%%xmm1,%%xmm1 # copy pan right to the other 3 cells\n\t"
328	"mulps %%xmm2,%%xmm0 # left = sample * pan_left\n\t"
329	"mulps %%xmm2,%%xmm1 # right = sample * pan_right\n\t"
330	"mulps %%xmm4,%%xmm0 # left = vca * (sample * pan_left)\n\t"
331	"mulps %%xmm4,%%xmm1 # right = vca * (sample * pan_right)\n\t"
332	: /* no output */
333	: "r" (&Volume[ii]), /* %0 */
334	"r" (PanL), /* %1 */
335	"r" (PanR) /* %2 */
336	: "xmm0", /* holds final left sample (for the 4 samples) at the end */
337	"xmm1" /* holds final right sample (for the 4 samples) at the end */
338	);
339	break;
340	}
341	case STEREO: {
342	GetNext4SamplesStereoMMXSSE(pSrc, (float *)Pos, Pitch); // outputs samples in xmm2 (left channel) and xmm3 (right channel)
343	if (USEFILTER) {
344	__asm__ __volatile__ (
345	"movaps %xmm2,%xmm0 # prepare left channel for filter\n\t"
346	"movaps %xmm3,%xmm1 # save right channel not to get overwritten by filter algorithms\n\t"
347	);
348	FilterL.Apply4StepsSSE(&bqBase, &bqMain); // xmm0 input, xmm7 output
349	__asm__ __volatile__ (
350	"movaps %xmm1,%xmm0 # prepare right channel for filter\n\t"
351	"movaps %xmm7,%xmm1 # save filter output for left channel\n\t"
352	);
353	FilterR.Apply4StepsSSE(&bqBase, &bqMain); // xmm0 input, xmm7 output
354	__asm__ __volatile__ (
355	"movaps %xmm1,%xmm2 # result left channel -> xmm2\n\t"
356	"movaps %xmm7,%xmm3 # result right channel -> xmm3\n\t"
357	);
358	}
359	/* apply panorama and volume factors */
360	__asm__ __volatile__ (
361	"movss (%1),%%xmm0 # load pan left\n\t"
362	"movss (%2),%%xmm1 # load pan right\n\t"
363	"movaps (%0),%%xmm4 # load vca\n\t"
364	"shufps $0x00,%%xmm0,%%xmm0 # copy pan left to the other 3 cells\n\t"
365	"shufps $0x00,%%xmm1,%%xmm1 # copy pan right to the other 3 cells\n\t"
366	"mulps %%xmm2,%%xmm0 # left = sample_left * pan_left\n\t"
367	"mulps %%xmm3,%%xmm1 # right = sample_right * pan_right\n\t"
368	"mulps %%xmm4,%%xmm0 # left = vca * (sample_left * pan_left)\n\t"
369	"mulps %%xmm4,%%xmm1 # right = vca * (sample_right * pan_right)\n\t"
370	: /* no output */
371	: "r" (&Volume[ii]), /* %0 */
372	"r" (PanL), /* %1 */
373	"r" (PanR) /* %2 */
374	);
375	break;
376	}
377	}
378	/* mix the 4 samples to the output channels */
379	__asm__ __volatile__ (
380	"addps (%0),%%xmm0 # mix calculated sample(s) to output left\n\t"
381	"movaps %%xmm0,(%0) # output to left channel\n\t"
382	"addps (%1),%%xmm1 # mix calculated sample(s) to output right\n\t"
383	"movaps %%xmm1,(%1) # output to right channel\n\t"
384	: /* no output */
385	: "r" (&pOutL[ii]), /* %0 - must be 16 byte aligned ! */
386	"r" (&pOutR[ii]) /* %1 - must be 16 byte aligned ! */
387	);
388	}
389	#endif // CONFIG_ASM && ARCH_X86
390	}
391	}
392	};
393
394	}} // namespace LinuxSampler::gig
395
396	#endif // __LS_GIG_SYNTHESIZER_H__