/*
 * Musepack audio compression
 * Copyright (C) 1999-2004 Buschmann/Klemm/Piecha/Wolf
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

/*
 *  PCM Synthesis (quantized subband samples => PCM output)
 *  ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *  - synth.c/synth.h
 *      portable synthesis routines
 *      nonportable synthesis routines using subroutines in synth_asm.nas
 *
 *  - synth_tab.c/synth_tab.h
 *      2 tables for synthesis routines in C
 *
 *  - synth_asm.nas/synth.h
 *      special subroutines for AMD's 3DNow! and Intels SIMD
 *      another 2 tables for synthesis routines (reordered tables from synth_tab.c)
 *
 *  Note: To fully understand this module you must understand the math behind
 *  subband analysis and subband synthesis.
 */

#include <string.h>
#include "mppdec.h"


#ifdef HAVE_IEEE754_FLOAT
# define ROUND32(x)   ( floattmp = (x) + (Int32_t)0x00FD8000L, *(Int32_t*)(&floattmp) - (Int32_t)0x4B7D8000L )
#else
# define ROUND32(x)   ( (Int32_t) floor ((x) + 0.5) )
#endif

#ifdef HAVE_IEEE754_DOUBLE
# define ROUND64(x)   ( doubletmp = (x) + Dither.Add + (Int64_t)0x001FFFFD80000000L, *(Int64_t*)(&doubletmp) - (Int64_t)0x433FFFFD80000000L )
#else
# define ROUND64(x)   ( (Int64_t) floor ((x) + Dither.Add) )
#endif


static dither_t  Dither;

#if !defined USE_ASM  ||  defined MAKE_16BIT  ||  defined MAKE_24BIT  ||  defined MAKE_32BIT

static void
Calculate_New_V ( register const Float* Sample, register Float* V )
{
    // Calculating of new V-buffer values for left channel (see ISO-11172-3, p. 39)
    // based on an algorithm by Byeong Gi Lee

    register const Float*  C = Cos64;
    Float         A00, A01, A02, A03, A04, A05, A06, A07, A08, A09, A10, A11, A12, A13, A14, A15;
    Float         B00, B01, B02, B03, B04, B05, B06, B07, B08, B09, B10, B11, B12, B13, B14, B15;
    Float         tmp;
    Float         tmp2;
    Uint32_t      togg;

    ENTER(163);

    V += 32;

    A00 = Sample[ 0] + Sample[31];
    A01 = Sample[ 1] + Sample[30];
    A02 = Sample[ 3] + Sample[28];
    A03 = Sample[ 2] + Sample[29];
    A04 = Sample[ 7] + Sample[24];
    A05 = Sample[ 6] + Sample[25];
    A06 = Sample[ 4] + Sample[27];
    A07 = Sample[ 5] + Sample[26];
    A08 = Sample[15] + Sample[16];
    A09 = Sample[14] + Sample[17];
    A10 = Sample[12] + Sample[19];
    A11 = Sample[13] + Sample[18];
    A12 = Sample[ 8] + Sample[23];
    A13 = Sample[ 9] + Sample[22];
    A14 = Sample[11] + Sample[20];
    A15 = Sample[10] + Sample[21];

    B00 =  A00 + A08;
    B01 =  A01 + A09;
    B02 =  A02 + A10;
    B03 =  A03 + A11;
    B04 =  A04 + A12;
    B05 =  A05 + A13;
    B06 =  A06 + A14;
    B07 =  A07 + A15;
    B08 = (A00 - A08) * C[ 2];
    B09 = (A01 - A09) * C[ 6];
    B10 = (A02 - A10) * C[14];
    B11 = (A03 - A11) * C[10];
    B12 = (A04 - A12) * C[30];
    B13 = (A05 - A13) * C[26];
    B14 = (A06 - A14) * C[18];
    B15 = (A07 - A15) * C[22];

    A00 =  B00 + B04;
    A01 =  B01 + B05;
    A02 =  B02 + B06;
    A03 =  B03 + B07;
    A04 = (B00 - B04) * C[ 4];
    A05 = (B01 - B05) * C[12];
    A06 = (B02 - B06) * C[28];
    A07 = (B03 - B07) * C[20];
    A08 =  B08 + B12;
    A09 =  B09 + B13;
    A10 =  B10 + B14;
    A11 =  B11 + B15;
    A12 = (B08 - B12) * C[ 4];
    A13 = (B09 - B13) * C[12];
    A14 = (B10 - B14) * C[28];
    A15 = (B11 - B15) * C[20];

    B00 =  A00 + A02;
    B01 =  A01 + A03;
    B02 = (A00 - A02) * C[ 8];
    B03 = (A01 - A03) * C[24];
    B04 =  A04 + A06;
    B05 =  A05 + A07;
    B06 = (A04 - A06) * C[ 8];
    B07 = (A05 - A07) * C[24];
    B08 =  A08 + A10;
    B09 =  A09 + A11;
    B10 = (A08 - A10) * C[ 8];
    B11 = (A09 - A11) * C[24];
    B12 =  A12 + A14;
    B13 =  A13 + A15;
    B14 = (A12 - A14) * C[ 8];
    B15 = (A13 - A15) * C[24];

    A00 =  B00 + B01;
    A01 = (B00 - B01) * C[16];
    A02 =  B02 + B03;
    A03 = (B02 - B03) * C[16];
    A04 =  B04 + B05;
    A05 = (B04 - B05) * C[16];
    A06 =  B06 + B07;
    A07 = (B06 - B07) * C[16];
    A08 =  B08 + B09;
    A09 = (B08 - B09) * C[16];
    A10 =  B10 + B11;
    A11 = (B10 - B11) * C[16];
    A12 =  B12 + B13;
    A13 = (B12 - B13) * C[16];
    A14 =  B14 + B15;
    A15 = (B14 - B15) * C[16];

    V[48-32] = -A00;
    V[ 0-32] =  A01;
    V[40-32] = -A02 - (V[ 8-32] = A03);

    V[36-32] = -((V[ 4-32] = A05 + (V[12-32] = A07)) + A06);
    V[44-32] = - A04 - A06 - A07;

    V[ 6-32] = (V[10-32] = A11 + (V[14-32] = A15)) + A13;
    V[38-32] = (V[34-32] = -(V[ 2-32] = A09 + A13 + A15) - A14) + A09 - A10 - A11;
    V[46-32] = (tmp = -(A12 + A14 + A15)) - A08;
    V[42-32] = tmp - A10 - A11;

    A00 = (Sample[ 0] - Sample[31]) * C[ 1];
    A01 = (Sample[ 1] - Sample[30]) * C[ 3];
    A02 = (Sample[ 3] - Sample[28]) * C[ 7];
    A03 = (Sample[ 2] - Sample[29]) * C[ 5];
    A04 = (Sample[ 7] - Sample[24]) * C[15];
    A05 = (Sample[ 6] - Sample[25]) * C[13];
    A06 = (Sample[ 4] - Sample[27]) * C[ 9];
    A07 = (Sample[ 5] - Sample[26]) * C[11];
    A08 = (Sample[15] - Sample[16]) * C[31];
    A09 = (Sample[14] - Sample[17]) * C[29];
    A10 = (Sample[12] - Sample[19]) * C[25];
    A11 = (Sample[13] - Sample[18]) * C[27];
    A12 = (Sample[ 8] - Sample[23]) * C[17];
    A13 = (Sample[ 9] - Sample[22]) * C[19];
    A14 = (Sample[11] - Sample[20]) * C[23];
    A15 = (Sample[10] - Sample[21]) * C[21];

    B00 =  A00 + A08;
    B01 =  A01 + A09;
    B02 =  A02 + A10;
    B03 =  A03 + A11;
    B04 =  A04 + A12;
    B05 =  A05 + A13;
    B06 =  A06 + A14;
    B07 =  A07 + A15;
    B08 = (A00 - A08) * C[ 2];
    B09 = (A01 - A09) * C[ 6];
    B10 = (A02 - A10) * C[14];
    B11 = (A03 - A11) * C[10];
    B12 = (A04 - A12) * C[30];
    B13 = (A05 - A13) * C[26];
    B14 = (A06 - A14) * C[18];
    B15 = (A07 - A15) * C[22];

    A00 =  B00 + B04;
    A01 =  B01 + B05;
    A02 =  B02 + B06;
    A03 =  B03 + B07;
    A04 = (B00 - B04) * C[ 4];
    A05 = (B01 - B05) * C[12];
    A06 = (B02 - B06) * C[28];
    A07 = (B03 - B07) * C[20];
    A08 =  B08 + B12;
    A09 =  B09 + B13;
    A10 =  B10 + B14;
    A11 =  B11 + B15;
    A12 = (B08 - B12) * C[ 4];
    A13 = (B09 - B13) * C[12];
    A14 = (B10 - B14) * C[28];
    A15 = (B11 - B15) * C[20];

    B00 =  A00 + A02;
    B01 =  A01 + A03;
    B02 = (A00 - A02) * C[ 8];
    B03 = (A01 - A03) * C[24];
    B04 =  A04 + A06;
    B05 =  A05 + A07;
    B06 = (A04 - A06) * C[ 8];
    B07 = (A05 - A07) * C[24];
    B08 =  A08 + A10;
    B09 =  A09 + A11;
    B10 = (A08 - A10) * C[ 8];
    B11 = (A09 - A11) * C[24];
    B12 =  A12 + A14;
    B13 =  A13 + A15;
    B14 = (A12 - A14) * C[ 8];
    B15 = (A13 - A15) * C[24];

    A00 =  B00 + B01;
    A01 = (B00 - B01) * C[16];
    A02 =  B02 + B03;
    A03 = (B02 - B03) * C[16];
    A04 =  B04 + B05;
    A05 = (B04 - B05) * C[16];
    A06 =  B06 + B07;
    A07 = (B06 - B07) * C[16];
    A08 =  B08 + B09;
    A09 = (B08 - B09) * C[16];
    A10 =  B10 + B11;
    A11 = (B10 - B11) * C[16];
    A12 =  B12 + B13;
    A13 = (B12 - B13) * C[16];
    A14 =  B14 + B15;
    A15 = (B14 - B15) * C[16];

    V[ 5-32] = (V[11-32] = (V[13-32] = A07 + (V[15-32] = A15)) + A11) + A05 + A13;
    V[ 7-32] = (V[ 9-32] = A03 + A11 + A15) + A13;
    V[33-32] = -(V[ 1-32] = A01 + (tmp = A09 + A13 + A15)) - A14;
    V[35-32] = -(V[ 3-32] = A05 + A07 + tmp) - A06 - A14;

    V[37-32] = (tmp = -(A10 + A11 + A13 + A14 + A15)) - A05 - A06 - A07;
    V[39-32] = tmp - A02 - A03;
    V[41-32] = (tmp += A13 - A12) - A02 - A03;
    V[43-32] = tmp - (tmp2 = A04 + A06 + A07);

    V[47-32] = (tmp = -(A08 + A12 + A14 + A15)) - A00;
    V[45-32] = tmp - tmp2;

#if 1  &&  (SIZEOF_Float == 4)
    // a little improvement in speed is possible if both values directly stored by the code above
    // this would make this code unnecessary, but enlarges code above.
    togg = (Uint32_t)0x80000000L;
    ((Uint32_t*)V)[32-32] = togg + ((Uint32_t*)V)[ 0-32];
    ((Uint32_t*)V)[31-32] = togg + ((Uint32_t*)V)[ 1-32];
    ((Uint32_t*)V)[30-32] = togg + ((Uint32_t*)V)[ 2-32];
    ((Uint32_t*)V)[29-32] = togg + ((Uint32_t*)V)[ 3-32];
    ((Uint32_t*)V)[28-32] = togg + ((Uint32_t*)V)[ 4-32];
    ((Uint32_t*)V)[27-32] = togg + ((Uint32_t*)V)[ 5-32];
    ((Uint32_t*)V)[26-32] = togg + ((Uint32_t*)V)[ 6-32];
    ((Uint32_t*)V)[25-32] = togg + ((Uint32_t*)V)[ 7-32];
    ((Uint32_t*)V)[24-32] = togg + ((Uint32_t*)V)[ 8-32];
    ((Uint32_t*)V)[23-32] = togg + ((Uint32_t*)V)[ 9-32];
    ((Uint32_t*)V)[22-32] = togg + ((Uint32_t*)V)[10-32];
    ((Uint32_t*)V)[21-32] = togg + ((Uint32_t*)V)[11-32];
    ((Uint32_t*)V)[20-32] = togg + ((Uint32_t*)V)[12-32];
    ((Uint32_t*)V)[19-32] = togg + ((Uint32_t*)V)[13-32];
    ((Uint32_t*)V)[18-32] = togg + ((Uint32_t*)V)[14-32];
    ((Uint32_t*)V)[17-32] = togg + ((Uint32_t*)V)[15-32];

    ((Uint32_t*)V)[63-32] = ((Uint32_t*)V)[33-32];
    ((Uint32_t*)V)[62-32] = ((Uint32_t*)V)[34-32];
    ((Uint32_t*)V)[61-32] = ((Uint32_t*)V)[35-32];
    ((Uint32_t*)V)[60-32] = ((Uint32_t*)V)[36-32];
    ((Uint32_t*)V)[59-32] = ((Uint32_t*)V)[37-32];
    ((Uint32_t*)V)[58-32] = ((Uint32_t*)V)[38-32];
    ((Uint32_t*)V)[57-32] = ((Uint32_t*)V)[39-32];
    ((Uint32_t*)V)[56-32] = ((Uint32_t*)V)[40-32];
    ((Uint32_t*)V)[55-32] = ((Uint32_t*)V)[41-32];
    ((Uint32_t*)V)[54-32] = ((Uint32_t*)V)[42-32];
    ((Uint32_t*)V)[53-32] = ((Uint32_t*)V)[43-32];
    ((Uint32_t*)V)[52-32] = ((Uint32_t*)V)[44-32];
    ((Uint32_t*)V)[51-32] = ((Uint32_t*)V)[45-32];
    ((Uint32_t*)V)[50-32] = ((Uint32_t*)V)[46-32];
    ((Uint32_t*)V)[49-32] = ((Uint32_t*)V)[47-32];
#else
    V[32-32] = -V[ 0-32];
    V[31-32] = -V[ 1-32];
    V[30-32] = -V[ 2-32];
    V[29-32] = -V[ 3-32];
    V[28-32] = -V[ 4-32];
    V[27-32] = -V[ 5-32];
    V[26-32] = -V[ 6-32];
    V[25-32] = -V[ 7-32];
    V[24-32] = -V[ 8-32];
    V[23-32] = -V[ 9-32];
    V[22-32] = -V[10-32];
    V[21-32] = -V[11-32];
    V[20-32] = -V[12-32];
    V[19-32] = -V[13-32];
    V[18-32] = -V[14-32];
    V[17-32] = -V[15-32];

    V[63-32] =  V[33-32];
    V[62-32] =  V[34-32];
    V[61-32] =  V[35-32];
    V[60-32] =  V[36-32];
    V[59-32] =  V[37-32];
    V[58-32] =  V[38-32];
    V[57-32] =  V[39-32];
    V[56-32] =  V[40-32];
    V[55-32] =  V[41-32];
    V[54-32] =  V[42-32];
    V[53-32] =  V[43-32];
    V[52-32] =  V[44-32];
    V[51-32] =  V[45-32];
    V[50-32] =  V[46-32];
    V[49-32] =  V[47-32];
#endif
    LEAVE(163);
}

#endif


#if !defined USE_ASM  &&  !defined MAKE_16BIT  &&  !defined MAKE_24BIT  &&  !defined MAKE_32BIT


void
Synthese_Filter_16_C ( register Int2x16_t* Stream, Int* const offset, Float* Vi, const FloatArray* Yi )
{
    Int               n;
    register Int      k;
    register Int32_t  Sum;
    const Float       (*D)[16];
    const Float*      V;
#ifdef HAVE_IEEE754_FLOAT
    Float32_t         floattmp;
#endif

    ENTER(164);
    for ( n = 0; n < 36; n++, Yi++ ) {
        // shifting 64 indices upwards
        if ( (*offset -= 64) < 0 ) {
            *offset += 64*VIRT_SHIFT;
            ENTER(170);
            memmove ( Vi+64*VIRT_SHIFT, Vi, (16-1)*64*sizeof(*Vi) );
            LEAVE(170);
        }

        Calculate_New_V ( *Yi, (Float*)(V = Vi + *offset) );

        // vectoring & windowing & calculating PCM-Output
        D = Di_opt;
        for ( k = 0; k < 32; k++, V++, D++ ) {
            Sum = ROUND32 ( V [  0] * D [0][ 0]
                          + V [ 96] * D [0][ 1]
                          + V [128] * D [0][ 2]
                          + V [224] * D [0][ 3]
                          + V [256] * D [0][ 4]
                          + V [352] * D [0][ 5]
                          + V [384] * D [0][ 6]
                          + V [480] * D [0][ 7]
                          + V [512] * D [0][ 8]
                          + V [608] * D [0][ 9]
                          + V [640] * D [0][10]
                          + V [736] * D [0][11]
                          + V [768] * D [0][12]
                          + V [864] * D [0][13]
                          + V [896] * D [0][14]
                          + V [992] * D [0][15] );

            // copy to PCM
            if (Sum != (Int16_t)Sum ) {            // prevent from wrap around
                Dither.Overdrives++;
                if ( Sum > +Dither.MaxLevel ) Dither.MaxLevel = +Sum;
                if ( Sum < -Dither.MaxLevel ) Dither.MaxLevel = -Sum;
                Sum = (Sum >> 31) ^ 0x7FFF;
            }

            Stream [0] [0] = (Int16_t)Sum;
            Stream++;
        }
    }
    LEAVE(164);
    return;
}


#endif


#if defined USE_ASM  &&  !defined MAKE_16BIT  &&  !defined MAKE_24BIT  &&  !defined MAKE_32BIT

static void
Synthese_Filter_16_i387 ( Int2x16_t* Stream, Int* const offset, Float* Vi, const Float Yi[][32] )
{
    Int           n;
    Int           k;
    register Int32_t  Sum;
    const Float*  V;
    Int32_t       scratch [32+8];
    Int32_t*      scratchptr = (Int32_t*) ALIGN ( scratch, 0x20 );

    ENTER(171);
    for ( n = 0; n < 36; n++, Yi++ ) {
        // shifting 64 indices upwards
        if ( (*offset -= 64) < 0 ) {
            *offset += 64*VIRT_SHIFT;
            ENTER(170);
            memmove ( Vi+64*VIRT_SHIFT, Vi, (16-1)*64*sizeof(*Vi) );
            LEAVE(170);
        }

        ENTER(172);
        Calculate_New_V_i387 ( *Yi, (Float*)(V = Vi + *offset) );
        LEAVE(172);

        // vectoring & windowing & calculating PCM-Output
        VectorMult_i387 ( scratchptr, V );
        for ( k = 0; k < 32; k++ ) {
            Sum = scratchptr[k] - (Int32_t)0x4B7D8000L;
            if (Sum != (Int16_t)Sum ) {            // prevent from wrap around
                Dither.Overdrives++;
                if ( Sum > +Dither.MaxLevel ) Dither.MaxLevel = +Sum;
                if ( Sum < -Dither.MaxLevel ) Dither.MaxLevel = -Sum;
                Sum = (Sum >> 31) ^ 0x7FFF;
            }

            Stream [0] [0] = (Int16_t) Sum;
            Stream++;
        }
    }

    LEAVE(171);
    return;
}


static void
Synthese_Filter_16_3DNow ( Int2x16_t* Stream, Int* const offset, Float* Vi, const Float Yi[][32] )
{
    Int           n;
    Int           k;
    const Float*  V;
    Int32_t       scratch [32+8];
    Int32_t*      scratchptr = (Int32_t*) ALIGN ( scratch, 0x20 );

    ENTER(171);
    for ( n = 0; n < 36; n++, Yi++ ) {
        // shifting 64 indices upwards
        if ( (*offset -= 64) < 0 ) {
            *offset += 64*VIRT_SHIFT;
            ENTER(170);
            memcpy_dn_MMX ( Vi+64*VIRT_SHIFT, Vi, (16-1)*sizeof(*Vi) );
            //memmove ( Vi+64*VIRT_SHIFT, Vi, (16-1)*64*sizeof(*Vi) );
            LEAVE(170);
        }

        ENTER(172);
        Calculate_New_V_3DNow ( *Yi, (Float*)(V = Vi + *offset) );
        LEAVE(172);

        // vectoring & windowing & calculating PCM-Output
        VectorMult_3DNow ( scratchptr, V );
        for ( k = 0; k < 32; k++ ) {
            Stream [0] [0] = (Int16_t)(scratchptr[k] >> 16);  // access differently to make shifts obsolete ???
            Stream++;
        }
    }
    Reset_FPU_3DNow ();

    LEAVE(171);
    return;
}


static void
Calculate_New_V_SIMD ( const Float* Sample, Float* V )
{
    // Calculating of new V-buffer values for left channel (see ISO-11172-3, p. 39)
    // based on an algorithm by Byeong Gi Lee

    Float     __A [32 + 8];
    Float*    A = (Float*) ALIGN ( __A, 0x20 );
    Float     tmp;
    Float     tmp2;
    Uint32_t  togg;

    ENTER(163);

    V += 32;

    New_V_Helper2 ( A, Sample );

    V[48-32] = -A[ 0];
    V[ 0-32] =  A[ 1];
    V[40-32] = -A[ 2] - (V[ 8-32] = A[ 3]);

    V[36-32] = -((V[ 4-32] = A[ 5] + (V[12-32] = A[ 7])) + A[ 6]);
    V[44-32] = - A[ 4] - A[ 6] - A[ 7];

    V[ 6-32] = (V[10-32] = A[11] + (V[14-32] = A[15])) + A[13];
    V[38-32] = (V[34-32] = -(V[ 2-32] = A[ 9] + A[13] + A[15]) - A[14]) + A[ 9] - A[10] - A[11];
    V[46-32] = (tmp = -(A[12] + A[14] + A[15])) - A[ 8];
    V[42-32] = tmp - A[10] - A[11];

    New_V_Helper3 ( A, Sample );

    V[ 5-32] = (V[11-32] = (V[13-32] = A[ 7] + (V[15-32] = A[15])) + A[11]) + A[ 5] + A[13];
    V[ 7-32] = (V[ 9-32] = A[ 3] + A[11] + A[15]) + A[13];
    V[33-32] = -(V[ 1-32] = A[ 1] + (tmp = A[ 9] + A[13] + A[15])) - A[14];
    V[35-32] = -(V[ 3-32] = A[ 5] + A[ 7] + tmp) - A[ 6] - A[14];

    V[37-32] = (tmp = -(A[10] + A[11] + A[13] + A[14] + A[15])) - A[ 5] - A[ 6] - A[ 7];
    V[39-32] = tmp - A[ 2] - A[ 3];
    V[41-32] = (tmp += A[13] - A[12]) - A[ 2] - A[ 3];
    V[43-32] = tmp - (tmp2 = A[ 4] + A[ 6] + A[ 7]);

    V[47-32] = (tmp = -(A[ 8] + A[12] + A[14] + A[15])) - A[ 0];
    V[45-32] = tmp - tmp2;

#if 0
    New_V_Helper4 ( V );                                            // slower
#elif 1
    // a little improvement in speed is possible if both values directly stored by the code above
    // this would make this code unnecessary, but enlarges code above.
    togg = (Uint32_t)0x80000000L;
    ((Uint32_t*)V)[32-32] = togg + ((Uint32_t*)V)[ 0-32];
    ((Uint32_t*)V)[31-32] = togg + ((Uint32_t*)V)[ 1-32];
    ((Uint32_t*)V)[30-32] = togg + ((Uint32_t*)V)[ 2-32];
    ((Uint32_t*)V)[29-32] = togg + ((Uint32_t*)V)[ 3-32];
    ((Uint32_t*)V)[28-32] = togg + ((Uint32_t*)V)[ 4-32];
    ((Uint32_t*)V)[27-32] = togg + ((Uint32_t*)V)[ 5-32];
    ((Uint32_t*)V)[26-32] = togg + ((Uint32_t*)V)[ 6-32];
    ((Uint32_t*)V)[25-32] = togg + ((Uint32_t*)V)[ 7-32];
    ((Uint32_t*)V)[24-32] = togg + ((Uint32_t*)V)[ 8-32];
    ((Uint32_t*)V)[23-32] = togg + ((Uint32_t*)V)[ 9-32];
    ((Uint32_t*)V)[22-32] = togg + ((Uint32_t*)V)[10-32];
    ((Uint32_t*)V)[21-32] = togg + ((Uint32_t*)V)[11-32];
    ((Uint32_t*)V)[20-32] = togg + ((Uint32_t*)V)[12-32];
    ((Uint32_t*)V)[19-32] = togg + ((Uint32_t*)V)[13-32];
    ((Uint32_t*)V)[18-32] = togg + ((Uint32_t*)V)[14-32];
    ((Uint32_t*)V)[17-32] = togg + ((Uint32_t*)V)[15-32];

    ((Uint32_t*)V)[63-32] = ((Uint32_t*)V)[33-32];
    ((Uint32_t*)V)[62-32] = ((Uint32_t*)V)[34-32];
    ((Uint32_t*)V)[61-32] = ((Uint32_t*)V)[35-32];
    ((Uint32_t*)V)[60-32] = ((Uint32_t*)V)[36-32];
    ((Uint32_t*)V)[59-32] = ((Uint32_t*)V)[37-32];
    ((Uint32_t*)V)[58-32] = ((Uint32_t*)V)[38-32];
    ((Uint32_t*)V)[57-32] = ((Uint32_t*)V)[39-32];
    ((Uint32_t*)V)[56-32] = ((Uint32_t*)V)[40-32];
    ((Uint32_t*)V)[55-32] = ((Uint32_t*)V)[41-32];
    ((Uint32_t*)V)[54-32] = ((Uint32_t*)V)[42-32];
    ((Uint32_t*)V)[53-32] = ((Uint32_t*)V)[43-32];
    ((Uint32_t*)V)[52-32] = ((Uint32_t*)V)[44-32];
    ((Uint32_t*)V)[51-32] = ((Uint32_t*)V)[45-32];
    ((Uint32_t*)V)[50-32] = ((Uint32_t*)V)[46-32];
    ((Uint32_t*)V)[49-32] = ((Uint32_t*)V)[47-32];
#else
    V[32-32] = -V[ 0-32];
    V[31-32] = -V[ 1-32];
    V[30-32] = -V[ 2-32];
    V[29-32] = -V[ 3-32];
    V[28-32] = -V[ 4-32];
    V[27-32] = -V[ 5-32];
    V[26-32] = -V[ 6-32];
    V[25-32] = -V[ 7-32];
    V[24-32] = -V[ 8-32];
    V[23-32] = -V[ 9-32];
    V[22-32] = -V[10-32];
    V[21-32] = -V[11-32];
    V[20-32] = -V[12-32];
    V[19-32] = -V[13-32];
    V[18-32] = -V[14-32];
    V[17-32] = -V[15-32];

    V[63-32] =  V[33-32];
    V[62-32] =  V[34-32];
    V[61-32] =  V[35-32];
    V[60-32] =  V[36-32];
    V[59-32] =  V[37-32];
    V[58-32] =  V[38-32];
    V[57-32] =  V[39-32];
    V[56-32] =  V[40-32];
    V[55-32] =  V[41-32];
    V[54-32] =  V[42-32];
    V[53-32] =  V[43-32];
    V[52-32] =  V[44-32];
    V[51-32] =  V[45-32];
    V[50-32] =  V[46-32];
    V[49-32] =  V[47-32];
#endif
    LEAVE(163);
}


static void
Synthese_Filter_16_SIMD ( Int2x16_t* Stream, Int* const offset, Float* Vi, const Float Yi[][32] )
{
    Int           n;
    Int           k;
    Int32_t       Sum;
    const Float*  V;
    Int32_t       scratch [32+8];
    Int32_t*      scratchptr = (Int32_t*) ALIGN ( scratch, 0x20 );

    ENTER(241);
    for ( n = 0; n < 36; n++, Yi++ ) {
        // shifting 64 indices upwards
        if ( (*offset -= 64) < 0 ) {
            *offset += 64*VIRT_SHIFT;
            ENTER(240);
#if 0
            memcpy_dn_SIMD ( Vi+64*VIRT_SHIFT, Vi, (16-1)*sizeof(*Vi)/2 );  // is slower!!!
#else
            memcpy_dn_MMX ( Vi+64*VIRT_SHIFT, Vi, (16-1)*sizeof(*Vi) );
            Reset_FPU ();
#endif
            LEAVE(240);
        }

        ENTER(242);
        Calculate_New_V_SIMD ( *Yi, (Float*)(V = Vi + *offset) );
        LEAVE(242);

        // vectoring & windowing & calculating PCM-Output
        VectorMult_SIMD ( scratchptr, V );
        for ( k = 0; k < 32; k++ ) {
            Sum = scratchptr [k] - (Int32_t)0x537D8000L;
            if (Sum != (Int16_t)Sum ) {            // prevent from wrap around
                Dither.Overdrives++;
                if ( Sum > +Dither.MaxLevel ) Dither.MaxLevel = +Sum;
                if ( Sum < -Dither.MaxLevel ) Dither.MaxLevel = -Sum;
                Sum = (Sum >> 31) ^ 0x7FFF;
            }

            Stream [0] [0] = (Int16_t)Sum;
            Stream++;
        }
    }

    LEAVE(241);
    return;
}

#pragma warning ( disable: 4550 )
SyntheseFilter16_t
Get_Synthese_Filter ( void )
{
#ifdef USE_ASM
    if ( Has_3DNow () )
        return Synthese_Filter_16_3DNow;
    if ( Has_SIMD  () )
        return Synthese_Filter_16_SIMD;
    return Synthese_Filter_16_i387;
#else
    return Synthese_Filter_16_C;
#endif
}
#pragma warning ( default: 4550 )


#endif


static const  Uchar    Parity [256] = {  // parity
    0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,
    1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,
    1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,
    0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,
    1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,
    0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,
    0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0,1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,
    1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0
};

static Uint32_t  __r1 = 1;
static Uint32_t  __r2 = 1;


/*
 *  This is a simple random number generator with good quality for audio purposes.
 *  It consists of two polycounters with opposite rotation direction and different
 *  periods. The periods are coprime, so the total period is the product of both.
 *
 *     -------------------------------------------------------------------------------------------------
 * +-> |31:30:29:28:27:26:25:24:23:22:21:20:19:18:17:16:15:14:13:12:11:10: 9: 8: 7: 6: 5: 4: 3: 2: 1: 0|
 * |   -------------------------------------------------------------------------------------------------
 * |                                                                             |  |  |  |     |     |
 * |                                                                             +--+--+-XOR----+-----+
 * |                                                                                      |
 * +--------------------------------------------------------------------------------------+
 *
 *     -------------------------------------------------------------------------------------------------
 *     |31:30:29:28:27:26:25:24:23:22:21:20:19:18:17:16:15:14:13:12:11:10: 9: 8: 7: 6: 5: 4: 3: 2: 1: 0| <-+
 *     -------------------------------------------------------------------------------------------------   |
 *       |  |           |  |                                                                               |
 *       +--+----XOR----+--+                                                                               |
 *                |                                                                                        |
 *                +----------------------------------------------------------------------------------------+
 *
 *
 *  The first has an period of 3*5*17*257*65537, the second of 7*47*73*178481,
 *  which gives a period of 18.410.713.077.675.721.215. The result is the
 *  XORed values of both generators.
 */

static void
set_seed ( Uint32_t  seed )
{
    __r1 = seed != 0 ? seed : 1;                // 0 is a forbidden value which locks the generator
    __r2 = 1;                                   // the are several (8389119) forbidden codes. 1 is not one of them
}


Uint32_t
random_int ( void )
{
#if 1
    Uint32_t  t1, t2, t3, t4;

    t3   = t1 = __r1;   t4   = t2 = __r2;       // Parity calculation is done via table lookup, this is also available
    t1  &= 0xF5;        t2 >>= 25;              // on CPUs without parity, can be implemented in C and avoid unpredictable
    t1   = Parity [t1]; t2  &= 0x63;            // jumps and slow rotate through the carry flag operations.
    t1 <<= 31;          t2   = Parity [t2];

    return (__r1 = (t3 >> 1) | t1 ) ^ (__r2 = (t4 + t4) | t2 );
#else
    return (__r1 = (__r1 >> 1) | ((Uint32_t)Parity [__r1 & 0xF5] << 31) ) ^
           (__r2 = (__r2 << 1) |  (Uint32_t)Parity [(__r2 >> 25) & 0x63] );
#endif
}


#if defined MAKE_16BIT  ||  defined MAKE_24BIT  ||  defined MAKE_32BIT

/***********************************************************************************************************************/

static Double
Random_Equi ( Double mult )                     // gives a equal distributed random number
{                                               // between -2^31*mult and +2^31*mult
    return mult * (Int32_t) random_int ();
}

static Double
Random_Triangular ( Double mult )               // gives a triangular distributed random number
{                                               // between -2^32*mult and +2^32*mult
    return mult * ( (Double) (Int32_t) random_int () + (Double) (Int32_t) random_int () );
}


/*********************************************************************************************************************/

static const Float  F44_0 [16 + 32] = {
    (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0,
    (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0,

    (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0,
    (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0,

    (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0,
    (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0, (Float)0
};


static const Float  F44_1 [16 + 32] = {  /* SNR(w) = 4.843163 dB, SNR = -3.192134 dB */
    (Float) 0.85018292704024355931, (Float) 0.29089597350995344721, (Float)-0.05021866022121039450, (Float)-0.23545456294599161833,
    (Float)-0.58362726442227032096, (Float)-0.67038978965193036429, (Float)-0.38566861572833459221, (Float)-0.15218663390367969967,
    (Float)-0.02577543084864530676, (Float) 0.14119295297688728127, (Float) 0.22398848581628781612, (Float) 0.15401727203382084116,
    (Float) 0.05216161232906000929, (Float)-0.00282237820999675451, (Float)-0.03042794608323867363, (Float)-0.03109780942998826024,

    (Float) 0.85018292704024355931, (Float) 0.29089597350995344721, (Float)-0.05021866022121039450, (Float)-0.23545456294599161833,
    (Float)-0.58362726442227032096, (Float)-0.67038978965193036429, (Float)-0.38566861572833459221, (Float)-0.15218663390367969967,
    (Float)-0.02577543084864530676, (Float) 0.14119295297688728127, (Float) 0.22398848581628781612, (Float) 0.15401727203382084116,
    (Float) 0.05216161232906000929, (Float)-0.00282237820999675451, (Float)-0.03042794608323867363, (Float)-0.03109780942998826024,

    (Float) 0.85018292704024355931, (Float) 0.29089597350995344721, (Float)-0.05021866022121039450, (Float)-0.23545456294599161833,
    (Float)-0.58362726442227032096, (Float)-0.67038978965193036429, (Float)-0.38566861572833459221, (Float)-0.15218663390367969967,
    (Float)-0.02577543084864530676, (Float) 0.14119295297688728127, (Float) 0.22398848581628781612, (Float) 0.15401727203382084116,
    (Float) 0.05216161232906000929, (Float)-0.00282237820999675451, (Float)-0.03042794608323867363, (Float)-0.03109780942998826024,
};


static const Float  F44_2 [16 + 32] = {  /* SNR(w) = 10.060213 dB, SNR = -12.766730 dB */
    (Float) 1.78827593892108555290, (Float) 0.95508210637394326553, (Float)-0.18447626783899924429, (Float)-0.44198126506275016437,
    (Float)-0.88404052492547413497, (Float)-1.42218907262407452967, (Float)-1.02037566838362314995, (Float)-0.34861755756425577264,
    (Float)-0.11490230170431934434, (Float) 0.12498899339968611803, (Float) 0.38065885268563131927, (Float) 0.31883491321310506562,
    (Float) 0.10486838686563442765, (Float)-0.03105361685110374845, (Float)-0.06450524884075370758, (Float)-0.02939198261121969816,

    (Float) 1.78827593892108555290, (Float) 0.95508210637394326553, (Float)-0.18447626783899924429, (Float)-0.44198126506275016437,
    (Float)-0.88404052492547413497, (Float)-1.42218907262407452967, (Float)-1.02037566838362314995, (Float)-0.34861755756425577264,
    (Float)-0.11490230170431934434, (Float) 0.12498899339968611803, (Float) 0.38065885268563131927, (Float) 0.31883491321310506562,
    (Float) 0.10486838686563442765, (Float)-0.03105361685110374845, (Float)-0.06450524884075370758, (Float)-0.02939198261121969816,

    (Float) 1.78827593892108555290, (Float) 0.95508210637394326553, (Float)-0.18447626783899924429, (Float)-0.44198126506275016437,
    (Float)-0.88404052492547413497, (Float)-1.42218907262407452967, (Float)-1.02037566838362314995, (Float)-0.34861755756425577264,
    (Float)-0.11490230170431934434, (Float) 0.12498899339968611803, (Float) 0.38065885268563131927, (Float) 0.31883491321310506562,
    (Float) 0.10486838686563442765, (Float)-0.03105361685110374845, (Float)-0.06450524884075370758, (Float)-0.02939198261121969816,
};


static const Float  F44_3 [16 + 32] = {  /* SNR(w) = 15.382598 dB, SNR = -29.402334 dB */
    (Float) 2.89072132015058161445, (Float) 2.68932810943698754106, (Float) 0.21083359339410251227, (Float)-0.98385073324997617515,
    (Float)-1.11047823227097316719, (Float)-2.18954076314139673147, (Float)-2.36498032881953056225, (Float)-0.95484132880101140785,
    (Float)-0.23924057925542965158, (Float)-0.13865235703915925642, (Float) 0.43587843191057992846, (Float) 0.65903257226026665927,
    (Float) 0.24361815372443152787, (Float)-0.00235974960154720097, (Float) 0.01844166574603346289, (Float) 0.01722945988740875099,

    (Float) 2.89072132015058161445, (Float) 2.68932810943698754106, (Float) 0.21083359339410251227, (Float)-0.98385073324997617515,
    (Float)-1.11047823227097316719, (Float)-2.18954076314139673147, (Float)-2.36498032881953056225, (Float)-0.95484132880101140785,
    (Float)-0.23924057925542965158, (Float)-0.13865235703915925642, (Float) 0.43587843191057992846, (Float) 0.65903257226026665927,
    (Float) 0.24361815372443152787, (Float)-0.00235974960154720097, (Float) 0.01844166574603346289, (Float) 0.01722945988740875099,

    (Float) 2.89072132015058161445, (Float) 2.68932810943698754106, (Float) 0.21083359339410251227, (Float)-0.98385073324997617515,
    (Float)-1.11047823227097316719, (Float)-2.18954076314139673147, (Float)-2.36498032881953056225, (Float)-0.95484132880101140785,
    (Float)-0.23924057925542965158, (Float)-0.13865235703915925642, (Float) 0.43587843191057992846, (Float) 0.65903257226026665927,
    (Float) 0.24361815372443152787, (Float)-0.00235974960154720097, (Float) 0.01844166574603346289, (Float) 0.01722945988740875099
};


static Double
scalar16 ( const Float* x, const Float* y )
{
    return x[ 0]*y[ 0] + x[ 1]*y[ 1] + x[ 2]*y[ 2] + x[ 3]*y[ 3]
         + x[ 4]*y[ 4] + x[ 5]*y[ 5] + x[ 6]*y[ 6] + x[ 7]*y[ 7]
         + x[ 8]*y[ 8] + x[ 9]*y[ 9] + x[10]*y[10] + x[11]*y[11]
         + x[12]*y[12] + x[13]*y[13] + x[14]*y[14] + x[15]*y[15];
}


void
Init_Dither ( Int bits, int shapingtype, Double dither )
{
    static Uint8_t        default_dither [] = { 92, 92, 88, 84, 81, 78, 74, 67,  0,  0 };
    static const Float*   F              [] = { F44_0, F44_1, F44_2, F44_3 };
    int                   index;

    if (bits > SAMPLE_SIZE)
        bits = SAMPLE_SIZE;

    if (shapingtype < 0) shapingtype = 0;
    if (shapingtype > 3) shapingtype = 3;
    index = bits - 11 - shapingtype;
    if (index < 0) index = 0;
    if (index > 9) index = 9;

    memset ( Dither.ErrorHistory , 0, sizeof (Dither.ErrorHistory ) );
    memset ( Dither.DitherHistory, 0, sizeof (Dither.DitherHistory) );

    Dither.FilterCoeff = F [shapingtype];
    Dither.Mask   = ((Uint64_t)-1) << (32 - bits);
#ifdef HAVE_IEEE754_DOUBLE
    Dither.Add    = 0.5     * ((1L << (32 - bits)) - 1);
#else
    Dither.Add    = 0.5     * ((1L << (32 - bits)) - 0);
#endif
    Dither.Dither = (dither >= 0.00 ? dither : 0.01*default_dither[index]) / (((Int64_t)1) << bits);
    Dither.NoShaping = shapingtype == 0;
}


void
Synthese_Filter_32_C ( register Int2x32_t* Stream, Int* const offset, Float* Vi, const FloatArray* Yi, Uint channel )
{
    Int               n;
    register Int      k;
    Double            Sum;
    Double            Sum2;
    Int64_t           val;
    const Float       (*D)[16];
    const Float*      V;
#ifdef HAVE_IEEE754_DOUBLE
    Float64_t         doubletmp;
#endif

    ENTER(167);
    for ( n = 0; n < 36; n++, Yi++ ) {

        // shifting upwards by 64 Indices
        if ( (*offset -= 64) < 0 ) {
            *offset += 64*VIRT_SHIFT;
            ENTER(170);
            memmove ( Vi+64*VIRT_SHIFT, Vi, (16-1)*64*sizeof(*Vi) );
            LEAVE(170);
        }

        Calculate_New_V ( *Yi, (Float*)(V = Vi + *offset) );

        // vectoring & windowing & calculating PCM-Output
        D = Di_opt;
        for ( k = 0; k < 32; k++, V++, D++ ) {
            Sum = ( V [  0] * D [0][ 0]
                  + V [ 96] * D [0][ 1]
                  + V [128] * D [0][ 2]
                  + V [224] * D [0][ 3]
                  + V [256] * D [0][ 4]
                  + V [352] * D [0][ 5]
                  + V [384] * D [0][ 6]
                  + V [480] * D [0][ 7]
                  + V [512] * D [0][ 8]
                  + V [608] * D [0][ 9]
                  + V [640] * D [0][10]
                  + V [736] * D [0][11]
                  + V [768] * D [0][12]
                  + V [864] * D [0][13]
                  + V [896] * D [0][14]
                  + V [992] * D [0][15] ) * 65536.;

#if 1
            // Noise shaping and dithering
            if ( Dither.NoShaping ) {
                Double  tmp = Random_Equi ( Dither.Dither );
                Sum2 = tmp - Dither.LastRandomNumber [channel];
                Dither.LastRandomNumber [channel] = tmp;
                Sum2 = Sum += Sum2;
            }
            else {
                Sum2  = Random_Triangular ( Dither.Dither ) - scalar16 ( Dither.DitherHistory[channel], Dither.FilterCoeff + k );
                Sum  += Dither.DitherHistory [channel] [(-1-k)&15] = Sum2;
                Sum2  = Sum + scalar16 ( Dither.ErrorHistory [channel], Dither.FilterCoeff + k );
            }

            val = ROUND64 (Sum2)  &  Dither.Mask;
            if ( val != (Int32_t) val ) {
                memset ( Dither.ErrorHistory [channel], 0, sizeof (Dither.ErrorHistory[0]) );
                Dither.Overdrives++;
                if ( val > +Dither.MaxLevel ) Dither.MaxLevel = +val;
                if ( val < -Dither.MaxLevel ) Dither.MaxLevel = -val;
                val = (val >> 63) ^ 0x7FFFFFFF;
            } else {
                Dither.ErrorHistory [channel] [(-1-k)&15] = (Float)(Sum - val);
            }
#else
            // Noise shaping and dithering
            Sum2  = Random ( Dither.Dither ) - scalar16 ( Dither.DitherHistory[channel], Dither.FilterCoeff );
            memmove ( Dither.DitherHistory [channel] + 1, Dither.DitherHistory [channel] + 0, sizeof(Dither.DitherHistory[0]) - sizeof(Dither.DitherHistory[0][0]) );
            Dither.DitherHistory [channel] [0] = Sum2;

            Sum   = Sum + Dither.DitherHistory [channel] [0];
            Sum2  = Sum + scalar16 ( Dither.ErrorHistory [channel], Dither.FilterCoeff );
            val   = (Int64_t) (floor ( Sum2 + Dither.Add ))  &  Dither.Mask;
            if ( val != (Int32_t) val ) {
                memset ( Dither.ErrorHistory [channel], 0, sizeof (Dither.ErrorHistory[0]) );
                Dither.Overdrives++;
                if ( val > +Dither.MaxLevel ) Dither.MaxLevel = +val;
                if ( val < -Dither.MaxLevel ) Dither.MaxLevel = -val;
                val = (val >> 63) ^ 0x7FFFFFFF;
            } else {
                memmove ( Dither.ErrorHistory [channel] + 1, Dither.ErrorHistory [channel] + 0, sizeof(Dither.ErrorHistory[0]) - sizeof(Dither.ErrorHistory[0][0]) );
                Dither.ErrorHistory [channel] [0] = Sum - val;
            }
#endif

            // copy to PCM
            Stream [0] [0] = (Int32_t)val;
            Stream++;
        }
    }
    LEAVE(167);
    return;
}

#endif /* defined MAKE_16BIT  ||  defined MAKE_24BIT  ||  defined MAKE_32BIT */


void
OverdriveReport ( void )
{
    if ( Dither.Overdrives > 0 ) {
#if defined MAKE_16BIT  ||  defined MAKE_24BIT  ||  defined MAKE_32BIT
        stderr_printf ( "\n%5lu Overdrive%s, maximum level %.1f, rerun with --scale %.5f\n",
                        (Ulong)Dither.Overdrives, Dither.Overdrives==1 ? "" : "s", (Double)(Int64_t)Dither.MaxLevel*(1/65536.), (Int64_t)(Dither.Mask & 0x7FFFFFFFL)/((Double)(Int64_t)Dither.MaxLevel+1) - 0.5e-5 );
#else
        stderr_printf ( "\n%5lu Overdrive%s, maximum level %lu, rerun with --scale %.5f\n",
                        (Ulong)Dither.Overdrives, Dither.Overdrives==1 ? "" : "s", (Ulong)Dither.MaxLevel, (Double)0x7FFF/(Dither.MaxLevel+1) - 0.5e-5 );
#endif
        Dither.Overdrives = 0;
        Dither.MaxLevel   = 0;
    }
}

/* end of synth.c */