/*
 * Mesa 3-D graphics library
 * Version:  4.0.3
 *
 * Copyright (C) 1999-2002  Brian Paul   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
 * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */


/*
 * Faster arithmetic functions.  If the FAST_MATH preprocessor symbol is
 * defined on the command line (-DFAST_MATH) then we'll use some (hopefully)
 * faster functions for sqrt(), etc.
 */


#ifndef MMATH_H
#define MMATH_H


#include "glheader.h"
/* Do not reference mtypes.h from this file.
 */

/*
 * Set the x86 FPU control word to guarantee only 32 bits of precision
 * are stored in registers.  Allowing the FPU to store more introduces
 * differences between situations where numbers are pulled out of memory
 * vs. situations where the compiler is able to optimize register usage.
 *
 * In the worst case, we force the compiler to use a memory access to
 * truncate the float, by specifying the 'volatile' keyword.
 */
#if defined(__GNUC__) && defined(__i386__)

/* Hardware default: All exceptions masked, extended double precision,
 * round to nearest.  IEEE compliant.
 *
 * This is the control word value loaded by START_FAST_MATH when
 * NO_FAST_MATH is defined.
 */
#define DEFAULT_X86_FPU		0x037f

/* All exceptions masked, single precision, round to nearest.
 *
 * This is the control word value loaded by START_FAST_MATH when
 * NO_FAST_MATH is not defined.  It differs from DEFAULT_X86_FPU only in
 * bits 0x0340 being clear.
 */
#define FAST_X86_FPU		0x003f

/* Set it up how we want it.  The fldcw instruction will cause any
 * pending FP exceptions to be raised prior to entering the block, and
 * we clear any pending exceptions before exiting the block.  Hence, asm
 * code has free rein over the FPU while in the fast math block.
 *
 * START_FAST_MATH(x):
 *   x must be an lvalue.  fnstcw stores the current FPU control word
 *   into it, then fldcw loads a new control word from the static 'mask'.
 *   The mask is DEFAULT_X86_FPU when NO_FAST_MATH is defined and
 *   FAST_X86_FPU otherwise.  Hand the same x to END_FAST_MATH, which
 *   reloads the control word from it.
 *
 * NOTE(review): 'mask' is a GLuint while the control word is presumably
 * 16 bits wide; this appears to rely on only the low half of the storage
 * being accessed -- confirm.
 */
#if defined(NO_FAST_MATH)
#define START_FAST_MATH(x)						\
do {									\
   static GLuint mask = DEFAULT_X86_FPU;				\
   __asm__ ( "fnstcw %0" : "=m" (*&(x)) );				\
   __asm__ ( "fldcw %0" : : "m" (mask) );				\
} while (0)
#else
#define START_FAST_MATH(x)						\
do {									\
   static GLuint mask = FAST_X86_FPU;					\
   __asm__ ( "fnstcw %0" : "=m" (*&(x)) );				\
   __asm__ ( "fldcw %0" : : "m" (mask) );				\
} while (0)
#endif

/* Put it back how the application had it, and clear any exceptions that
 * may have occurred in the FAST_MATH block.
 *
 * END_FAST_MATH(x):
 *   x is the lvalue previously filled in by START_FAST_MATH.  fnclex is
 *   issued first, then fldcw reloads the control word from x.
 */
#define END_FAST_MATH(x)						\
do {									\
   __asm__ ( "fnclex ; fldcw %0" : : "m" (*&(x)) );			\
} while (0)

/* Set for every GCC/i386 build, whether or not NO_FAST_MATH is defined. */
#define HAVE_FAST_MATH

#elif defined(__WATCOMC__) && !defined(NO_FAST_MATH)

/* This is the watcom specific inline assembly version of setcw and getcw.
 *
 * START_FAST_MATH2(x): stores the FPU control word to *x, ORs 0x3f into
 * that stored word in memory, and loads the result back into the FPU.
 * END_FAST_MATH2(x): loads the control word from *x.
 *
 * NOTE(review): because the OR is applied to *x itself, the value that
 * END_FAST_MATH2 later reloads is the modified word (original | 0x3f),
 * not the control word the application had on entry.  Also, unlike the
 * GCC variant, which loads a complete replacement word, this only sets
 * bits 0x3f and leaves every other bit of the control word as it was.
 * Confirm whether both differences are intended.
 */

void START_FAST_MATH2(unsigned short *x);
#pragma aux START_FAST_MATH2 =          \
    "fstcw   word ptr [esi]"            \
    "or      word ptr [esi], 0x3f"      \
    "fldcw   word ptr [esi]"            \
    parm [esi]                          \
    modify exact [];

void END_FAST_MATH2(unsigned short *x);
#pragma aux END_FAST_MATH2 =            \
    "fldcw   word ptr [esi]"            \
    parm [esi]                          \
    modify exact [];

/* x is used as "& x", so it must be a plain lvalue (no parentheses are
 * added around it).
 */
#define START_FAST_MATH(x)  START_FAST_MATH2(& x)
#define END_FAST_MATH(x)  END_FAST_MATH2(& x)

/* Disabled _asm-based variant of the two helpers above, kept for
 * reference only:
 *
__inline START_FAST_MATH(unsigned short x)
    {
    _asm {
        fstcw   ax
        mov     x , ax
        or      ax, 0x3f
        fldcw   ax
        }
    }

__inline END_FAST_MATH(unsigned short x)
    {
    _asm {
        fldcw   x
        }
    }
*/
#define HAVE_FAST_MATH

#else
/* Fallback for all other compilers/targets: no control word is touched.
 * The argument is only evaluated and discarded, which keeps callers'
 * variables "used".
 */
#define START_FAST_MATH(x) (void)(x)
#define END_FAST_MATH(x)   (void)(x)

/* The mac float really is a float, with the same precision as a
 * single precision 387 float.
 *
 * Hence HAVE_FAST_MATH is still advertised on these targets even though
 * the two macros above do nothing.
 */
#if defined(macintosh) || defined(__powerpc__)
#define HAVE_FAST_MATH
#endif

#endif


/*
 * Square root
 *
 * GL_SQRT(X) selects the square root implementation:
 *   - FAST_MATH not defined:                  sqrt()
 *   - FAST_MATH with Watcom and USE_X86_ASM:  asm_sqrt()
 *   - FAST_MATH otherwise:                    gl_sqrt()
 * gl_sqrt() is only declared here; its definition lives elsewhere.
 */

extern float gl_sqrt(float x);

#if !defined(FAST_MATH)
#  define GL_SQRT(X)  sqrt(X)
#elif defined(__WATCOMC__) && defined(USE_X86_ASM)
#  define GL_SQRT(X)  asm_sqrt(X)
#else
#  define GL_SQRT(X)  gl_sqrt(X)
#endif


/*
 * Squared lengths of 2- and 3-element vectors.  V is evaluated once per
 * component reference, so avoid arguments with side effects.
 */
#define LEN_SQUARED_2FV( V ) ((V)[0]*(V)[0]+(V)[1]*(V)[1])
#define LEN_SQUARED_3FV( V ) ((V)[0]*(V)[0]+(V)[1]*(V)[1]+(V)[2]*(V)[2])

/*
 * Lengths of 2- and 3-element vectors: GL_SQRT of the squared length.
 */
#define LEN_2FV( V ) (GL_SQRT(LEN_SQUARED_2FV(V)))
#define LEN_3FV( V ) (GL_SQRT(LEN_SQUARED_3FV(V)))

/*
 * Normalize a 3-element vector to unit length.
 * A vector whose squared length compares equal to zero is left untouched.
 * The macro declares a local named 'len'; do not pass an expression that
 * refers to a variable of that name.
 */
#define NORMALIZE_3FV( V )				\
do {							\
   GLfloat len = LEN_SQUARED_3FV(V);			\
   if (len != 0.0F) {					\
      len = (GLfloat) (1.0 / GL_SQRT(len));		\
      (V)[0] = (GLfloat) ((V)[0] * len);		\
      (V)[1] = (GLfloat) ((V)[1] * len);		\
      (V)[2] = (GLfloat) ((V)[2] * len);		\
   }							\
} while(0)


/*
 * Single precision ceiling, floor, and absolute value functions.
 *
 * Everywhere except sparc these map to the float variants (ceilf, floorf,
 * fabsf); on sparc the double variants are used instead.
 */
#if !defined(__sparc__)
#  define CEILF(x)   ceilf(x)
#  define FLOORF(x)  floorf(x)
#  define FABSF(x)   fabsf(x)
#else /* __sparc__ -- XXX this probably isn't the ideal test */
#  define CEILF(x)   ceil(x)
#  define FLOORF(x)  floor(x)
#  define FABSF(x)   fabs(x)
#endif


/* USE_IEEE enables the code paths in this file that inspect or build
 * floats through their bit patterns (ifloor/iceil, the float->ubyte
 * conversions, COPY_FLOAT, IS_INF_OR_NAN).  It is turned on for the
 * architectures listed here; alpha qualifies only with __IEEE_FLOAT or
 * when not building for VMS.
 */
#if defined(__i386__) || defined(__sparc__) || defined(__s390x__) || \
    defined(__powerpc__) || defined(__x86_64__) || \
    ( defined(__alpha__) && ( defined(__IEEE_FLOAT) || !defined(VMS) ) )
#define USE_IEEE
#endif


/* Access the storage of x through an fi_type pointer and select its 'i'
 * member.  x must be an lvalue (its address is taken).  fi_type is not
 * declared in this file; presumably it is a float/int union -- confirm.
 */
#define GET_FLOAT_BITS(x) ((fi_type *) &(x))->i

/*
 * Float -> Int conversions (rounding, floor, ceiling)
 */

#if defined(USE_SPARC_ASM) && defined(__GNUC__) && defined(__sparc__)

/* SPARC assembly float -> int conversion: a single fstoi instruction
 * converts f and the result is returned.
 * NOTE(review): the rounding behaviour of fstoi is not visible here;
 * confirm it matches the round-to-nearest intent of IROUND.
 */
static INLINE int iround(float f)
{
       int r;
       __asm__ ("fstoi %1, %0" : "=f" (r) : "f" (f));
       return r;
}

#define IROUND(x)  iround(x)

#elif defined(USE_X86_ASM) && defined(__GNUC__) && defined(__i386__)


/* x86 assembly float -> int conversion.  f is supplied on the top of the
 * FPU register stack ("t" constraint) and fistpl stores it as an integer
 * into r; the "st" clobber records that the stack top is consumed.
 * Presumably the result follows the FPU's current rounding mode -- confirm.
 */
static INLINE int iround(float f)
{
   int r;
   __asm__ ("fistpl %0" : "=m" (r) : "t" (f) : "st");
   return r;
}

#define IROUND(x)  iround(x)

/*
 * IEEE floor for computers that round to nearest or even.
 * 'f' must be between -4194304 and 4194303.
 * This floor operation is done by "(iround(f + .5) + iround(f - .5)) >> 1",
 * but uses some IEEE specific tricks for better speed.
 * Contributed by Josh Vanderhoof
 *
 * How it works: the two biased sums (3 << 22) + 0.5 +/- f are stored as
 * single precision floats straight into the int variables ai and bi
 * (fstps writes float bits into int storage).  The difference of the two
 * bit patterns is then halved with a right shift.
 */
static INLINE int ifloor(float f)
{
   int ai, bi;
   double af, bf;
   af = (3 << 22) + 0.5 + (double)f;
   bf = (3 << 22) + 0.5 - (double)f;
   /* GCC generates an extra fstp/fld without this. */
   __asm__ ("fstps %0" : "=m" (ai) : "t" (af) : "st");
   __asm__ ("fstps %0" : "=m" (bi) : "t" (bf) : "st");
   return (ai - bi) >> 1;
}

#define IFLOOR(x)  ifloor(x)

/*
 * IEEE ceil for computers that round to nearest or even.
 * 'f' must be between -4194304 and 4194303.
 * This ceil operation is done by "(iround(f + .5) + iround(f - .5) + 1) >> 1",
 * but uses some IEEE specific tricks for better speed.
 * Contributed by Josh Vanderhoof
 *
 * Same scheme as ifloor(): the biased sums are stored as single precision
 * bit patterns into ai and bi; here 1 is added to their difference before
 * the halving shift.
 */
static INLINE int iceil(float f)
{
   int ai, bi;
   double af, bf;
   af = (3 << 22) + 0.5 + (double)f;
   bf = (3 << 22) + 0.5 - (double)f;
   /* GCC generates an extra fstp/fld without this. */
   __asm__ ("fstps %0" : "=m" (ai) : "t" (af) : "st");
   __asm__ ("fstps %0" : "=m" (bi) : "t" (bf) : "st");
   return (ai - bi + 1) >> 1;
}

#define ICEIL(x)  iceil(x)


#elif defined(USE_X86_ASM) && defined(__MSC__) && defined(__WIN32__)


/* MSVC-style inline assembly float -> int conversion: load f onto the
 * FPU stack (fld) and store it as an integer into r (fistp).
 * Only IROUND is provided for this compiler; IFLOOR and ICEIL fall back
 * to the default definitions further down.
 */
static INLINE int iround(float f)
{
   int r;
   _asm {
	 fld f
	 fistp r
	}
   return r;
}

#define IROUND(x)  iround(x)


#elif defined(USE_X86_ASM) && defined(__WATCOMC__)


/* Watcom inline float -> integer conversion.  The argument arrives in an
 * 8087 register (parm [8087]); a stack slot is reserved by pushing eax,
 * fistp stores the integer there, and pop eax retrieves it as the return
 * value (value [eax]).  Note the declared return type is long, unlike the
 * int returned by the other iround() variants.
 */
long iround(float f);
#pragma aux iround =                        \
	"push   eax"                        \
	"fistp  dword ptr [esp]"            \
	"pop    eax"                        \
	parm [8087]                         \
	value [eax]                         \
	modify exact [eax];

#define IROUND(x)  iround(x)

/* Watcom inline square root: a single fsqrt applied to the 8087 register
 * holding the argument; the result is returned in an 8087 register.
 * Selected by GL_SQRT when FAST_MATH and USE_X86_ASM are both defined.
 */
float asm_sqrt (float x);
#pragma aux asm_sqrt =                      \
	"fsqrt"                             \
	parm [8087]                         \
	value [8087]                        \
	modify exact [];


#endif /* assembly/optimized IROUND, IROUND_POS, IFLOOR, ICEIL macros */


/* default IROUND macro
 *
 * Rounds to nearest by moving f half a unit away from zero and then
 * truncating with an int cast.  f is evaluated twice.
 */
#ifndef IROUND
#define IROUND(f)  ((int) (((f) < 0.0F) ? ((f) - 0.5F) : ((f) + 0.5F)))
#endif


/* default IROUND_POS macro
 *
 * Same result as IROUND.  In DEBUG builds it additionally ASSERTs that
 * the argument is not negative.
 */
#ifndef IROUND_POS
#  if !defined(DEBUG)
#    define IROUND_POS(f) (IROUND(f))
#  else
#    define IROUND_POS(f) (ASSERT((f) >= 0.0F), IROUND(f))
#  endif
#endif /* IROUND_POS */


/* default IFLOOR macro */
#ifndef IFLOOR
/*
 * Portable floor(f) returning an int.
 *
 * With USE_IEEE: the sums (3 << 22) + 0.5 +/- f are converted to single
 * precision and their bit patterns are read back as ints through a union;
 * half of the difference of those patterns is the floor.  This is the
 * same formula as the x86 assembly variant, whose documented input range
 * is -4194304 .. 4194303.
 *
 * Without USE_IEEE: round with IROUND and step down by one if the rounded
 * value ended up above f.
 */
static INLINE int ifloor(float f)
{
#if !defined(USE_IEEE)
   int result = IROUND(f);
   if (result > f)
      result--;
   return result;
#else
   union { float f; int i; } hi, lo;
   const double bias = (3 << 22) + 0.5;
   const double above = bias + (double)f;
   const double below = bias - (double)f;

   hi.f = above;
   lo.f = below;
   return (hi.i - lo.i) >> 1;
#endif
}
#define IFLOOR(x)  ifloor(x)
#endif /* IFLOOR */


/* default ICEIL macro */
#ifndef ICEIL
/*
 * Portable ceil(f) returning an int.
 *
 * With USE_IEEE: the sums (3 << 22) + 0.5 +/- f are converted to single
 * precision and their bit patterns are read back as ints through a union;
 * the difference of those patterns plus one, halved, is the ceiling.
 * This is the same formula as the x86 assembly variant, whose documented
 * input range is -4194304 .. 4194303.
 *
 * Without USE_IEEE: round with IROUND and step up by one if the rounded
 * value ended up below f.
 */
static INLINE int iceil(float f)
{
#if !defined(USE_IEEE)
   int result = IROUND(f);
   if (result < f)
      result++;
   return result;
#else
   union { float f; int i; } hi, lo;
   const double bias = (3 << 22) + 0.5;
   const double above = bias + (double)f;
   const double below = bias - (double)f;

   hi.f = above;
   lo.f = below;
   return (hi.i - lo.i + 1) >> 1;
#endif
}
#define ICEIL(x)  iceil(x)
#endif /* ICEIL */


/*
 * Convert unclamped or clamped ([0,1]) floats to ubytes for color
 * conversion only.  These functions round to the nearest int.
 *
 * All of these are statement-like macros that assign to 'b'; they are
 * not expressions yielding a value.
 */
#define IEEE_ONE 0x3f800000	/* bit pattern of 1.0F */
#define IEEE_0996 0x3f7f0000	/* bit pattern of 0.99609375F (255/256);
                                   used in macro below only */

#if defined(USE_IEEE) && !defined(DEBUG)

/*
 * This function/macro is sensitive to precision.  Test carefully
 * if you change it.
 *
 * The float's bits are compared as an unsigned integer against
 * IEEE_0996.  Patterns at or above it are either floats >= 255/256
 * (result 255) or have the sign bit set (result 0); the signed
 * reinterpretation tells the two apart.  Everything else is scaled by
 * 255/256 and offset by 32768.0F, after which the low byte of the bit
 * pattern is taken as the result.
 */
#define UNCLAMPED_FLOAT_TO_UBYTE(b, f)					\
        do {								\
           union { GLfloat r; GLuint i; } __tmp;			\
           __tmp.r = (f);						\
           b = ((__tmp.i >= IEEE_0996)				\
               ? ((GLint)__tmp.i < 0) ? (GLubyte)0 : (GLubyte)255	\
               : (__tmp.r = __tmp.r*(255.0F/256.0F) + 32768.0F,		\
                  (GLubyte)__tmp.i));					\
        } while (0)

/* The bit-pattern version already clamps, so it serves both cases. */
#define CLAMPED_FLOAT_TO_UBYTE(b, f) \
        UNCLAMPED_FLOAT_TO_UBYTE(b, f)

/* Copy a float as an integer bit pattern through fi_type (declared
 * elsewhere).  Both arguments must be lvalues.
 */
#define COPY_FLOAT( dst, src )					\
	((fi_type *) &(dst))->i = ((fi_type *) &(src))->i

#else /* USE_IEEE */

/* Portable versions: clamp (unclamped variant only), scale by 255 and
 * round with IROUND.  CLAMP is not defined in this file.
 */
#define UNCLAMPED_FLOAT_TO_UBYTE(b, f) \
	b = ((GLubyte) IROUND(CLAMP(f, 0.0F, 1.0F) * 255.0F))

#define CLAMPED_FLOAT_TO_UBYTE(b, f) \
	b = ((GLubyte) IROUND((f) * 255.0F))

/* Plain assignment when bit-level copies are not in use. */
#define COPY_FLOAT( dst, src )		(dst) = (src)

#endif /* USE_IEEE */


/*
 * Integer / float conversion for colors, normals, etc.
 */

/* Convert GLubyte in [0,255] to GLfloat in [0.0,1.0].
 * Implemented as a lookup in a 256-entry table that is defined elsewhere.
 */
extern float _mesa_ubyte_to_float_color_tab[256];
#define UBYTE_TO_FLOAT(u) _mesa_ubyte_to_float_color_tab[(unsigned int)(u)]

/* Convert GLfloat in [0.0,1.0] to GLubyte in [0,255].
 * The scaled value is truncated by the integer cast, not rounded, and no
 * clamping is performed.
 */
#define FLOAT_TO_UBYTE(X)	((GLubyte) (GLint) ((X) * 255.0F))


/* Convert GLbyte in [-128,127] to GLfloat in [-1.0,1.0].
 * Maps b to (2b + 1) / 255, so -128 -> -1.0 and 127 -> 1.0.
 */
#define BYTE_TO_FLOAT(B)	((2.0F * (B) + 1.0F) * (1.0F/255.0F))

/* Convert GLfloat in [-1.0,1.0] to GLbyte in [-128,127].
 * Inverse of BYTE_TO_FLOAT using truncating integer arithmetic.
 */
#define FLOAT_TO_BYTE(X)	( (((GLint) (255.0F * (X))) - 1) / 2 )


/* Convert GLushort in [0,65535] to GLfloat in [0.0,1.0] */
#define USHORT_TO_FLOAT(S)	((GLfloat) (S) * (1.0F / 65535.0F))

/* Convert GLfloat in [0.0,1.0] to GLushort in [0,65535].
 * The scaled value is truncated by the integer cast, not rounded, and no
 * clamping is performed.
 */
#define FLOAT_TO_USHORT(X)	((GLushort) (GLint) ((X) * 65535.0F))


/* Convert GLshort in [-32768,32767] to GLfloat in [-1.0,1.0].
 * Maps s to (2s + 1) / 65535, so -32768 -> -1.0 and 32767 -> 1.0.
 */
#define SHORT_TO_FLOAT(S)	((2.0F * (S) + 1.0F) * (1.0F/65535.0F))

/* Convert GLfloat in [-1.0,1.0] to GLshort in [-32768,32767].
 * Inverse of SHORT_TO_FLOAT using truncating integer arithmetic.
 */
#define FLOAT_TO_SHORT(X)	( (((GLint) (65535.0F * (X))) - 1) / 2 )


/* Convert GLuint in [0,4294967295] to GLfloat in [0.0,1.0] */
#define UINT_TO_FLOAT(U)	((GLfloat) (U) * (1.0F / 4294967295.0F))

/* Convert GLfloat in [0.0,1.0] to GLuint in [0,4294967295].
 * The multiplication uses a double constant, so it is carried out in
 * double precision before the cast.
 */
#define FLOAT_TO_UINT(X)	((GLuint) ((X) * 4294967295.0))


/* Convert GLint in [-2147483648,2147483647] to GLfloat in [-1.0,1.0] */
#define INT_TO_FLOAT(I)		((2.0F * (I) + 1.0F) * (1.0F/4294967294.0F))

/* Convert GLfloat in [-1.0,1.0] to GLint in [-2147483648,2147483647] */
/* The exact inverse of INT_TO_FLOAT is not used because it
 * causes overflow:
#define FLOAT_TO_INT(X)		( (((GLint) (4294967294.0F * (X))) - 1) / 2 )
*/
/* a close approximation (scale by 2147483647 in double precision): */
#define FLOAT_TO_INT(X)		( (GLint) (2147483647.0 * (X)) )


/* Conversions of integer types to GLubyte.
 * Signed sources: negative values become 0; non-negative values are
 * shifted right (by 0, 7 or 23 bits) before the cast to GLubyte.
 * Unsigned sources: the value is shifted right (by 8 or 24 bits) and
 * cast.  Each argument is evaluated up to twice.
 */
#define BYTE_TO_UBYTE(b)   ((GLubyte) ((b) < 0 ? 0 : (GLubyte) (b)))
#define SHORT_TO_UBYTE(s)  ((GLubyte) ((s) < 0 ? 0 : (GLubyte) ((s) >> 7)))
#define USHORT_TO_UBYTE(s) ((GLubyte) ((s) >> 8))
#define INT_TO_UBYTE(i)    ((GLubyte) ((i) < 0 ? 0 : (GLubyte) ((i) >> 23)))
#define UINT_TO_UBYTE(i)   ((GLubyte) ((i) >> 24))


/* Conversions of integer types to GLushort.
 * Negative inputs yield 0.  Otherwise:
 *   BYTE_TO_USHORT   scales by 65535/255 using integer arithmetic
 *   UBYTE_TO_USHORT  replicates the byte into both halves of the result
 *   SHORT_TO_USHORT  scales by 65535/32767 using integer arithmetic
 *   INT_TO_USHORT    shifts right by 15 bits
 *   UINT_TO_USHORT   shifts right by 16 bits
 * Each argument is evaluated up to twice.
 *
 * Every conditional is wrapped in its own parentheses so the macros can
 * be used inside larger expressions: without them, an expression such as
 * "1 + INT_TO_USHORT(i)" would bind the "1 +" to the comparison instead
 * of to the result.
 *
 * The "< 0" test in UINT_TO_USHORT can never be true for an unsigned
 * argument; it is retained so that existing callers see no change.
 */
#define BYTE_TO_USHORT(b)  (((b) < 0) ? 0 : ((GLushort) (((b) * 65535) / 255)))
#define UBYTE_TO_USHORT(b) (((GLushort) (b) << 8) | (GLushort) (b))
#define SHORT_TO_USHORT(s) (((s) < 0) ? 0 : ((GLushort) (((s) * 65535 / 32767))))
#define INT_TO_USHORT(i)   (((i) < 0) ? 0 : ((GLushort) ((i) >> 15)))
#define UINT_TO_USHORT(i)  (((i) < 0) ? 0 : ((GLushort) ((i) >> 16)))

/* Statement-like: assigns to 'us'.  The float is scaled by 65535 and
 * truncated by the cast; no clamping is applied here.
 */
#define UNCLAMPED_FLOAT_TO_USHORT(us, f)   us = (GLushort) ((f) * 65535.0F)


/*
 * Linear interpolation
 * NOTE:  OUT argument is evaluated twice!
 * NOTE:  Be wary of using *coord++ as an argument to any of these macros!
 *
 * LINTERP(T, OUT, IN) yields OUT + T * (IN - OUT): OUT at T == 0 and
 * IN at T == 1.
 */
#define LINTERP(T, OUT, IN)	((OUT) + (T) * ((IN) - (OUT)))

/* Interpolate between two GLubyte values: both inputs are converted to
 * float with UBYTE_TO_FLOAT, blended with LINTERP, and the result is
 * stored into dstub with UNCLAMPED_FLOAT_TO_UBYTE.
 * The macro declares locals named inf, outf and dstf; do not pass
 * expressions that refer to variables with those names.
 *
 * Can do better with integer math:
 */
#define INTERP_UB( t, dstub, outub, inub )	\
do {						\
   GLfloat inf = UBYTE_TO_FLOAT( inub );	\
   GLfloat outf = UBYTE_TO_FLOAT( outub );	\
   GLfloat dstf = LINTERP( t, outf, inf );	\
   UNCLAMPED_FLOAT_TO_UBYTE( dstub, dstf );	\
} while (0)

/* Same as INTERP_UB but for the channel type: uses CHAN_TO_FLOAT and
 * UNCLAMPED_FLOAT_TO_CHAN, neither of which is defined in this file.
 * The same caveat about the local names inf, outf and dstf applies.
 */
#define INTERP_CHAN( t, dstc, outc, inc )	\
do {						\
   GLfloat inf = CHAN_TO_FLOAT( inc );		\
   GLfloat outf = CHAN_TO_FLOAT( outc );	\
   GLfloat dstf = LINTERP( t, outf, inf );	\
   UNCLAMPED_FLOAT_TO_CHAN( dstc, dstf );	\
} while (0)

/* Interpolate between two unsigned ints: both are converted to GLfloat,
 * blended with LINTERP, and the result is cast through GLint to GLuint.
 * Statement-like: assigns to dstui.
 */
#define INTERP_UI( t, dstui, outui, inui )	\
   dstui = (GLuint) (GLint) LINTERP( t, (GLfloat) (outui), (GLfloat) (inui) )

/* Interpolate between two floats.  Statement-like: assigns to dstf. */
#define INTERP_F( t, dstf, outf, inf )		\
   dstf = LINTERP( t, outf, inf )

/* Component-wise LINTERP of the first three elements of out/in into dst.
 * Every argument is evaluated several times.
 */
#define INTERP_3F( t, dst, out, in )			\
do {							\
   (dst)[0] = LINTERP( (t), (out)[0], (in)[0] );	\
   (dst)[1] = LINTERP( (t), (out)[1], (in)[1] );	\
   (dst)[2] = LINTERP( (t), (out)[2], (in)[2] );	\
} while (0)

/* Four-element version: the three-element interpolation followed by
 * element 3.
 */
#define INTERP_4F( t, dst, out, in )			\
do {							\
   INTERP_3F( (t), (dst), (out), (in) );		\
   (dst)[3] = LINTERP( (t), (out)[3], (in)[3] );	\
} while (0)

/* Component-wise INTERP_CHAN of the first three elements of out/in into
 * dst.  Every argument is evaluated several times.
 */
#define INTERP_3CHAN( t, dst, out, in )			\
do {							\
   INTERP_CHAN( (t), (dst)[0], (out)[0], (in)[0] );	\
   INTERP_CHAN( (t), (dst)[1], (out)[1], (in)[1] );	\
   INTERP_CHAN( (t), (dst)[2], (out)[2], (in)[2] );	\
} while (0)

/* Four-element version: the three-element interpolation followed by
 * element 3.
 */
#define INTERP_4CHAN( t, dst, out, in )			\
do {							\
   INTERP_3CHAN( (t), (dst), (out), (in) );		\
   INTERP_CHAN( (t), (dst)[3], (out)[3], (in)[3] );	\
} while (0)

/* Interpolate the first 'sz' components of row 'out' and row 'in' of the
 * 2-D array 'vec' into row 'to'.  The switch cases fall through on
 * purpose, so sz == 4 handles components 3,2,1,0, sz == 3 handles 2,1,0,
 * and so on.  There is no default case: any other sz does nothing.
 */
#define INTERP_SZ( t, vec, to, out, in, sz )				\
do {							       		\
   switch (sz) {							\
   case 4: (vec)[to][3] = LINTERP( (t), (vec)[out][3], (vec)[in][3] );	\
   case 3: (vec)[to][2] = LINTERP( (t), (vec)[out][2], (vec)[in][2] );	\
   case 2: (vec)[to][1] = LINTERP( (t), (vec)[out][1], (vec)[in][1] );	\
   case 1: (vec)[to][0] = LINTERP( (t), (vec)[out][0], (vec)[in][0] );	\
   }									\
} while(0)


/*
 * Fixed point arithmetic macros
 *
 * Values carry FIXED_SHIFT (11) fractional bits, so FIXED_ONE is 2048.
 * The integer constants are derived from FIXED_SHIFT; FIXED_SCALE is the
 * same factor as a float.
 */
#define FIXED_SHIFT     11
#define FIXED_ONE       (1 << FIXED_SHIFT)
#define FIXED_HALF      (1 << (FIXED_SHIFT - 1))
#define FIXED_FRAC_MASK (FIXED_ONE - 1)
#define FIXED_INT_MASK  (~FIXED_FRAC_MASK)
#define FIXED_EPSILON   1
#define FIXED_SCALE     2048.0f

/* float <-> fixed; FloatToFixed rounds with IROUND */
#define FloatToFixed(X) (IROUND((X) * FIXED_SCALE))
#define FixedToFloat(X) ((X) * (1.0F / FIXED_SCALE))
#define PosFloatToFixed(X)      FloatToFixed(X)
#define SignedFloatToFixed(X)   FloatToFixed(X)

/* int <-> fixed; FixedToUns shifts the value as an unsigned int */
#define IntToFixed(I)   ((I) << FIXED_SHIFT)
#define FixedToInt(X)   ((X) >> FIXED_SHIFT)
#define FixedToUns(X)   (((unsigned int)(X)) >> FIXED_SHIFT)

/* round a fixed point value up/down to a whole number, still in fixed point */
#define FixedCeil(X)    (((X) + FIXED_ONE - FIXED_EPSILON) & FIXED_INT_MASK)
#define FixedFloor(X)   ((X) & FIXED_INT_MASK)

#if !defined(USE_IEEE)
/* Without bit-level access, rely on finite(). */
#define IS_INF_OR_NAN(x)        (!finite(x))
#else
/* Returns TRUE (1) for x == Inf or x == NaN, otherwise 0.
 * With the sign bit masked off, exactly those values have a bit pattern
 * of at least 0x7f800000.
 */
static INLINE int IS_INF_OR_NAN( float x )
{
   union { int i; float f; } bits;
   bits.f = x;
   return (bits.i & 0x7fffffff) >= 0x7f800000;
}
#endif

/* Defined elsewhere.  Presumably performs one-time setup of the math
 * helpers declared in this header (e.g. lookup tables) -- confirm against
 * the implementation.
 */
extern void
_mesa_init_math(void);


/* Defined elsewhere.  Presumably returns the number of bits set in n --
 * confirm against the implementation.
 */
extern GLuint
_mesa_bitcount(GLuint n);


#endif