/*
 * Copyright (c) 1994 Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the Network Research
 *	Group at Lawrence Berkeley Laboratory.
 * 4. Neither the name of the University nor of the Laboratory may be used
 *    to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifndef lint
static char rcsid[] =
    "@(#) $Header: /cs/research/mice/starship/src/local/CVS_repository/vic/codec/dct.cpp,v 1.1 1999/09/09 12:45:39 piers Exp $ (LBL)";
#endif

#include <sys/types.h>
#include "bsd-endian.h"
#include "dct.h"

/*
 * Macros for fix-point (integer) arithmetic.  FP_NBITS gives the number
 * of binary digits past the decimal point.  FP_MUL computes the product
 * of two fixed point numbers.  A fixed point number and an integer
 * can be directly multiplied to give a fixed point number.  FP_SCALE
 * converts a floating point number to fixed point (and is used only
 * at startup, not by the dct engine).  FP_NORM converts a fixed
 * point number to scalar by rounding to the closest integer.
 * FP_JNORM is similar except it folds the jpeg bias of 128 into the
 * rounding addition.
 */
#define FP_NBITS 15
#define FP_MUL(a, b)	((((a) >> 5) * ((b) >> 5)) >> (FP_NBITS - 10))
#define FP_SCALE(v)	(int)((double)(v) * double(1 << FP_NBITS) + 0.5)
#define FP_NORM(v)	((v) + (1 << (FP_NBITS-1)) >> FP_NBITS)
#define FP_JNORM(v)	((v) + (257 << (FP_NBITS-1)) >> FP_NBITS)

#define M(n) ((m0 >> (n)) & 1)

/*
 * This macro stolen from nv.
 */
/* Sick little macro which will limit x to [0..255] with logical ops */
#define LIMIT8(x, t) ((t = (x)), (t &= ~(t>>31)), (t | ~((t-256) >> 31)))
#define LIMIT(x, t) (LIMIT8((x), t) & 0xff)

/* row order */
const u_char ROWZAG[] = {
	0,  1,  8, 16,  9,  2,  3, 10,
	17, 24, 32, 25, 18, 11,  4,  5,
	12, 19, 26, 33, 40, 48, 41, 34,
	27, 20, 13,  6,  7, 14, 21, 28,
	35, 42, 49, 56, 57, 50, 43, 36,
	29, 22, 15, 23, 30, 37, 44, 51,
	58, 59, 52, 45, 38, 31, 39, 46,
	53, 60, 61, 54, 47, 55, 62, 63,
	0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0
};
/* column order */
const u_char COLZAG[] = {
	0, 8, 1, 2, 9, 16, 24, 17,
	10, 3, 4, 11, 18, 25, 32, 40,
	33, 26, 19, 12, 5, 6, 13, 20,
	27, 34, 41, 48, 56, 49, 42, 35,
	28, 21, 14, 7, 15, 22, 29, 36,
	43, 50, 57, 58, 51, 44, 37, 30,
	23, 31, 38, 45, 52, 59, 60, 53,
	46, 39, 47, 54, 61, 62, 55, 63,
	0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0
};

#define A1 FP_SCALE(0.7071068)
#define A2 FP_SCALE(0.5411961)
#define A3 A1
#define A4 FP_SCALE(1.3065630)
#define A5 FP_SCALE(0.3826834)

#define FA1 (0.707106781f)
#define FA2 (0.541196100f)
#define FA3 FA1
#define FA4 (1.306562965f)
#define FA5 (0.382683433f)

/*
 * these magic numbers are scaling factors for each coef of the 1-d
 * AA&N DCT.  The scale factor for coef 0 is 1 and coef 1<=n<=7 is
 * cos(n*PI/16)*sqrt(2).  There is also a normalization of sqrt(8).
 * Formally you divide by the scale factor but we multiply by the
 * inverse because it's faster.  So the numbers below are the inverse
 * of what was just described.
 */
#define B0 0.35355339059327376220
#define B1 0.25489778955207958447
#define B2 0.27059805007309849220
#define B3 0.30067244346752264027
#define B4 0.35355339059327376220
#define B5 0.44998811156820785231
#define B6 0.65328148243818826392
#define B7 1.28145772387075308943

/*
 * Output multipliers for AA&N DCT
 * (i.e., first stage multipliers for inverse DCT).
 */
static const double first_stage[8] = { B0, B1, B2, B3, B4, B5, B6, B7, };

/*
 * The first_stage array crossed with itself.  This allows us
 * to embed the first stage multipliers of the row pass by
 * computing scaled versions of the columns.
 */
static const int cross_stage[64] = {
	FP_SCALE(B0 * B0),
	FP_SCALE(B0 * B1),
	FP_SCALE(B0 * B2),
	FP_SCALE(B0 * B3),
	FP_SCALE(B0 * B4),
	FP_SCALE(B0 * B5),
	FP_SCALE(B0 * B6),
	FP_SCALE(B0 * B7),

	FP_SCALE(B1 * B0),
	FP_SCALE(B1 * B1),
	FP_SCALE(B1 * B2),
	FP_SCALE(B1 * B3),
	FP_SCALE(B1 * B4),
	FP_SCALE(B1 * B5),
	FP_SCALE(B1 * B6),
	FP_SCALE(B1 * B7),

	FP_SCALE(B2 * B0),
	FP_SCALE(B2 * B1),
	FP_SCALE(B2 * B2),
	FP_SCALE(B2 * B3),
	FP_SCALE(B2 * B4),
	FP_SCALE(B2 * B5),
	FP_SCALE(B2 * B6),
	FP_SCALE(B2 * B7),

	FP_SCALE(B3 * B0),
	FP_SCALE(B3 * B1),
	FP_SCALE(B3 * B2),
	FP_SCALE(B3 * B3),
	FP_SCALE(B3 * B4),
	FP_SCALE(B3 * B5),
	FP_SCALE(B3 * B6),
	FP_SCALE(B3 * B7),

	FP_SCALE(B4 * B0),
	FP_SCALE(B4 * B1),
	FP_SCALE(B4 * B2),
	FP_SCALE(B4 * B3),
	FP_SCALE(B4 * B4),
	FP_SCALE(B4 * B5),
	FP_SCALE(B4 * B6),
	FP_SCALE(B4 * B7),

	FP_SCALE(B5 * B0),
	FP_SCALE(B5 * B1),
	FP_SCALE(B5 * B2),
	FP_SCALE(B5 * B3),
	FP_SCALE(B5 * B4),
	FP_SCALE(B5 * B5),
	FP_SCALE(B5 * B6),
	FP_SCALE(B5 * B7),

	FP_SCALE(B6 * B0),
	FP_SCALE(B6 * B1),
	FP_SCALE(B6 * B2),
	FP_SCALE(B6 * B3),
	FP_SCALE(B6 * B4),
	FP_SCALE(B6 * B5),
	FP_SCALE(B6 * B6),
	FP_SCALE(B6 * B7),

	FP_SCALE(B7 * B0),
	FP_SCALE(B7 * B1),
	FP_SCALE(B7 * B2),
	FP_SCALE(B7 * B3),
	FP_SCALE(B7 * B4),
	FP_SCALE(B7 * B5),
	FP_SCALE(B7 * B6),
	FP_SCALE(B7 * B7),
};
static const float f_cross_stage[64] = {
	float(B0 * B0),
	float(B0 * B1),
	float(B0 * B2),
	float(B0 * B3),
	float(B0 * B4),
	float(B0 * B5),
	float(B0 * B6),
	float(B0 * B7),

	float(B1 * B0),
	float(B1 * B1),
	float(B1 * B2),
	float(B1 * B3),
	float(B1 * B4),
	float(B1 * B5),
	float(B1 * B6),
	float(B1 * B7),

	float(B2 * B0),
	float(B2 * B1),
	float(B2 * B2),
	float(B2 * B3),
	float(B2 * B4),
	float(B2 * B5),
	float(B2 * B6),
	float(B2 * B7),

	float(B3 * B0),
	float(B3 * B1),
	float(B3 * B2),
	float(B3 * B3),
	float(B3 * B4),
	float(B3 * B5),
	float(B3 * B6),
	float(B3 * B7),

	float(B4 * B0),
	float(B4 * B1),
	float(B4 * B2),
	float(B4 * B3),
	float(B4 * B4),
	float(B4 * B5),
	float(B4 * B6),
	float(B4 * B7),

	float(B5 * B0),
	float(B5 * B1),
	float(B5 * B2),
	float(B5 * B3),
	float(B5 * B4),
	float(B5 * B5),
	float(B5 * B6),
	float(B5 * B7),

	float(B6 * B0),
	float(B6 * B1),
	float(B6 * B2),
	float(B6 * B3),
	float(B6 * B4),
	float(B6 * B5),
	float(B6 * B6),
	float(B6 * B7),

	float(B7 * B0),
	float(B7 * B1),
	float(B7 * B2),
	float(B7 * B3),
	float(B7 * B4),
	float(B7 * B5),
	float(B7 * B6),
	float(B7 * B7),
};

/*
 * Map a quantization table in natural, row-order,
 * into the qt input expected by rdct().
 */
void
rdct_fold_q(const int* in, int* out)
{
	for (int i = 0; i < 64; ++i) {
		/*
		 * Fold column and row passes of the dct.
		 * By scaling each column DCT independently,
		 * we pre-bias all the row DCT's so the
		 * first multiplier is already embedded
		 * in the temporary result.  Thanks to
		 * Martin Vetterli for explaining how
		 * to do this.
		 */
		double v = double(in[i]);
		v *= first_stage[i & 7];
		v *= first_stage[i >> 3];
		out[i] = FP_SCALE(v);
	}
}

/*
 * Just like rdct_fold_q() but we divide by the quantizer.
 */
void
fdct_fold_q(const int* in, float* out)
{
	for (int i = 0; i < 64; ++i) {
		double v = first_stage[i >> 3];
		v *= first_stage[i & 7];
		double q = double(in[i]);
		out[i] = v / q;
	}
}

void dcsum(int dc, u_char* in, u_char* out, int stride)
{
	for (int k = 8; --k >= 0; ) {
		int t;
#ifdef INT_64
		/*XXX assume little-endian */
		INT_64 i = *(INT_64*)in;
		INT_64 o = (INT_64)LIMIT(dc + (i >> 56), t) << 56;
		o |=  (INT_64)LIMIT(dc + (i >> 48 & 0xff), t) << 48;
		o |=  (INT_64)LIMIT(dc + (i >> 40 & 0xff), t) << 40;
		o |=  (INT_64)LIMIT(dc + (i >> 32 & 0xff), t) << 32;
		o |=  (INT_64)LIMIT(dc + (i >> 24 & 0xff), t) << 24;
		o |=  (INT_64)LIMIT(dc + (i >> 16 & 0xff), t) << 16;
		o |=  (INT_64)LIMIT(dc + (i >> 8 & 0xff), t) << 8;
		o |=  (INT_64)LIMIT(dc + (i & 0xff), t);
		*(INT_64*)out = o;
#else
		u_int o = 0;
		u_int i = *(u_int*)in;
		SPLICE(o, LIMIT(dc + EXTRACT(i, 24), t), 24);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
		*(u_int*)out = o;

		o = 0;
		i = *(u_int*)(in + 4);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 24),  t), 24);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 16), t), 16);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 8), t), 8);
		SPLICE(o, LIMIT(dc + EXTRACT(i, 0), t), 0);
		*(u_int*)(out + 4) = o;
#endif
		in += stride;
		out += stride;
	}
}

void dcsum2(int dc, u_char* in, u_char* out, int stride)
{
	for (int k = 8; --k >= 0; ) {
		int t;
		u_int o = 0;
		SPLICE(o, LIMIT(dc + in[0], t), 24);
		SPLICE(o, LIMIT(dc + in[1], t), 16);
		SPLICE(o, LIMIT(dc + in[2], t), 8);
		SPLICE(o, LIMIT(dc + in[3], t), 0);
		*(u_int*)out = o;

		o = 0;
		SPLICE(o, LIMIT(dc + in[4], t), 24);
		SPLICE(o, LIMIT(dc + in[5], t), 16);
		SPLICE(o, LIMIT(dc + in[6], t), 8);
		SPLICE(o, LIMIT(dc + in[7], t), 0);
		*(u_int*)(out + 4) = o;

		in += stride;
		out += stride;
	}
}

void dcfill(int DC, u_char* out, int stride)
{
	int t;
	u_int dc = DC;
	dc = LIMIT(dc, t);
	dc |= dc << 8;
	dc |= dc << 16;
#ifdef INT_64
	INT_64 xdc = dc;
	xdc |= xdc << 32;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
	out += stride;
	*(INT_64 *)out = xdc;
#else
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
	out += stride;
	*(u_int*)out = dc;
	*(u_int*)(out + 4) = dc;
#endif
}

/*
 * This routine mixes the DC & AC components of an 8x8 block of
 * pixels.  This routine is called for every block decoded so it
 * needs to be efficient.  It tries to do as many pixels in parallel
 * as will fit in a word.  The one complication is that it has to
 * deal with overflow (sum > 255) and underflow (sum < 0).  Underflow
 * & overflow are only possible if both terms have the same sign and
 * are indicated by the result having a different sign than the terms.
 * Note that underflow is more worrisome than overflow since it results
 * in bright white dots in a black field.
 * The DC term and sum are biased by 128 so a negative number has the
 * 2^7 bit = 0.  The AC term is not biased so a negative number has
 * the 2^7 bit = 1.  So underflow is indicated by (DC & AC & sum) != 0;
 */
#define MIX_LOGIC(sum, a, b, omask, uflo) \
{ \
	sum = a + b; \
	uflo = (a ^ b) & (a ^ sum) & omask; \
	if (uflo) { \
		if ((b = uflo & a) != 0) { \
			/* integer overflows */ \
			b |= b >> 1; \
			b |= b >> 2; \
			b |= b >> 4; \
			sum |= b; \
		} \
		if ((uflo &=~ b) != 0) { \
			/* integer underflow(s) */ \
			uflo |= uflo >> 1; \
			uflo |= uflo >> 2; \
			uflo |= uflo >> 4; \
			sum &= ~uflo; \
		} \
	} \
}
/*
 * Table of products of 8-bit scaled coefficients
 * and idct coefficients (there are only 33 unique
 * coefficients so we index via a compact ID).
 */
extern "C" u_char multab[];
/*
 * Array of coefficient ID's used to index multab.
 */
extern "C" u_int dct_basis[64][64 / sizeof(u_int)];

/*XXX*/
#define LIMIT_512(s) ((s) > 511 ? 511 : (s) < -512 ? -512 : (s))

void
bv_rdct1(int dc, short* bp, int acx, u_char* out, int stride)
{
	u_int omask = 0x80808080;
	u_int uflo;
	u_int* vp = dct_basis[acx];
	int s = LIMIT_512(bp[acx]);
	s = (s >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	char* mt = (char*)&multab[s << 7];

	dc |= dc << 8;
	dc |= dc << 16;
	for (int k = 8; --k >= 0; ) {
		u_int v = *vp++;
		u_int m = mt[v >> 24] << SHIFT(24) |
			mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) |
			mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		*(u_int*)out = v;
		v = *vp++;
		m = mt[v >> 24] << SHIFT(24) |
			mt[v >> 16 & 0xff] << SHIFT(16) |
			mt[v >> 8 & 0xff] << SHIFT(8) |
			mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		*(u_int*)(out + 4) = v;
		out += stride;
	}
}

/* XXX this version has to be exact */
void
bv_rdct2(int dc, short* bp, int ac0, u_char* in, u_char* out, int stride)
{
	int s0 = LIMIT_512(bp[ac0]);
	s0 = (s0 >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	const char* mt = (const char*)&multab[s0 << 7];
	const u_int* vp0 = dct_basis[ac0];

	dc |= dc << 8;
	dc |= dc << 16;
	u_int omask = 0x80808080;
	u_int uflo;
	for (int k = 8; --k >= 0; ) {
		u_int v, m, i;
		v = *vp0++;
		m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
		    mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		i = in[0] << SHIFT(24) | in[1] << SHIFT(16) |
		    in[2] << SHIFT(8) | in[3] << SHIFT(0);
		MIX_LOGIC(m, i, v, omask, uflo);
		*(u_int*)out = m;

		v = *vp0++;
		m = mt[v >> 24] << SHIFT(24) | mt[v >> 16 & 0xff] << SHIFT(16) |
		    mt[v >> 8 & 0xff] << SHIFT(8) | mt[v & 0xff] << SHIFT(0);
		MIX_LOGIC(v, dc, m, omask, uflo);
		i = in[4] << SHIFT(24) | in[5] << SHIFT(16) |
		    in[6] << SHIFT(8) | in[7] << SHIFT(0);
		MIX_LOGIC(m, i, v, omask, uflo);
		*(u_int*)(out + 4) = m;
		out += stride;
		in += stride;
	}
}

/* XXX this version has to be exact */
void
bv_rdct3(int dc, short* bp, int ac0, int ac1, u_char* in, u_char* out, int stride)
{
	int s0 = LIMIT_512(bp[ac0]);
	s0 = (s0 >> 2) & 0xff;
	/* 66 unique coefficients require 7 bits */
	char* mt0 = (char*)&multab[s0 << 7];

	int s1 = LIMIT_512(bp[ac1]);
	s1 = (s1 >> 2) & 0xff;
	char* mt1 = (char*)&multab[s1 << 7];

	u_int* vp0 = dct_basis[ac0];
	u_int* vp1 = dct_basis[ac1];

	for (int k = 8; --k >= 0; ) {
		int t;
		u_int v0 = *vp0++;
		u_int v1 = *vp1++;
		s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[0] + dc;
		u_int m = LIMIT(s0, t) << SHIFT(24);
		s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[1] + dc;
		m |= LIMIT(s0, t) << SHIFT(16);
		s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[2] + dc;
		m |= LIMIT(s0, t) << SHIFT(8);
		s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[3] + dc;
		m |= LIMIT(s0, t) << SHIFT(0);
		*(u_int*)out = m;

		v0 = *vp0++;
		v1 = *vp1++;
		s0 = mt0[v0 >> 24] + mt1[v1 >> 24] + in[4] + dc;
		m = 0;
		m |= LIMIT(s0, t) << SHIFT(24);
		s0 = mt0[v0 >> 16 & 0xff] + mt1[v1 >> 16 & 0xff] + in[5] + dc;
		m |= LIMIT(s0, t) << SHIFT(16);
		s0 = mt0[v0 >> 8 & 0xff] + mt1[v1 >> 8 & 0xff] + in[6] + dc;
		m |= LIMIT(s0, t) << SHIFT(8);
		s0 = mt0[v0 & 0xff] + mt1[v1 & 0xff] + in[7] + dc;
		m |= LIMIT(s0, t) << SHIFT(0);
		*(u_int*)(out + 4) = m;

		out += stride;
		in += stride;
	}
}

#ifdef INT_64
/*XXX assume little-endian */
#define PSPLICE(v, n) pix |= (INT_64)(v) << ((n)*8)
#define DID4PIX 
#define PSTORE ((INT_64*)p)[0] = pix
#define PIXDEF INT_64 pix = 0; int v, oflo = 0
#else
#define PSPLICE(v, n) SPLICE(pix, (v), (3 - ((n)&3)) * 8)
#define DID4PIX pix0 = pix; pix = 0
#define PSTORE ((u_int*)p)[0] = pix0; ((u_int*)p)[1] = pix
#define PIXDEF	u_int pix0, pix = 0; int v, oflo = 0
#endif
#define DOJPIX(val, n) v = FP_JNORM(val); oflo |= v; PSPLICE(v, n)
#define DOJPIXLIMIT(val, n) PSPLICE(LIMIT(FP_JNORM(val),t), n)
#define DOPIX(val, n) v = FP_NORM(val); oflo |= v; PSPLICE(v, n)
#define DOPIXLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val),t), n)
#define DOPIXIN(val, n) v = FP_NORM(val) + in[n]; oflo |= v; PSPLICE(v, n)
#define DOPIXINLIMIT(val, n) PSPLICE(LIMIT(FP_NORM(val) + in[n], t), n)

/*
 * A 2D Inverse DCT based on a column-row decomposition using
 * Arai, Agui, and Nakajmia's 8pt 1D Inverse DCT, from Fig. 4-8
 * Pennebaker & Mitchell (i.e., the pink JPEG book).  This figure
 * is the forward transform; reverse the flowgraph for the inverse
 * (you need to draw a picture).  The outputs are DFT coefficients
 * and need to be scaled according to Eq [4-3].
 *
 * The input coefficients are, counter to tradition, in column-order.
 * The bit mask indicates which coefficients are non-zero.  If the
 * corresponding bit is zero, then the coefficient is assumed zero
 * and the input coefficient is not referenced and need not be defined.
 * The 8-bit outputs are computed in row order and placed in the
 * output array pointed to by p, with each of the eight 8-byte lines
 * offset by "stride" bytes.
 *
 * qt is the inverse quantization table in column order.  These
 * coefficients are the product of the inverse quantization factor,
 * specified by the jpeg quantization table, and the first multiplier
 * in the inverse DCT flow graph.  This first multiplier is actually
 * biased by the row term so that the columns are pre-scaled to
 * eliminate the first row multiplication stage.
 *
 * The output is biased by 128, i.e., [-128,127] is mapped to [0,255],
 * which is relevant to jpeg.
 */
void
#ifdef INT_64
rdct(register short *bp, INT_64 m0, u_char* p, int stride, const int* qt)
#else
rdct(register short *bp, u_int m0, u_int m1, u_char* p,
     int stride, const int* qt)
#endif
{
	/*
	 * First pass is 1D transform over the columns.  Note that
	 * the coefficients are stored in column order, so even
	 * though we're accessing the columns, we access them
	 * in a row-like fashion.  We use a column-row decomposition
	 * instead of a row-column decomposition so we can splice
	 * pixels in an efficient, row-wise order in the second
	 * stage.
	 *
	 * The inverse quantization is folded together with the
	 * first stage multiplier.  This reduces the number of
	 * multiplies (per 8-pt transform) from 11 to 5 (ignoring
	 * the inverse quantization which must be done anyway).
	 *
	 * Because we compute appropriately scaled column DCTs,
	 * the row DCTs require only 5 multiplies per row total.
	 * So the total number of multiplies for the 8x8 DCT is 80
	 * (ignoring the checks for zero coefficients).
	 */
	int tmp[64];
	int* tp = tmp;
	int i;
	for (i = 8; --i >= 0; ) {
		if ((m0 & 0xfe) == 0) {
			/* AC terms all zero */
			int v = M(0) ? qt[0] * bp[0] : 0;
			tp[0] = v;
			tp[1] = v;
			tp[2] = v;
			tp[3] = v;
			tp[4] = v;
			tp[5] = v;
			tp[6] = v;
			tp[7] = v;
		} else {
			int t0, t1, t2, t3, t4, t5, t6, t7;

			if ((m0 & 0xaa) == 0)
				t4 = t5 = t6 = t7 = 0;
			else {
				t0 = M(5) ? qt[5] * bp[5] : 0;
				t2 = M(1) ? qt[1] * bp[1] : 0;
				t6 = M(7) ? qt[7] * bp[7] : 0;
				t7 = M(3) ? qt[3] * bp[3] : 0;

				t4 = t0 - t7;
				t1 = t2 + t6;
				t6 = t2 - t6;
				t7 += t0;

				t5 = t1 - t7;
				t7 += t1;

				t2 = FP_MUL(-A5, t4 + t6);
				t4 = FP_MUL(-A2, t4);
				t0 = t4 + t2;
				t1 = FP_MUL(A3, t5);
				t2 += FP_MUL(A4, t6);
			
				t4 = -t0;
				t5 = t1 - t0;
				t6 = t1 + t2;
				t7 += t2;
			}

#ifdef notdef
			if ((m0 & 0x55) == 0)
				t0 = t1 = t2 = t3 = 0;
			else {
#endif
				t0 = M(0) ? qt[0] * bp[0] : 0;
				t1 = M(4) ? qt[4] * bp[4] : 0;
				int x0 = t0 + t1;
				int x1 = t0 - t1;

				t0 = M(2) ? qt[2] * bp[2] : 0;
				t3 = M(6) ? qt[6] * bp[6] : 0;
				t2 = t0 - t3;
				t3 += t0;

				t2 = FP_MUL(A1, t2);
				t3 += t2;

				t0 = x0 + t3;
				t1 = x1 + t2;
				t2 = x1 - t2;
				t3 = x0 - t3;
#ifdef notdef
			}
#endif

			tp[0] = t0 + t7;
			tp[7] = t0 - t7;
			tp[1] = t1 + t6;
			tp[6] = t1 - t6;
			tp[2] = t2 + t5;
			tp[5] = t2 - t5;
			tp[3] = t3 + t4;
			tp[4] = t3 - t4;
		}
		tp += 8;
		bp += 8;
		qt += 8;

		m0 >>= 8;
#ifndef INT_64
		m0 |= m1 << 24;
		m1 >>= 8;
#endif
	}
	tp -= 64;
	/*
	 * Second pass is 1D transform over the rows.  Note that
	 * the coefficients are stored in column order, so even
	 * though we're accessing the rows, we access them
	 * in a column-like fashion.
	 *
	 * The pass above already computed the first multiplier
	 * in the flow graph.
	 */
	for (i = 8; --i >= 0; ) {
		int t0, t1, t2, t3, t4, t5, t6, t7;

		t0 = tp[8*5];
		t2 = tp[8*1];
		t6 = tp[8*7];
		t7 = tp[8*3];

#ifdef notdef
		if ((t0 | t2 | t6 | t7) == 0) {
			t4 = t5 = 0;
		} else 
#endif
{
			t4 = t0 - t7;
			t1 = t2 + t6;
			t6 = t2 - t6;
			t7 += t0;

			t5 = t1 - t7;
			t7 += t1;

			t2 = FP_MUL(-A5, t4 + t6);
			t4 = FP_MUL(-A2, t4);
			t0 = t4 + t2;
			t1 = FP_MUL(A3, t5);
			t2 += FP_MUL(A4, t6);
			
			t4 = -t0;
			t5 = t1 - t0;
			t6 = t1 + t2;
			t7 += t2;
		}

		t0 = tp[8*0];
		t1 = tp[8*4];
		int x0 = t0 + t1;
		int x1 = t0 - t1;

		t0 = tp[8*2];
		t3 = tp[8*6];
		t2 = t0 - t3;
		t3 += t0;

		t2 = FP_MUL(A1, t2);
		t3 += t2;

		t0 = x0 + t3;
		t1 = x1 + t2;
		t2 = x1 - t2;
		t3 = x0 - t3;

		PIXDEF;
		DOJPIX(t0 + t7, 0);
		DOJPIX(t1 + t6, 1);
		DOJPIX(t2 + t5, 2);
		DOJPIX(t3 + t4, 3);
		DID4PIX;
		DOJPIX(t3 - t4, 4);
		DOJPIX(t2 - t5, 5);
		DOJPIX(t1 - t6, 6);
		DOJPIX(t0 - t7, 7);
		if (oflo & ~0xff) {
			int t;
			pix = 0;
			DOJPIXLIMIT(t0 + t7, 0);
			DOJPIXLIMIT(t1 + t6, 1);
			DOJPIXLIMIT(t2 + t5, 2);
			DOJPIXLIMIT(t3 + t4, 3);
			DID4PIX;
			DOJPIXLIMIT(t3 - t4, 4);
			DOJPIXLIMIT(t2 - t5, 5);
			DOJPIXLIMIT(t1 - t6, 6);
			DOJPIXLIMIT(t0 - t7, 7);
		}
		PSTORE;

		++tp;
		p += stride;
	}
}

/*
 * Inverse 2-D transform, similar to routine above (see comment above),
 * but more appropriate for H.261 instead of JPEG.  This routine does
 * not bias the output by 128, and has an additional argument which is
 * an input array which gets summed together with the inverse-transform.
 * For example, this allows motion-compensation to be folded in here,
 * saving an extra traversal of the block.  The input pointer can be
 * null, if motion-compensation is not needed.
 *
 * This routine does not take a quantization table, since the H.261
 * inverse quantizer is easily implemented via table lookup in the decoder.
 */
void
#ifdef INT_64
rdct(register short *bp, INT_64 m0, u_char* p, int stride, const u_char* in)
#else
rdct(register short *bp, u_int m0, u_int m1, u_char* p, int stride, const u_char *in)
#endif
{
	int tmp[64];
	int* tp = tmp;
	const int* qt = cross_stage;
	/*
	 * First pass is 1D transform over the rows of the input array.
	 */
	int i;
	for (i = 8; --i >= 0; ) {
		if ((m0 & 0xfe) == 0) {
			/*
			 * All ac terms are zero.
			 */
			int v = 0;
			if (M(0))
				v = qt[0] * bp[0];
			tp[0] = v;
			tp[1] = v;
			tp[2] = v;
			tp[3] = v;
			tp[4] = v;
			tp[5] = v;
			tp[6] = v;
			tp[7] = v;
		} else {
			int t4 = 0, t5 = 0, t6 = 0, t7 = 0;
			if (m0 & 0xaa) {
				/* odd part */
				if (M(1))
					t4 = qt[1] * bp[1];
				if (M(3))
					t5 = qt[3] * bp[3];
				if (M(5))
					t6 = qt[5] * bp[5];
				if (M(7))
					t7 = qt[7] * bp[7];

				int x0 = t6 - t5;
				t6 += t5;
				int x1 = t4 - t7;
				t7 += t4;

				t5 = FP_MUL(t7 - t6, A3);
				t7 += t6;

				t4 = FP_MUL(x1 + x0, A5);
				t6 = FP_MUL(x1, A4) - t4;
				t4 += FP_MUL(x0, A2);

				t7 += t6;
				t6 += t5;
				t5 += t4;
			}
			int t0 = 0, t1 = 0, t2 = 0, t3 = 0;
			if (m0 & 0x55) {
				/* even part */
				if (M(0))
					t0 = qt[0] * bp[0];
				if (M(2))
					t1 = qt[2] * bp[2];
				if (M(4))
					t2 = qt[4] * bp[4];
				if (M(6))
					t3 = qt[6] * bp[6];

				int x0 = FP_MUL(t1 - t3, A1);
				t3 += t1;
				t1 = t0 - t2;
				t0 += t2;
				t2 = t3 + x0;
				t3 = t0 - t2;
				t0 += t2;
				t2 = t1 - x0;
				t1 += x0;
			}
			tp[0] = t0 + t7;
			tp[1] = t1 + t6;
			tp[2] = t2 + t5;
			tp[3] = t3 + t4;
			tp[4] = t3 - t4;
			tp[5] = t2 - t5;
			tp[6] = t1 - t6;
			tp[7] = t0 - t7;
		}
		qt += 8;
		tp += 8;
		bp += 8;
		m0 >>= 8;
#ifndef INT_64
		m0 |= m1 << 24;
		m1 >>= 8;
#endif
	}
	tp -= 64;
	/*
	 * Second pass is 1D transform over the rows of the temp array.
	 */
	for (i = 8; --i >= 0; ) {
		int t4 = tp[8*1];
		int t5 = tp[8*3];
		int t6 = tp[8*5];
		int t7 = tp[8*7];
		if ((t4|t5|t6|t7) != 0) {
			/* odd part */
			int x0 = t6 - t5;
			t6 += t5;
			int x1 = t4 - t7;
			t7 += t4;

			t5 = FP_MUL(t7 - t6, A3);
			t7 += t6;

			t4 = FP_MUL(x1 + x0, A5);
			t6 = FP_MUL(x1, A4) - t4;
			t4 += FP_MUL(x0, A2);

			t7 += t6;
			t6 += t5;
			t5 += t4;
		}
		int t0 = tp[8*0];
		int t1 = tp[8*2];
		int t2 = tp[8*4];
		int t3 = tp[8*6];
		if ((t0|t1|t2|t3) != 0) {
			/* even part */
			int x0 = FP_MUL(t1 - t3, A1);
			t3 += t1;
			t1 = t0 - t2;
			t0 += t2;
			t2 = t3 + x0;
			t3 = t0 - t2;
			t0 += t2;
			t2 = t1 - x0;
			t1 += x0;
		}
		if (in != 0) {
			PIXDEF;
			DOPIXIN(t0 + t7, 0);
			DOPIXIN(t1 + t6, 1);
			DOPIXIN(t2 + t5, 2);
			DOPIXIN(t3 + t4, 3);
			DID4PIX;
			DOPIXIN(t3 - t4, 4);
			DOPIXIN(t2 - t5, 5);
			DOPIXIN(t1 - t6, 6);
			DOPIXIN(t0 - t7, 7);
			if (oflo & ~0xff) {
				int t;
				pix = 0;
				DOPIXINLIMIT(t0 + t7, 0);
				DOPIXINLIMIT(t1 + t6, 1);
				DOPIXINLIMIT(t2 + t5, 2);
				DOPIXINLIMIT(t3 + t4, 3);
				DID4PIX;
				DOPIXINLIMIT(t3 - t4, 4);
				DOPIXINLIMIT(t2 - t5, 5);
				DOPIXINLIMIT(t1 - t6, 6);
				DOPIXINLIMIT(t0 - t7, 7);
			}
			PSTORE;
			in += stride;
		} else {
			PIXDEF;
			DOPIX(t0 + t7, 0);
			DOPIX(t1 + t6, 1);
			DOPIX(t2 + t5, 2);
			DOPIX(t3 + t4, 3);
			DID4PIX;
			DOPIX(t3 - t4, 4);
			DOPIX(t2 - t5, 5);
			DOPIX(t1 - t6, 6);
			DOPIX(t0 - t7, 7);
			if (oflo & ~0xff) {
				int t;
				pix = 0;
				DOPIXLIMIT(t0 + t7, 0);
				DOPIXLIMIT(t1 + t6, 1);
				DOPIXLIMIT(t2 + t5, 2);
				DOPIXLIMIT(t3 + t4, 3);
				DID4PIX;
				DOPIXLIMIT(t3 - t4, 4);
				DOPIXLIMIT(t2 - t5, 5);
				DOPIXLIMIT(t1 - t6, 6);
				DOPIXLIMIT(t0 - t7, 7);
			}
			PSTORE;
		}
		tp += 1;
		p += stride;
	}
}

/*
 * This macro does the combined descale-and-quantize
 * multiply.  It truncates rather than rounds to give
 * the behavior required for the h.261 deadband quantizer.
 */
#define FWD_DandQ(v, iq) short((v) * qt[iq])

void fdct(const u_char* in, int stride, short* out, const float* qt)
{
	float tmp[64];
	float* tp = tmp;

	int i;
	for (i = 8; --i >= 0; ) {
		float x0, x1, x2, x3, t0, t1, t2, t3, t4, t5, t6, t7;
		t0 = float(in[0] + in[7]);
		t7 = float(in[0] - in[7]);
		t1 = float(in[1] + in[6]);
		t6 = float(in[1] - in[6]);
		t2 = float(in[2] + in[5]);
		t5 = float(in[2] - in[5]);
		t3 = float(in[3] + in[4]);
		t4 = float(in[3] - in[4]);

		/* even part */
		x0 = t0 + t3;
		x2 = t1 + t2;
		tp[8*0] = x0 + x2;
		tp[8*4] = x0 - x2;
    
		x1 = t0 - t3;
		x3 = t1 - t2;
		t0 = (x1 + x3) * FA1;
		tp[8*2] = x1 + t0;
		tp[8*6] = x1 - t0;

		/* odd part */
		x0 = t4 + t5;
		x1 = t5 + t6;
		x2 = t6 + t7;

		t3 = x1 * FA1;
		t4 = t7 - t3;

		t0 = (x0 - x2) * FA5;
		t1 = x0 * FA2 + t0;
		tp[8*3] = t4 - t1;
		tp[8*5] = t4 + t1;

		t7 += t3;
		t2 = x2 * FA4 + t0;
		tp[8*1] = t7 + t2;
		tp[8*7] = t7 - t2;
		
		in += stride;
		tp += 1;
	}
	tp -= 8;

	for (i = 8; --i >= 0; ) {
		float x0, x1, x2, x3, t0, t1, t2, t3, t4, t5, t6, t7;
		t0 = tp[0] + tp[7];
		t7 = tp[0] - tp[7];
		t1 = tp[1] + tp[6];
		t6 = tp[1] - tp[6];
		t2 = tp[2] + tp[5];
		t5 = tp[2] - tp[5];
		t3 = tp[3] + tp[4];
		t4 = tp[3] - tp[4];

		/* even part */
		x0 = t0 + t3;
		x2 = t1 + t2;
		out[0] = FWD_DandQ(x0 + x2, 0);
		out[4] = FWD_DandQ(x0 - x2, 4);
    
		x1 = t0 - t3;
		x3 = t1 - t2;
		t0 = (x1 + x3) * FA1;
		out[2] = FWD_DandQ(x1 + t0, 2);
		out[6] = FWD_DandQ(x1 - t0, 6);

		/* odd part */
		x0 = t4 + t5;
		x1 = t5 + t6;
		x2 = t6 + t7;

		t3 = x1 * FA1;
		t4 = t7 - t3;

		t0 = (x0 - x2) * FA5;
		t1 = x0 * FA2 + t0;
		out[3] = FWD_DandQ(t4 - t1, 3);
		out[5] = FWD_DandQ(t4 + t1, 5);

		t7 += t3;
		t2 = x2 * FA4 + t0;
		out[1] = FWD_DandQ(t7 + t2, 1);
		out[7] = FWD_DandQ(t7 - t2, 7);

		out += 8;
		tp += 8;
		qt += 8;
	}
}

/*
 * decimate the *rows* of the two input 8x8 DCT matrices into
 * a single output matrix.  we decimate rows rather than
 * columns even though we want column decimation because
 * the DCTs are stored in column order.
 */
void
dct_decimate(const short* in0, const short* in1, short* o)
{
	for (int k = 0; k < 8; ++k) {
		int x00 = in0[0];
		int x01 = in0[1];
		int x02 = in0[2];
		int x03 = in0[3];
		int x10 = in1[0];
		int x11 = in1[1];
		int x12 = in1[2];
		int x13 = in1[3];
#define X_N 4
#define X_5(v)  ((v) << (X_N - 1))
#define X_25(v)  ((v) << (X_N - 2))
#define X_125(v)  ((v) << (X_N - 3))
#define X_0625(v)  ((v) << (X_N - 4))
#define X_375(v) (X_25(v) + X_125(v))
#define X_625(v) (X_5(v) + X_125(v))
#define X_75(v) (X_5(v) + X_25(v))
#define X_6875(v) (X_5(v) + X_125(v) + X_0625(v))
#define X_1875(v) (X_125(v) + X_0625(v))
#define X_NORM(v) ((v) >> X_N)

		/*
		 * 0.50000000  0.09011998  0.00000000 0.10630376 
		 * 	0.50000000  0.09011998  0.00000000  0.10630376
		 * 0.45306372  0.28832037  0.03732892 0.08667963
		 * 	-0.45306372  0.11942621  0.10630376 -0.06764951
		 * 0.00000000  0.49039264  0.17677670 0.00000000
		 * 	0.00000000 -0.49039264 -0.17677670  0.00000000 
		 * -0.15909482  0.34009707  0.38408888 0.05735049
		 *	0.15909482  0.43576792 -0.09011998 -0.13845632
		 * 0.00000000 -0.03732892  0.46193977 0.25663998
		 * 	0.00000000 -0.03732892  0.46193977  0.25663998
		 * 0.10630376 -0.18235049  0.25663998 0.42361940 
		 *	-0.10630376 -0.16332037 -0.45306372 -0.01587282
		 * 0.00000000  0.00000000 -0.07322330 0.41573481
		 * 	0.00000000  0.00000000  0.07322330 -0.41573481
		 * -0.09011998  0.13399123 -0.18766514 0.24442621
		 *	0.09011998  0.13845632  0.15909482  0.47539609
		 */

		o[0] = X_NORM(X_5(x00 + x10) + X_0625(x01 + x11) +
			      X_125(x03 + x13));
		o[1] = X_NORM(X_5(x00 - x10) + X_25(x01) + X_0625(x03) +
			      X_125(x11 + x12));
		o[2] = X_NORM(X_5(x01 - x11) + X_1875(x02 + x12));
		o[3] = X_NORM(X_1875(x10 - x00) + X_375(x01 + x02) + 
			      X_5(x11) - X_125(x13));
		o[4] = X_NORM(X_5(x02 + x12) + X_25(x03 + x13));
		o[5] = X_NORM(X_125(x00 - x10) - X_1875(x01 + x11) +
			      X_25(x02) + X_5(x03 - x12));
		o[6] = X_NORM(X_625(x12 - x02) + X_375(x03 + x13));
		o[7] = X_NORM(X_125(x01 - x00 + x11 + x10 + x12) +
			      X_1875(x02) + X_25(x03) + X_5(x13));

		o += 8;
		in0 += 8;
		in1 += 8;
	}
}