/* * idct_248.c * * Copyright (C) Charles 'Buck' Krasic - May 2000 * Copyright (C) Erik Walthinsen - May 2000 * * This file is part of libdv, a free DV (IEC 61834/SMPTE 314M) * codec. * * libdv is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2, or (at your * option) any later version. * * libdv is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with GNU Make; see the file COPYING. If not, write to * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. * * The libdv homepage is http://libdv.sourceforge.net/. */ /* Attempt at efficient (?) 2-4-8 IDCT as defined by the DV standard, * based on matrix factorizations presented in "Direct Conversions * Between DV Format DCT and Ordinary DCT". Neri Merhav. Technical * Report: HP Laboratories Israel HPL-98-140. August, 1988. * * Note: this is very much an interim solution. It is based on the * same foundation (AAN algorithm) as the 88 MMX idct from Intel. * * This version uses 32 bit math. For MMX we will use 16bit, but that * is fairly subtle. We also need to think hard about the best way to * merge quant/weight/prescale without jeprodizing precision. * * That said, this is way faster the brute force float version. */ #include "dv_types.h" #include #include #define IDCT_248_UNIT_TEST 0 dv_248_coeff_t dv_idct_248_prescale[64]; /* beta2 = cos(M_PI/4); beta0 = beta2 - 0.5; beta1 = -1 - beta0; beta3 = -cos(3 * M_PI / 8); beta4 = cos(M_PI / 8); */ static gint32 beta0; static gint32 beta1; static gint32 beta2; static gint32 beta3; static gint32 beta4; static gdouble C(gint u) { gdouble result; if(u == 0) { result = 0.5 / sqrt(2.0); } else { result = 0.5; } // else return(result); } // C static gdouble tickC(gint u) { gdouble result; if(u == 0) { result = 1.0 / sqrt(2.0); } else { result = 0.5; } // else return(result); } // tickC static inline gint32 fixed_multiply(gint32 a, gint32 b) { #ifdef ARCH_X86 register gint32 eax asm("ax"); register gint32 edx asm("dx"); eax = a; __asm__("imull %2" :"=a" (eax), "=d" (edx) :"g" (b), "0" (eax)); return(edx << 2); #else return (gint32)(((guint64)a * (guint64)b) >> 30); #endif } // fixed_multiply /* Compute the prescale vector. * (verify against matlab result for kron(inv(D2),D)) */ void dv_dct_248_init() { extern double dv_weight_inverse_248_matrix[]; gint k, l; gdouble d; gdouble diag[2][8]; gdouble dbeta0, dbeta1, dbeta2, dbeta3, dbeta4; dbeta2 = cos(M_PI/4); dbeta0 = dbeta2 - 0.5; dbeta1 = -1 - dbeta0; dbeta3 = -cos(3 * M_PI / 8); dbeta4 = cos(M_PI / 8); beta0 = dbeta0 * pow(2,30); beta1 = dbeta1 * pow(2,30); beta2 = dbeta2 * pow(2,30); beta3 = dbeta3 * pow(2,30); beta4 = dbeta4 * pow(2,30); for(k=0; k<4;k++) { d = C(k) / (2.0 * cos( (M_PI * (double)k) / 8.0 )); diag[0][k] = diag[0][k+4] = d; } // for for(k=0; k<8;k++) { diag[1][k] = tickC(k) / (2.0 * cos( M_PI * k / 16.0 )); } // for for(k=0; k<8;k++) { for(l= 0; l<8 ; l++) { // Note the 2^16 shift is for fixed point precision. dv_idct_248_prescale[k*8+l] = 1.0/diag[0][k] * diag[1][l] * pow(2.0,14.0); dv_idct_248_prescale[k*8+l] *= dv_weight_inverse_248_matrix[k*8+l]; } // for } // for } // dv_dct_248_init /* Total cost: 144 mults, 576 adds, 144 shifts. AAN is cited as having cost 144 mults, 464 mults. Doing some CSE below would probably get us there. In principle, 2-4-8 is less complex than 88, since one of the 1D's consists of 2 order 4 DCTS instead of an order 8, and DCT's complexity is super-linear. Even so, this is a big improvement over brute force, which requires 4096 floating point multiplies. */ /* Let gcc make these into shifts if it wants... */ #define DIV_TWO(A) ((A) / 2) #define DIV_FOUR(A) ((A) / 4) void dv_idct_248(dv_248_coeff_t *x248, dv_coeff_t *out) { dv_248_coeff_t tmp[64]; dv_248_coeff_t *in, *lhs; dv_248_coeff_t u,v,w,z; dv_248_coeff_t in0, in1, in2, in3, in4, in5, in6, in7; gint i; // Now, tmp = inv(h2) * inv(g2) * (prescale = inv(d2) * x248 * d) // 32 mults, 64 adds, 80 shifts, 16 negates in = x248; lhs = tmp; #if IDCT_248_UNIT_TEST printf("\nt0:\n"); for(i=0;i<64; i++) { printf("%d ", (in[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST for(i=0; i<8; i++) { u = in[0+i]; v = in[2*8+i]; w = in[1*8+i]; z = in[3*8+i]; lhs[0*8+i] = DIV_FOUR(u) + DIV_TWO(v); lhs[1*8+i] = DIV_FOUR(u) - DIV_TWO(v); lhs[2*8+i] = fixed_multiply(w,beta0) + fixed_multiply(z,beta1); lhs[3*8+i] = -(DIV_TWO(w+z)); u = in[4*8+i]; v = in[6*8+i]; w = in[5*8+i]; z = in[7*8+i]; lhs[4*8+i] = DIV_FOUR(u) + DIV_TWO(v); lhs[5*8+i] = DIV_FOUR(u) - DIV_TWO(v); lhs[6*8+i] = fixed_multiply(w,beta0) + fixed_multiply(z,beta1); lhs[7*8+i] = -(DIV_TWO(w+z)); } // for #if IDCT_248_UNIT_TEST printf("\nt1:\n"); for(i=0;i<64; i++) { printf("%d ", (lhs[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST in = tmp; lhs = x248; // Do lhs = inv(f) * inv(L2) * in (butterfly) // 192 adds, 64 shifts for(i=0; i<8; i++) { u = in[8*0+i]; v = in[8*3+i]; w = in[8*4+i]; z = in[8*7+i]; lhs[8*0+i] = DIV_FOUR(u - v + w - z); lhs[8*1+i] = DIV_FOUR(u - v - w + z); lhs[8*6+i] = DIV_FOUR(u + v + w + z); lhs[8*7+i] = DIV_FOUR(u + v - w - z); u = in[8*1+i]; v = in[8*2+i]; w = in[8*5+i]; z = in[8*6+i]; lhs[i+8*2] = DIV_FOUR(u + v + w + z); lhs[i+8*3] = DIV_FOUR(u + v - w - z); lhs[i+8*4] = DIV_FOUR(u - v + w - z); lhs[i+8*5] = DIV_FOUR(u - v - w + z); } // for #if IDCT_248_UNIT_TEST printf("\nt2:\n"); for(i=0;i<64; i++) { printf("%d ", (lhs[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST in = x248; lhs = tmp; // Do lhs = in * p * b1 * b2 * m // 48 mults, 48 adds for(i=0; i<8; i++) { lhs[i*8+0] = in[i*8+0]; lhs[i*8+1] = in[i*8+4]; u = in[i*8+2]; v = in[i*8+6]; lhs[i*8+2] = fixed_multiply(u - v,beta2); lhs[i*8+3] = u + v; u = in[i*8+1]; v = in[i*8+3]; w = in[i*8+5]; z = in[i*8+7]; lhs[i*8+4] = fixed_multiply(u - z,beta3) + fixed_multiply(v - w,beta4); lhs[i*8+5] = fixed_multiply(u - v - w + z,beta2); lhs[i*8+6] = fixed_multiply(u - z,beta4) + fixed_multiply(w - v,beta3); lhs[i*8+7] = u + v + w + z; } // for #if IDCT_248_UNIT_TEST printf("\nt3:\n"); for(i=0;i<64; i++) { printf("%d ", (lhs[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST in = lhs; lhs = x248; // Do lhs = in * a1 * a2 * a3 (butterflys...) // 272 adds (will gcc factor some of these out?) for(i=0; i<8; i++) { in0 = in[i*8+0]; in1 = in[i*8+1]; in2 = in[i*8+2]; in3 = in[i*8+3]; in4 = in[i*8+4]; in5 = in[i*8+5]; in6 = in[i*8+6]; in7 = in[i*8+7]; lhs[i*8+0] = in0 + in1 + in2 + in3 + in6 + in7; lhs[i*8+1] = in0 - in1 + in2 + in5 + in6; lhs[i*8+2] = in0 - in1 - in2 - in4 + in5; lhs[i*8+3] = in0 + in1 - in2 - in3 - in4; lhs[i*8+4] = in0 + in1 - in2 - in3 + in4; lhs[i*8+5] = in0 - in1 - in2 + in4 - in5; lhs[i*8+6] = in0 - in1 + in2 - in5 - in6; lhs[i*8+7] = in0 + in1 + in2 + in3 - in6 - in7; } // for #if IDCT_248_UNIT_TEST printf("\nout:\n"); for(i=0;i<64; i++) { printf("%d ", (lhs[i] + 0x2000) >> 14); if((i+1) % 8 == 0) printf("\n"); } // for #endif // IDCT_248_UNIT_TEST for(i=0; i<64; i++) out [i] = (lhs[i] + 0x2000) >> 14; } // dv_idct_248 #if IDCT_248_UNIT_TEST // This is a test matrix generated from forward 2-4-8 dct using matlab static gdouble dv_idct_248_test_x248[64] = { -126.6250, -227.4838, -111.9136, 0.2316, 19.8750, 0.3386, -0.0514, -0.0334, 14.7008, 17.6504, -8.5810, -16.1455, -0.0676, 0.1048, 0.2134, -0.7044, 0.1250, 0.5249, -0.1633, -0.5139, 0.1250, -0.1308, -0.0676, -0.0758, 0.5404, -0.4475, -0.0366, 0.3909, 0.1633, 0.0426, 0.0810, 0.2244, 25.8750, 15.7781, -15.5290, -22.2636, -0.6250, 4.3342, 0.0733, -0.4023, 1.6544, -3.9592, 4.2829, -0.1547, -3.8753, 0.0593, -0.2866, 0.8777, -0.3750, -3.9695, -0.1633, 4.5689, 0.6250, 0.3619, -0.0676, -0.5475, 4.3208, 0.2903, 0.4634, -0.5293, 0.1169, 0.4600, 0.2171, -0.2753 }; // dv_idct24_test_x248 int main(int argc, char **argv) { gint i; dv_248_coeff_t x248[64]; dv_init_dct248(); /* prescale - 64 mults */ for(i=0; i<64; i++) { x248[i] = dv_idct_248_test_x248[i] * dv_idct_248_prescale[i]; } dv_idct_248(x248); exit(0); } // main #endif //IDCT_248_UNIT_TEST