/* libfame - Fast Assembly MPEG Encoder Library Copyright (C) 2000-2001 Vivien Chappelier This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Library General Public License for more details. You should have received a copy of the GNU Library General Public License along with this library; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /********************** MMX accelerated dequantisation *************************/ #define DEQUANTIZE_PRESCALE_STEP(x) \ "movq %%mm0, %%mm4\n" /* mm4 = mm0 */ \ "movq %%mm1, %%mm5\n" /* mm5 = mm1 */ \ "pmulhw 0x" #x "0(%3), %%mm0\n" /* premultiply for iDCT */ \ "pmulhw 0x" #x "8(%3), %%mm1\n" /* premultiply for iDCT */ \ "pmullw 0x" #x "0(%3), %%mm4\n" /* premultiply for iDCT */ \ "pmullw 0x" #x "8(%3), %%mm5\n" /* premultiply for iDCT */ \ "psrlw $0x0b, %%mm4\n" /* keep 5 bits */ \ "psrlw $0x0b, %%mm5\n" /* keep 5 bits */ \ "paddw " ASMSYM "_mmx_1, %%mm4\n" /* + 1 */ \ "paddw " ASMSYM "_mmx_1, %%mm5\n" /* + 1 */ \ "psrlw $0x01, %%mm4\n" /* keep 4 bits rounded */ \ "psrlw $0x01, %%mm5\n" /* keep 4 bits rounded */ \ "psllw $0x04, %%mm0\n" /* multiply by 16 for iDCT */ \ "psllw $0x04, %%mm1\n" /* multiply by 16 for iDCT */ \ "paddsw %%mm4, %%mm0\n" /* add least significant part */ \ "paddsw %%mm5, %%mm1\n" /* add least significant part */ \ "movq %%mm0, 0x" #x "0(%2)\n" /* store in cache */ \ "movq %%mm1, 0x" #x "8(%2)\n" /* store in cache */ #define DEQUANTIZE_GLOBAL_MISMATCH_CONTROL() \ "movq %%mm6, %%mm5\n" /* copy mismatch */ \ "psllq $0x20, %%mm5\n" /* mm5 = higher 32 bits */ \ "pxor %%mm6, %%mm5\n" /* sum mismatch */ \ "movq %%mm5, %%mm4\n" /* copy mismatch */ \ "psllq $0x10, %%mm5\n" /* mm5 = higher 16 bits */ \ "movq %%mm1, %%mm6\n" /* copy last line */ \ "pxor %%mm5, %%mm4\n" /* sum mismatch */ \ "movq %%mm7, %%mm3\n" /* mm3 = mm7 */ \ "pcmpeqw %%mm7, %%mm3\n" /* mm3 = 0xffffffffffffffff */ \ "psllq $0x3f, %%mm3\n" /* mm3 = 0x8000000000000000 */ \ "psrlq $0x0f, %%mm3\n" /* mm3 = 0x0001000000000000 */ \ "pxor %%mm3, %%mm6\n" /* temp last coeff ^= 1 */ \ "pand %%mm3, %%mm4\n" /* keep only lsb of mismatch */ \ "pxor %%mm4, %%mm6\n" /* temp last coeff ^= !(mismatch&1) */ \ "psubsw %%mm1, %%mm6\n" /* mismatch = temp last coeff - last coeff */ \ "psrlq $0x30, %%mm6\n" /* retrieve mismatch in lower word */ static void inline dequantize_intra_global(dct_t *block, dct_t *cache, dct_t *dqmatrix, dct_t *psmatrix, dct_t *mismatch) { unsigned int m; #define DEQUANTIZE_INTRA_GLOBAL_STEP(x) \ "movq 0x" #x "0(%0), %%mm0\n" /* load 1st line 1st half */ \ "movq 0x" #x "8(%0), %%mm1\n" /* load 1st line 2nd half */ \ "movq %%mm0, %%mm2\n" /* mm2 = 1st line 1st half */ \ "movq %%mm1, %%mm3\n" /* mm3 = 1st line 1st half */ \ "psraw $0x0f, %%mm2\n" /* mm2 = (sign(mm0) - 1) / 2 */ \ "psraw $0x0f, %%mm3\n" /* mm3 = (sign(mm1) - 1) / 2 */ \ "pmullw 0x" #x "0(%1), %%mm0\n" /* mm0=[0-3]*Q */ \ "pmullw 0x" #x "8(%1), %%mm1\n" /* mm1=[4-7]*Q */ \ "psllw $0x03, %%mm2\n" /* sign adjust before shift */ \ "psllw $0x03, %%mm3\n" /* sign adjust before shift */ \ "psubw %%mm2, %%mm0\n" /* sign adjust before shift */ \ "psubw %%mm3, %%mm1\n" /* sign adjust before shift */ \ "psraw $0x03, %%mm2\n" /* sign adjust before shift */ \ "psraw $0x03, %%mm3\n" /* sign adjust before shift */ \ "paddw %%mm2, %%mm0\n" /* sign adjust before shift */ \ "paddw %%mm3, %%mm1\n" /* sign adjust before shift */ \ "psraw $0x03, %%mm0\n" /* divide by 8 */ \ "psraw $0x03, %%mm1\n" /* divide by 8 */ \ "pxor %%mm0, %%mm6\n" /* accumulate mismatch */ \ "pxor %%mm1, %%mm6\n" /* accumulate mismatch */ asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = 0 */ "pxor %%mm6, %%mm6\n" /* mm6 = mismatch accumulator */ DEQUANTIZE_INTRA_GLOBAL_STEP(0) DEQUANTIZE_PRESCALE_STEP(0) DEQUANTIZE_INTRA_GLOBAL_STEP(1) DEQUANTIZE_PRESCALE_STEP(1) DEQUANTIZE_INTRA_GLOBAL_STEP(2) DEQUANTIZE_PRESCALE_STEP(2) DEQUANTIZE_INTRA_GLOBAL_STEP(3) DEQUANTIZE_PRESCALE_STEP(3) DEQUANTIZE_INTRA_GLOBAL_STEP(4) DEQUANTIZE_PRESCALE_STEP(4) DEQUANTIZE_INTRA_GLOBAL_STEP(5) DEQUANTIZE_PRESCALE_STEP(5) DEQUANTIZE_INTRA_GLOBAL_STEP(6) DEQUANTIZE_PRESCALE_STEP(6) DEQUANTIZE_INTRA_GLOBAL_STEP(7) DEQUANTIZE_GLOBAL_MISMATCH_CONTROL() DEQUANTIZE_PRESCALE_STEP(7) : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix) : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix) : "memory"); asm volatile("movd %%mm6, %0\n" /* export mismatch */ : "=r"(m) : /* no input */ ); *mismatch = (dct_t) (m<<12); } static void inline dequantize_intra_local(dct_t *block, dct_t *cache, dct_t *dqmatrix, dct_t *psmatrix, dct_t *mismatch /* not used */) { /* coeff[i] = (2*level[i]*qscale*matrix[i])/16 */ /* then coeff[i] = { coeff[i] + 1, if coeff[i] < 0 and coeff[i] is even */ /* { coeff[i] - 1, if coeff[i] > 0 and coeff[i] is even */ /* { coeff[i] otherwise */ /* implementation is */ /* coeff[i] = (level[i]*qscale*matrix[i]+(level[i]<0)?7:0)>>3 */ /* coeff[i] = (coeff[i]-(coeff[i]>0):1?0)|1 */ #define DEQUANTIZE_INTRA_LOCAL_STEP(x) \ "movq 0x" #x "0(%0), %%mm0\n" /* load 1st line 1st half */ \ "movq 0x" #x "8(%0), %%mm1\n" /* load 1st line 2nd half */ \ "movq %%mm0, %%mm2\n" /* mm2 = 1st line 1st half */ \ "movq %%mm1, %%mm3\n" /* mm3 = 1st line 2nd half */ \ "psraw $0x0f, %%mm2\n" /* mm2 = (sign(mm0) - 1) / 2 */ \ "psraw $0x0f, %%mm3\n" /* mm3 = (sign(mm1) - 1) / 2 */ \ "pmullw 0x" #x "0(%1), %%mm0\n" /* mm0=[0-3]*Q */ \ "pmullw 0x" #x "8(%1), %%mm1\n" /* mm1=[4-7]*Q */ \ "movq %%mm0, %%mm4\n" /* mm4 = mm0 */ \ "movq %%mm1, %%mm5\n" /* mm5 = mm1 */ \ "pcmpeqw %%mm7, %%mm4\n" /* mm4[0-3]=0xFF if mm4[0-3]==0 */ \ "pcmpeqw %%mm7, %%mm5\n" /* mm5[0-3]=0xFF if mm5[0-3]==0 */ \ "pcmpeqw %%mm7, %%mm4\n" /* mm4[0-3]=0xFF if mm0[0-3]!=0 */ \ "pcmpeqw %%mm7, %%mm5\n" /* mm5[0-3]=0xFF if mm1[0-3]!=0 */ \ "psllw $0x03, %%mm2\n" /* sign adjust before shift */ \ "psllw $0x03, %%mm3\n" /* sign adjust before shift */ \ "psubw %%mm2, %%mm0\n" /* sign adjust before shift */ \ "psubw %%mm3, %%mm1\n" /* sign adjust before shift */ \ "psraw $0x03, %%mm2\n" /* sign adjust before shift */ \ "psraw $0x03, %%mm3\n" /* sign adjust before shift */ \ "paddw %%mm2, %%mm0\n" /* sign adjust before shift */ \ "paddw %%mm3, %%mm1\n" /* sign adjust before shift */ \ "psraw $0x03, %%mm0\n" /* divide by 8 */ \ "psraw $0x03, %%mm1\n" /* divide by 8 */ \ "pcmpeqw %%mm7, %%mm2\n" /* invert sign */ \ "pcmpeqw %%mm7, %%mm3\n" /* invert sign */ \ "paddw %%mm2, %%mm0\n" /* sub 1 if >0 */ \ "paddw %%mm3, %%mm1\n" /* sub 1 if >0 */ \ "por " ASMSYM "_mmx_1, %%mm0\n" /* or 1 */ \ "por " ASMSYM "_mmx_1, %%mm1\n" /* or 1 */ \ "pand %%mm4, %%mm0\n" /* [0-3]=0 if [0-3] was zero */ \ "pand %%mm5, %%mm1\n" /* [4-7]=0 if [4-7] was zero */ asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = 0 */ "pxor %%mm6, %%mm6\n" /* mm6 = mismatch accumulator */ DEQUANTIZE_INTRA_LOCAL_STEP(0) DEQUANTIZE_PRESCALE_STEP(0) DEQUANTIZE_INTRA_LOCAL_STEP(1) DEQUANTIZE_PRESCALE_STEP(1) DEQUANTIZE_INTRA_LOCAL_STEP(2) DEQUANTIZE_PRESCALE_STEP(2) DEQUANTIZE_INTRA_LOCAL_STEP(3) DEQUANTIZE_PRESCALE_STEP(3) DEQUANTIZE_INTRA_LOCAL_STEP(4) DEQUANTIZE_PRESCALE_STEP(4) DEQUANTIZE_INTRA_LOCAL_STEP(5) DEQUANTIZE_PRESCALE_STEP(5) DEQUANTIZE_INTRA_LOCAL_STEP(6) DEQUANTIZE_PRESCALE_STEP(6) DEQUANTIZE_INTRA_LOCAL_STEP(7) DEQUANTIZE_PRESCALE_STEP(7) : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix) : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix) : "memory"); } static void inline dequantize_inter_global(dct_t *block, dct_t *cache, dct_t *dqmatrix, dct_t *psmatrix, dct_t *mismatch) { unsigned int m; #define DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(x) \ "movq 0x" #x "0(%0), %%mm4\n" /* load 1st line 1st half */ \ "pxor %%mm2, %%mm2\n" /* mm2 = 1st line 1st half */ \ "movq %%mm4, %%mm0\n" \ "movq 0x" #x "8(%0), %%mm5\n" /* load 1st line 2nd half */ \ "pxor %%mm3, %%mm3\n" /* mm3 = 1st line 1st half */ \ "movq %%mm5, %%mm1\n" \ "psllw $1, %%mm0\n" /* mm0 = 2*mm0 */ \ "pcmpgtw %%mm4, %%mm2\n" /* mm2 = (mm0<0)?0xffff:0x0000 */ \ "psllw $1, %%mm1\n" /* mm1 = 2*mm1 */ \ "pcmpgtw %%mm5, %%mm3\n" /* mm3 = (mm1<0)?0xffff:0x0000 */ \ "pxor %%mm2, %%mm0\n" /* mm0 = 2*|mm0|-(mm0<0)*/ \ "pxor %%mm3, %%mm1\n" /* mm1 = 2*|mm1|-(mm1<0)*/ \ "pcmpeqw %%mm7, %%mm4\n" /* mm4 = (mm0==0)?0xffff:0x0000 */ \ "pcmpeqw %%mm7, %%mm5\n" /* mm5 = (mm1==0)?0xffff:0x0000 */ \ "psubsw %%mm2, %%mm0\n" /* mm0 = 2*|mm0| */ \ "psubsw %%mm3, %%mm1\n" /* mm1 = 2*|mm1| */ \ "pcmpeqw %%mm7, %%mm4\n" /* mm4 = (mm0==0)?0x0000:0xffff */ \ "pcmpeqw %%mm7, %%mm5\n" /* mm5 = (mm1==0)?0x0000:0xffff */ \ "psubw %%mm4, %%mm0\n" /* mm0 = 2*|mm0|+(mm0!=0) */ \ "psubw %%mm5, %%mm1\n" /* mm1 = 2*|mm0|+(mm0!=0) */ \ "pmullw 0x" #x "0(%1), %%mm0\n" /* mm0=(2*|mm0|+1)*Q */ \ "pmullw 0x" #x "8(%1), %%mm1\n" /* mm1=(2*|mm0|+1)*Q */ \ "psraw $0x04, %%mm0\n" /* divide by 16 */ \ "psraw $0x04, %%mm1\n" /* divide by 16 */ \ "pxor %%mm2, %%mm0\n" /* mm0 =(2*|mm0|+1)*Q*sign(mm0)-(mm0<0)*/ \ "pxor %%mm3, %%mm1\n" /* mm1 =(2*|mm1|+1)*Q*sign(mm1)-(mm1<0)*/ \ "psubsw %%mm2, %%mm0\n" /* mm0 =(2*|mm0|+1)*Q*sign(mm0) */ \ "psubsw %%mm3, %%mm1\n" /* mm1 =(2*|mm1|+1)*Q*sign(mm1) */ \ "pxor %%mm0, %%mm6\n" /* accumulate mismatch */ \ "pxor %%mm1, %%mm6\n" /* accumulate mismatch */ asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = 0 */ "pxor %%mm6, %%mm6\n" /* mm6 = mismatch accumulator */ DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(0) DEQUANTIZE_PRESCALE_STEP(0) DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(1) DEQUANTIZE_PRESCALE_STEP(1) DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(2) DEQUANTIZE_PRESCALE_STEP(2) DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(3) DEQUANTIZE_PRESCALE_STEP(3) DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(4) DEQUANTIZE_PRESCALE_STEP(4) DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(5) DEQUANTIZE_PRESCALE_STEP(5) DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(6) DEQUANTIZE_PRESCALE_STEP(6) DEQUANTIZE_INTER_GLOBAL_DEQUANT_STEP(7) DEQUANTIZE_GLOBAL_MISMATCH_CONTROL() /* WARNING : mismatch control is too small and would be zeroed */ /* by prescale. This would cause artifacts on the long term */ /* since the last coefficient has high chances of being 0 */ /* and thus should be rounded up most of the time. */ /* Thus we accumulate mismatch instead until it gets */ /* large enough to produce significant output after iDCT */ /* resetting the accumulator when the block is coded intra */ DEQUANTIZE_PRESCALE_STEP(7) : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix) : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix) : "memory"); asm volatile("movd %%mm6, %0\n" /* export mismatch */ : "=r"(m) : /* no input */ ); *mismatch += (dct_t) (m<<12); /* threshold is ((1 << 16)/(16*psmatrix[63]) * (1 << 12) + 0.5) = 26887 */ #define MISMATCH_THRESHOLD 26887 if(*mismatch > MISMATCH_THRESHOLD) { /* after this threshold, prescaled mismatch is >= 1 */ cache[63] ++; /* add mismatch */ *mismatch -= MISMATCH_THRESHOLD; } if(*mismatch < (-26887)) { cache[63] --; /* sub mismatch */ *mismatch += MISMATCH_THRESHOLD; } } static void inline dequantize_inter_local(dct_t *block, dct_t *cache, dct_t *dqmatrix, dct_t *psmatrix, dct_t *mismatch /* not used */) { /* coeff[i] = ((2*level[i]+sign(level[i]))*qscale*matrix[i])/16 */ /* then coeff[i] = { coeff[i] + 1, if coeff[i] < 0 and coeff[i] is even */ /* { coeff[i] - 1, if coeff[i] > 0 and coeff[i] is even */ /* { coeff[i] otherwise */ /* TODO: check efficiency of new inter_global method on this */ #define DEQUANTIZE_INTER_LOCAL_STEP(x) \ "movq 0x" #x "0(%0), %%mm0\n" /* load 1st line 1st half */ \ "movq 0x" #x "8(%0), %%mm1\n" /* load 1st line 2nd half */ \ "movq %%mm0, %%mm2\n" /* mm2 = 1st line 1st half */ \ "movq %%mm1, %%mm3\n" /* mm3 = 1st line 1st half */ \ "psraw $0x0f, %%mm2\n" /* mm2 = (sign(mm0) - 1) / 2 */ \ "psraw $0x0f, %%mm3\n" /* mm3 = (sign(mm0) - 1) / 2 */ \ "paddsw %%mm2, %%mm0\n" /* mm0 = [0-3]+(sign([0-3])-1)/2*/ \ "paddsw %%mm3, %%mm1\n" /* mm1 = [4-7]+(sign([0-3])-1)/2*/ \ "paddsw %%mm0, %%mm0\n" /* mm0 = 2*[0-3]+sign([0-3])-1 */ \ "paddsw %%mm1, %%mm1\n" /* mm1 = 2*[4-7]+sign([4-7])-1 */ \ "pmullw 0x" #x "0(%1), %%mm0\n" /* mm0=(2*[0-3]+sign([0-3])-1)*Q*/ \ "pmullw 0x" #x "8(%1), %%mm1\n" /* mm1=(2*[4-7]+sign([4-7])-1)*Q*/ \ "movq %%mm0, %%mm4\n" /* mm4 = mm0 */ \ "movq %%mm1, %%mm5\n" /* mm5 = mm1 */ \ "paddsw 0x" #x "0(%1), %%mm0\n" /* mm0=(2*[0-3]+sign([0-3]))*Q*/ \ "paddsw 0x" #x "8(%1), %%mm1\n" /* mm1=(2*[4-7]+sign([4-7]))*Q*/ \ "pcmpeqw %%mm7, %%mm4\n" /* mm4[0-3]=0xFF if mm4[0-3]==0 */ \ "pcmpeqw %%mm7, %%mm5\n" /* mm5[0-3]=0xFF if mm5[0-3]==0 */ \ "pcmpeqw %%mm7, %%mm4\n" /* mm4[0-3]=0xFF if mm0[0-3]!=0 */ \ "pcmpeqw %%mm7, %%mm5\n" /* mm5[0-3]=0xFF if mm1[0-3]!=0 */ \ "psllw $0x04, %%mm2\n" /* sign adjust before shift */ \ "psllw $0x04, %%mm3\n" /* sign adjust before shift */ \ "psubw %%mm2, %%mm0\n" /* sign adjust before shift */ \ "psubw %%mm3, %%mm1\n" /* sign adjust before shift */ \ "psraw $0x04, %%mm2\n" /* sign adjust before shift */ \ "psraw $0x04, %%mm3\n" /* sign adjust before shift */ \ "paddw %%mm2, %%mm0\n" /* sign adjust before shift */ \ "paddw %%mm3, %%mm1\n" /* sign adjust before shift */ \ "psraw $0x04, %%mm0\n" /* divide by 16 */ \ "psraw $0x04, %%mm1\n" /* divide by 16 */ \ "pcmpeqw %%mm7, %%mm2\n" /* invert sign */ \ "pcmpeqw %%mm7, %%mm3\n" /* invert sign */ \ "paddw %%mm2, %%mm0\n" /* sub 1 if >0 */ \ "paddw %%mm3, %%mm1\n" /* sub 1 if >0 */ \ "por " ASMSYM "_mmx_1, %%mm0\n" /* or 1 */ \ "por " ASMSYM "_mmx_1, %%mm1\n" /* or 1 */ \ "pand %%mm4, %%mm0\n" /* [0-3]=0 if [0-3] was zero */ \ "pand %%mm5, %%mm1\n" /* [4-7]=0 if [4-7] was zero */ asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = 0 */ "pxor %%mm6, %%mm6\n" /* mm6 = mismatch accumulator */ DEQUANTIZE_INTER_LOCAL_STEP(0) DEQUANTIZE_PRESCALE_STEP(0) DEQUANTIZE_INTER_LOCAL_STEP(1) DEQUANTIZE_PRESCALE_STEP(1) DEQUANTIZE_INTER_LOCAL_STEP(2) DEQUANTIZE_PRESCALE_STEP(2) DEQUANTIZE_INTER_LOCAL_STEP(3) DEQUANTIZE_PRESCALE_STEP(3) DEQUANTIZE_INTER_LOCAL_STEP(4) DEQUANTIZE_PRESCALE_STEP(4) DEQUANTIZE_INTER_LOCAL_STEP(5) DEQUANTIZE_PRESCALE_STEP(5) DEQUANTIZE_INTER_LOCAL_STEP(6) DEQUANTIZE_PRESCALE_STEP(6) DEQUANTIZE_INTER_LOCAL_STEP(7) DEQUANTIZE_PRESCALE_STEP(7) : "=r"(block), "=r"(dqmatrix), "=r"(cache), "=r"(psmatrix) : "0"(block), "1"(dqmatrix), "2"(cache), "3"(psmatrix) : "memory"); }