/*
    libfame - Fast Assembly MPEG Encoder Library
    Copyright (C) 2000-2001 Vivien Chappelier

    This library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Library General Public
    License as published by the Free Software Foundation; either
    version 2 of the License, or (at your option) any later version.

    This library is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    Library General Public License for more details.

    You should have received a copy of the GNU Library General Public
    License along with this library; if not, write to the Free
    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
/*********************** fetch data from current picture *********************/

/*
 * General notes on the routines in this file:
 *
 *  - Every routine handles an 8x8 block: eight rows of eight bytes are
 *    read and written out as eight rows of eight 16-bit words, i.e. 0x80
 *    bytes are stored through `output'.
 *    NOTE(review): this presumes dct_t (declared elsewhere) is a 16-bit
 *    type -- confirm.
 *  - Pointers are bound to `int' operands and advanced with `addl', so the
 *    asm as written is only meaningful on 32-bit x86.
 *  - Several routines are split over consecutive asm statements that pass
 *    values to each other in mm5/mm6/mm7 without declaring it to the
 *    compiler; nothing that touches MMX/x87 state may be inserted between
 *    those statements.
 */

/* Copy mm0/mm2 to mm1/mm3 and zero-extend all four halves to words.
   mm7 must be zero. */
#define FETCH_MMX_WIDEN                                                      \
  "movq %%mm0, %%mm1\n"        /* copy 1st texture line */                   \
  "movq %%mm2, %%mm3\n"        /* copy 2nd texture line */                   \
  "punpcklbw %%mm7, %%mm0\n"   /* unpack texture to word */                  \
  "punpckhbw %%mm7, %%mm1\n"   /* unpack texture to word */                  \
  "punpcklbw %%mm7, %%mm2\n"   /* unpack texture to word */                  \
  "punpckhbw %%mm7, %%mm3\n"   /* unpack texture to word */

/* Store two widened rows (mm0..mm3) at byte offsets 0x<x>0 and 0x<y>0
   from operand %1. */
#define FETCH_MMX_STORE(x, y)                                                \
  "movq %%mm0, 0x" #x "0(%1)\n" /* store texture */                          \
  "movq %%mm1, 0x" #x "8(%1)\n" /* store texture */                          \
  "movq %%mm2, 0x" #y "0(%1)\n" /* store texture */                          \
  "movq %%mm3, 0x" #y "8(%1)\n" /* store texture */

/* prefetch_withoutmask: fetch two rows (%0 = input, %2 = pitch), step the
   input two rows down and store the widened rows. */
#define FETCH_MMX_PLAIN_STEP(x, y)                                           \
  "movq (%0), %%mm0\n"         /* load 1st texture line */                   \
  "movq (%0, %2), %%mm2\n"     /* load 2nd texture line */                   \
  FETCH_MMX_WIDEN                                                            \
  "addl %2, %0\n"              /* move one texture line down */              \
  "addl %2, %0\n"              /* move one texture line down */              \
  FETCH_MMX_STORE(x, y)

/* Mean computation, common tail: mm0/mm2 hold two texture rows, mm1/mm3
   the matching 0x00/0xFF byte masks.  Adds the visible pixel count to the
   per-column byte counters in mm6 (subtracting 0xFF, i.e. -1, counts up)
   and the visible pixel values to the word sums in mm5. */
#define FETCH_MMX_MASKED_SUM                                                 \
  "pand %%mm1, %%mm0\n"        /* mask 1st texture line */                   \
  "pand %%mm3, %%mm2\n"        /* mask 2nd texture line */                   \
  "psubsb %%mm1, %%mm6\n"      /* accumulate mask */                         \
  "psubsb %%mm3, %%mm6\n"      /* accumulate mask */                         \
  FETCH_MMX_WIDEN                                                            \
  "paddw %%mm0, %%mm5\n"       /* accumulate texture */                      \
  "paddw %%mm1, %%mm5\n"       /* accumulate texture */                      \
  "paddw %%mm2, %%mm5\n"       /* accumulate texture */                      \
  "paddw %%mm3, %%mm5\n"       /* accumulate texture */

/* prefetch_Y_withmask, mean computation: accumulate two rows
   (%0 = input, %1 = pitch, %2 = mask). */
#define FETCH_MMX_Y_SUM_ROWS                                                 \
  "movq (%0), %%mm0\n"         /* load 1st texture line */                   \
  "movq (%0, %1), %%mm2\n"     /* load 2nd texture line */                   \
  "movq (%2), %%mm1\n"         /* load 1st mask line */                      \
  "movq (%2, %1), %%mm3\n"     /* load 2nd mask line */                      \
  "pcmpgtb %%mm7, %%mm1\n"     /* saturate 1st mask line */                  \
  "pcmpgtb %%mm7, %%mm3\n"     /* saturate 2nd mask line */                  \
  FETCH_MMX_MASKED_SUM

/* prefetch_Y_withmask, mean computation: step input and mask two rows. */
#define FETCH_MMX_Y_SUM_NEXT                                                 \
  "addl %1, %0\n"              /* move one texture line down */              \
  "addl %1, %0\n"              /* move one texture line down */              \
  "addl %1, %2\n"              /* move one mask line down */                 \
  "addl %1, %2\n"              /* move one mask line down */

/* Horizontal reduction of the accumulators: the eight byte counters of
   mm6 end up summed in eax, the four word sums of mm5 in ecx. */
#define FETCH_MMX_REDUCE                                                     \
  "movq %%mm6, %%mm0\n"        /* copy column pixel counts */                \
  "psrlq $32, %%mm6\n"         /* move high dword low */                     \
  "paddb %%mm6, %%mm0\n"       /* sum column counts */                       \
  "movd %%mm0, %%eax\n"        /* get 4 packed counts to register eax */     \
  "movl %%eax, %%ecx\n"        /* copy register eax to register ecx */       \
  "bswap %%ecx\n"              /* swap temporary register ecx */             \
  "addw %%cx, %%ax\n"          /* sum low words */                           \
  "addb %%ah, %%al\n"          /* sum low bytes */                           \
  "andl $0x000000ff, %%eax\n"  /* mask final value */                        \
  "movq %%mm5, %%mm1\n"        /* copy column pixel partial sums */          \
  "psrlq $32, %%mm5\n"         /* move high dword low */                     \
  "paddw %%mm5, %%mm1\n"       /* sum partial sums */                        \
  "movq %%mm1, %%mm0\n"        /* copy column pixel partial sums */          \
  "psrlq $16, %%mm0\n"         /* move high word low */                      \
  "paddw %%mm0, %%mm1\n"       /* sum partial sums */                        \
  "movd %%mm1, %%ecx\n"        /* get result in ecx */                       \
  "andl $0x0000ffff, %%ecx\n"  /* mask final value */

/* Broadcast the low byte of ecx to all eight bytes of mm6. */
#define FETCH_MMX_SPLAT_MEAN                                                 \
  "movd %%ecx, %%mm6\n"        /* mm6 will hold mean value */                \
  "punpcklbw %%mm6, %%mm6\n"   /* replicate mean to word */                  \
  "punpcklwd %%mm6, %%mm6\n"   /* replicate mean to dword */                 \
  "punpckldq %%mm6, %%mm6\n"   /* replicate mean to qword */

/* prefetch_Y_withmask, fill stage: fetch two rows (%0 = input, %2 = pitch,
   %3 = mask), replace masked-out pixels by the mean held in mm6, store the
   widened rows at 0x<x>0 / 0x<y>0 from %1 and step two rows down. */
#define PREFETCH_Y_MASK_STEP(x, y)                                           \
  "movq (%0), %%mm0\n"         /* load 1st texture line */                   \
  "movq (%0, %2), %%mm2\n"     /* load 2nd texture line */                   \
  "movq (%3), %%mm1\n"         /* load 1st mask line */                      \
  "movq (%3, %2), %%mm3\n"     /* load 2nd mask line */                      \
  "pcmpgtb %%mm7, %%mm1\n"     /* saturate 1st mask line */                  \
  "pcmpgtb %%mm7, %%mm3\n"     /* saturate 2nd mask line */                  \
  "pand %%mm1, %%mm0\n"        /* mask 1st texture line */                   \
  "pand %%mm3, %%mm2\n"        /* mask 2nd texture line */                   \
  "pcmpeqb %%mm7, %%mm1\n"     /* invert mask */                             \
  "pcmpeqb %%mm7, %%mm3\n"     /* invert mask */                             \
  "movq %%mm6, %%mm4\n"        /* load mean value */                         \
  "movq %%mm6, %%mm5\n"        /* load mean value */                         \
  "pand %%mm1, %%mm4\n"        /* mask mean for 1st texture line */          \
  "pand %%mm3, %%mm5\n"        /* mask mean for 2nd texture line */          \
  "por %%mm4, %%mm0\n"         /* join texture and mean */                   \
  "por %%mm5, %%mm2\n"         /* join texture and mean */                   \
  "movq %%mm0, %%mm1\n"        /* copy 1st texture line */                   \
  "movq %%mm2, %%mm3\n"        /* copy 2nd texture line */                   \
  "punpcklbw %%mm7, %%mm0\n"   /* unpack texture to word */                  \
  "punpckhbw %%mm7, %%mm1\n"   /* unpack texture to word */                  \
  "punpcklbw %%mm7, %%mm2\n"   /* unpack texture to word */                  \
  "punpckhbw %%mm7, %%mm3\n"   /* unpack texture to word */                  \
  "movq %%mm0, 0x" #x "0(%1)\n" /* store texture */                          \
  "movq %%mm1, 0x" #x "8(%1)\n" /* store texture */                          \
  "movq %%mm2, 0x" #y "0(%1)\n" /* store texture */                          \
  "movq %%mm3, 0x" #y "8(%1)\n" /* store texture */                          \
  "addl %2, %0\n"              /* move one texture line down */              \
  "addl %2, %0\n"              /* move one texture line down */              \
  "addl %2, %3\n"              /* move one mask line down */                 \
  "addl %2, %3\n"              /* move one mask line down */

/* prefetch_C_withmask: build the 8-byte mask of one chroma row in `dst'.
   `m' is the mask pointer operand and `p' the pitch operand, all passed as
   string literals.  Two mask rows, 2*pitch bytes apart and 16 bytes wide,
   are OR-ed together; horizontal byte pairs are then merged by packsswb
   and turned into 0x00/0xFF by the double compare against zero (mm7), so
   a chroma pixel is visible as soon as one of its four mask bytes is
   non-zero.  The mask pointer is advanced by 4*pitch.  Uses mm0 and mm2
   as scratch. */
#define FETCH_MMX_C_MASK_ROW(m, p, dst)                                      \
  "movq (" m "), %%mm0\n"             /* load mask line up left part */      \
  "movq (" m ", " p ", 2), " dst "\n" /* load mask line down left part */    \
  "por %%mm0, " dst "\n"              /* conservative subsample */           \
  "movq 8(" m "), %%mm0\n"            /* load mask line up right part */     \
  "movq 8(" m ", " p ", 2), %%mm2\n"  /* load mask line down right part */   \
  "por %%mm0, %%mm2\n"                /* conservative subsample */           \
  "packsswb %%mm2, " dst "\n"  /* 0000->00 00FF->7F FF00->80 FFFF->FF */     \
  "pcmpeqb %%mm7, " dst "\n"   /* 0000->FF 00FF->00 FF00->00 FFFF->00 */     \
  "pcmpeqb %%mm7, " dst "\n"   /* 0000->00 00FF->FF FF00->FF FFFF->FF */     \
  "addl " p ", " m "\n"               /* move one mask line down */          \
  "addl " p ", " m "\n"                                                      \
  "addl " p ", " m "\n"                                                      \
  "addl " p ", " m "\n"

/* prefetch_C_withmask, mean computation: accumulate two chroma rows
   (%0 = input, %1 = pitch, %2 = mask) and step the input two rows down. */
#define FETCH_MMX_C_SUM_STEP                                                 \
  FETCH_MMX_C_MASK_ROW("%2", "%1", "%%mm1")                                  \
  FETCH_MMX_C_MASK_ROW("%2", "%1", "%%mm3")                                  \
  "movq (%0), %%mm0\n"         /* load 1st texture line */                   \
  "movq (%0, %1), %%mm2\n"     /* load 2nd texture line */                   \
  FETCH_MMX_MASKED_SUM                                                       \
  "addl %1, %0\n"              /* move one texture line down */              \
  "addl %1, %0\n"              /* move one texture line down */

/* prefetch_C_withmask, fill stage: fetch two chroma rows (%0 = input,
   %2 = pitch, %3 = mask), replace masked-out pixels by the mean held in
   mm6, store the widened rows at 0x<x>0 / 0x<y>0 from %1 and step the
   input two rows down. */
#define FETCH_MMX_C_FILL_STEP(x, y)                                          \
  FETCH_MMX_C_MASK_ROW("%3", "%2", "%%mm1")                                  \
  FETCH_MMX_C_MASK_ROW("%3", "%2", "%%mm3")                                  \
  "movq (%0), %%mm0\n"         /* load 1st texture line */                   \
  "movq (%0, %2), %%mm2\n"     /* load 2nd texture line */                   \
  "pand %%mm1, %%mm0\n"        /* mask 1st texture line */                   \
  "pand %%mm3, %%mm2\n"        /* mask 2nd texture line */                   \
  "pcmpeqb %%mm7, %%mm1\n"     /* invert mask */                             \
  "pcmpeqb %%mm7, %%mm3\n"     /* invert mask */                             \
  "movq %%mm6, %%mm4\n"        /* load mean value */                         \
  "pand %%mm1, %%mm4\n"        /* mask mean for 1st texture line */          \
  "por %%mm4, %%mm0\n"         /* join texture and mean */                   \
  "movq %%mm6, %%mm4\n"        /* load mean value */                         \
  "pand %%mm3, %%mm4\n"        /* mask mean for 2nd texture line */          \
  "por %%mm4, %%mm2\n"         /* join texture and mean */                   \
  FETCH_MMX_WIDEN                                                            \
  FETCH_MMX_STORE(x, y)                                                      \
  "addl %2, %0\n"              /* move one texture line down */              \
  "addl %2, %0\n"              /* move one texture line down */

/* diff: one row of (input - ref), widened to words with signed saturation
   and stored at byte offset 0x<x>0 from %1; %0/%3 are the input/reference
   pointers, %2/%4 their pitches.  mm7 must be zero. */
#define DIFF_STEP(x)                                                         \
  "movq (%0), %%mm0\n"                                                       \
  "movq (%3), %%mm2\n"                                                       \
  "movq %%mm0, %%mm1\n"                                                      \
  "movq %%mm2, %%mm3\n"                                                      \
  "punpcklbw %%mm7, %%mm0\n"                                                 \
  "punpckhbw %%mm7, %%mm1\n"                                                 \
  "punpcklbw %%mm7, %%mm2\n"                                                 \
  "punpckhbw %%mm7, %%mm3\n"                                                 \
  "psubsw %%mm2, %%mm0\n"                                                    \
  "psubsw %%mm3, %%mm1\n"                                                    \
  "movq %%mm0, 0x" #x "0(%1)\n"                                              \
  "movq %%mm1, 0x" #x "8(%1)\n"                                              \
  "addl %2, %0\n"                                                            \
  "addl %4, %3\n"

/*
 * prefetch_withoutmask: copy an 8x8 block of bytes to 16-bit words.
 *
 *   input  - first byte of the block; rows are `pitch' bytes apart
 *   output - receives the 64 zero-extended values, row after row
 *   mask   - unused
 *   pitch  - distance in bytes between two input rows
 */
static void inline prefetch_withoutmask(unsigned char *input,
					dct_t *output,
					unsigned char *mask, /* unused */
					int pitch)
{
  int dummy; /* receives the advanced input pointer, value not used */

  asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = zero */
		FETCH_MMX_PLAIN_STEP(0, 1)
		FETCH_MMX_PLAIN_STEP(2, 3)
		FETCH_MMX_PLAIN_STEP(4, 5)
		FETCH_MMX_PLAIN_STEP(6, 7)
		: "=r"(dummy), "=r"(output), "=r"(pitch)
		: "0"(input), "1"(output), "2"(pitch)
		: "memory");
}

/*
 * prefetch_Y_withmask: fetch an 8x8 block, padding the pixels that are
 * masked out with the mean of the visible ones.
 *
 *   input  - first byte of the block; rows are `pitch' bytes apart
 *   output - receives the 64 values as 16-bit words, row after row
 *   mask   - one byte per pixel, same layout and pitch as `input'
 *   pitch  - distance in bytes between two rows of input and of mask
 *
 * A pixel counts as visible when its mask byte is greater than zero as a
 * signed byte (pcmpgtb).  If no pixel is visible the padding value is 0.
 * NOTE(review): mask bytes 0x80..0xFF are therefore treated as NOT
 * visible here, whereas prefetch_C_withmask treats any non-zero byte as
 * visible -- confirm which mask values callers actually use.
 */
static void inline prefetch_Y_withmask(unsigned char *input,
				       dct_t *output,
				       unsigned char *mask,
				       int pitch)
{
  int dummy1, dummy2; /* receive advanced pointers, values not used */
  int mean, count;

  /* compute mean of visible pixels: counts go to mm6, sums to mm5 */
  asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = zero */
		"pxor %%mm6, %%mm6\n" /* mm6 = mask accumulator */
		"pxor %%mm5, %%mm5\n" /* mm5 = texture accumulator */
		FETCH_MMX_Y_SUM_ROWS  /* lines 1 and 2 */
		FETCH_MMX_Y_SUM_NEXT
		FETCH_MMX_Y_SUM_ROWS  /* lines 3 and 4 */
		FETCH_MMX_Y_SUM_NEXT
		FETCH_MMX_Y_SUM_ROWS  /* lines 5 and 6 */
		FETCH_MMX_Y_SUM_NEXT
		FETCH_MMX_Y_SUM_ROWS  /* lines 7 and 8 */
		: "=r"(dummy1), "=r"(pitch), "=r"(dummy2)
		: "0"(input), "1"(pitch), "2"(mask)
		: "memory");

  /* line accumulation; reads mm5/mm6 left by the statement above */
  asm volatile (FETCH_MMX_REDUCE
		: "=a" (count), "=c" (mean) );

  /* at this point `mean' is the sum of the visible pixels */
  if(count)
    mean /= count;

  /* replicate mean for padding */
  asm volatile (FETCH_MMX_SPLAT_MEAN
		: : "c" (mean));

  /* fetch and fill empty pixels with mean value (mm6 = mean, mm7 = zero) */
  asm volatile (PREFETCH_Y_MASK_STEP(0, 1)
		PREFETCH_Y_MASK_STEP(2, 3)
		PREFETCH_Y_MASK_STEP(4, 5)
		PREFETCH_Y_MASK_STEP(6, 7)
		: "=r"(dummy1), "=r"(output), "=r"(pitch), "=r"(dummy2)
		: "0"(input), "1"(output), "2"(pitch), "3"(mask)
		: "memory");

  /* TODO: bilinear filtering */
}

/*
 * prefetch_C_withmask: fetch an 8x8 block whose mask has twice its
 * resolution, padding the pixels that are masked out with the mean of
 * the visible ones.
 *
 *   input  - first byte of the block; rows are `pitch' bytes apart
 *   output - receives the 64 values as 16-bit words, row after row
 *   mask   - one byte per mask sample; for each block row two mask rows
 *            of 16 bytes are read, 2*pitch bytes apart
 *   pitch  - distance in bytes between two input rows
 *
 * A pixel is visible when at least one of the four mask bytes covering it
 * is non-zero.  If no pixel is visible the padding value is 0.
 */
static void inline prefetch_C_withmask(unsigned char *input,
				       dct_t *output,
				       unsigned char *mask,
				       int pitch)
{
  int dummy1, dummy2; /* receive advanced pointers, values not used */
  int mean, count;

  /* compute mean of visible pixels: counts go to mm6, sums to mm5.
     All four passes must apply exactly the same mask construction as the
     fill stage below, otherwise the mean is taken over the wrong pixels. */
  asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = zero */
		"pxor %%mm6, %%mm6\n" /* mm6 = mask accumulator */
		"pxor %%mm5, %%mm5\n" /* mm5 = texture accumulator */
		FETCH_MMX_C_SUM_STEP  /* lines 1 and 2 */
		FETCH_MMX_C_SUM_STEP  /* lines 3 and 4 */
		FETCH_MMX_C_SUM_STEP  /* lines 5 and 6 */
		FETCH_MMX_C_SUM_STEP  /* lines 7 and 8 */
		: "=r"(dummy1), "=r"(pitch), "=r"(dummy2)
		: "0"(input), "1"(pitch), "2"(mask)
		: "memory");

  /* line accumulation; reads mm5/mm6 left by the statement above */
  asm volatile (FETCH_MMX_REDUCE
		: "=a" (count), "=c" (mean) );

  /* at this point `mean' is the sum of the visible pixels */
  if(count)
    mean /= count;

  /* replicate mean for padding */
  asm volatile (FETCH_MMX_SPLAT_MEAN
		: : "c" (mean));

  /* fetch and fill empty pixels with mean value (mm6 = mean, mm7 = zero) */
  asm volatile (FETCH_MMX_C_FILL_STEP(0, 1)
		FETCH_MMX_C_FILL_STEP(2, 3)
		FETCH_MMX_C_FILL_STEP(4, 5)
		FETCH_MMX_C_FILL_STEP(6, 7)
		: "=r"(dummy1), "=r"(output), "=r"(pitch), "=r"(dummy2)
		: "0"(input), "1"(output), "2"(pitch), "3"(mask)
		: "memory");

  /* TODO: bilinear filtering */
}

/*
 * diff: compute the 8x8 difference block input - ref.
 *
 *   input  - first byte of the current block; rows `ipitch' bytes apart
 *   ref    - first byte of the reference block; rows `rpitch' bytes apart
 *   output - receives the 64 signed 16-bit differences, row after row
 *   ipitch - distance in bytes between two input rows
 *   rpitch - distance in bytes between two reference rows
 */
static void inline diff(unsigned char *input,
			unsigned char *ref,
			dct_t *output,
			int ipitch,
			int rpitch)
{
  int dummy1, dummy2; /* receive advanced pointers, values not used */

  /* simple diff */
  asm volatile ("pxor %%mm7, %%mm7\n" /* mm7 = zero */
		DIFF_STEP(0)
		DIFF_STEP(1)
		DIFF_STEP(2)
		DIFF_STEP(3)
		DIFF_STEP(4)
		DIFF_STEP(5)
		DIFF_STEP(6)
		DIFF_STEP(7)
		: "=r"(dummy1), "=r"(output), "=r"(ipitch),
		  "=r"(dummy2), "=r"(rpitch)
		: "0"(input), "1"(output), "2"(ipitch),
		  "3"(ref), "4"(rpitch)
		: "memory");
}

/* The fragments below are private to this file. */
#undef FETCH_MMX_C_FILL_STEP
#undef FETCH_MMX_C_SUM_STEP
#undef FETCH_MMX_C_MASK_ROW
#undef FETCH_MMX_SPLAT_MEAN
#undef FETCH_MMX_REDUCE
#undef FETCH_MMX_Y_SUM_NEXT
#undef FETCH_MMX_Y_SUM_ROWS
#undef FETCH_MMX_MASKED_SUM
#undef FETCH_MMX_PLAIN_STEP
#undef FETCH_MMX_STORE
#undef FETCH_MMX_WIDEN