/********************************************************

    Miscellaneous MMX-accelerated routines
    Copyright 2000 Eugene Kuznetsov (divx@euro.ru)

*********************************************************/

#include "avm_cpuinfo.h"
#include "mmx.h"
/* NOTE(review): the two directives below carry no header name in this copy
 * of the file; restore the original names before building. */
#include
#include

#if !defined WIN32 && defined ARCH_X86

/* Execute EMMS so the x87 FPU can be used again after MMX code has run. */
static inline void emms(void)
{
    __asm __volatile ("emms;": : :"memory");
}

#define emms_c() \
{ \
    emms(); \
}

/*
 * Copy `size` bytes from `from` to `to`.
 *
 * The first (size % 64) bytes are moved with "rep movsb"; the remaining
 * multiple of 64 bytes is moved in 64-byte chunks through mm0..mm7.
 * EMMS is issued before returning, whether or not the MMX loop ran.
 *
 * to    destination buffer, at least `size` bytes
 * from  source buffer, at least `size` bytes
 * size  number of bytes to copy
 */
void mmx_copy(void* to, const void* from, uint_t size)
{
    const uint_t head = size & 0x3f;   /* bytes handled by rep movsb      */
    const uint_t bulk = size - head;   /* what is left: a multiple of 64  */

    if (head)
    {
        int dummy;
        /* EDI/ESI are read-write operands: after this statement `to` and
         * `from` already point `head` bytes into their buffers. */
        asm volatile (
            "rep; movsb"
            :"=&D"(to), "=&S"(from), "=&c"(dummy)
            :"0"(to), "1"(from), "2"(head)
            : "memory"
        );
    }

    /* The chunk offset must start at 0 because the pointers were advanced
     * above (and are untouched when head == 0).  Starting it at `head`, as
     * the previous code did, skipped `head` bytes and ran `head` bytes past
     * the end of both buffers. */
    for (uint_t off = 0; off < bulk; off += 64)
    {
        asm volatile (
            ".balign 16             \n\t"
            "1:                     \n\t"
            "movq (%0, %2), %%mm0\n\t"
            "movq 8(%0, %2), %%mm1\n\t"
            "movq %%mm0, (%1, %2)\n\t"
            "movq 16(%0, %2), %%mm2\n\t"
            "movq 24(%0, %2), %%mm3\n\t"
            "movq 32(%0, %2), %%mm4\n\t"
            "movq 40(%0, %2), %%mm5\n\t"
            "movq 48(%0, %2), %%mm6\n\t"
            "movq 56(%0, %2), %%mm7\n\t"
            "movq %%mm1, 8(%1, %2)\n\t"
            "movq %%mm2, 16(%1, %2)\n\t"
            "movq %%mm3, 24(%1, %2)\n\t"
            "movq %%mm4, 32(%1, %2)\n\t"
            "movq %%mm5, 40(%1, %2)\n\t"
            "movq %%mm6, 48(%1, %2)\n\t"
            "movq %%mm7, 56(%1, %2)\n\t"
            :
            :"r"(from), "r"(to), "r"(off)
            :"memory"
        );
    }
    emms_c();
}

static void zoom_16_bpp(uint16_t* dest, const uint16_t* src, int dst_w, int dst_h, int src_w, int src_h, int xdim)
{
    int x_min = (src_w%dst_w);
    int y_min = src_h%dst_h;
    int x_accum = 0;
    int y_accum = 0;
    const uint16_t *src2=src;
    int dest_delta = (xdim!=0 ?
xdim-dst_w : 0); int x_maj = src_w/dst_w*2; int y_maj = (src_h/dst_h)*src_w; if ((x_maj>0) || (y_maj>0)) { for(int i=0; i=dst_h) { y_accum-=dst_h; src2+=src_w; } src=src2; } } else // cout << "FIXME error - Tried to use ASM code which is not shlib friendly" << endl; // has to be fixed - causes reference .rel.text // objdump --headers --private-headers -T libaviplay.so | less // if this output doesn't contain .rel.text than its OK! #if 0 // BUGGY { int dest2=0; int i=0; __asm__ __volatile__ ( "pushal \n\t" "subl $16, %%esp \n\t" "movl %5, %%ebx \n\t" // 8(%%esp) = dest_h "movl %%ebx, 8(%%esp) \n\t" "movl %4, %%ebx \n\t" // 12(%%esp) = dest_w "movl %%ebx, 12(%%esp) \n\t" "movl %5, %%ebx \n\t" // i = src_h "movl %%ebx, (%%esp) \n\t" "for_i: \n\t" "movl %0, 4(%%esp) \n\t" // 4(%%esp) = dest "movl 12(%%esp), %%edx \n\t" "movl %4, %%ecx \n\t" // j = src_w "for_j: \n\t" "movw (%1), %%bx \n\t" "x_dest_copy_again: \n\t" "movw %%bx, (%0) \n\t" // *dest = *src "addl $2, %0 \n\t" // dest++; "subl %6, %%edx \n\t" // 12(%%esp)-=x_min "jnc x_dest_copy_again \n\t" // while (12(%%esp) > 0) "no_xdestcopy: \n\t" "addl %2, %%edx \n\t" // 12(%%esp) += dest_w "addl $2, %1 \n\t" // src++; "decl %%ecx \n\t" // j--; "jnz for_j \n\t" // while (j>0) Next "y: \n\t" "movl %%edx, 12(%%esp) \n\t" "addl %8, %0 \n\t" // dest += dest_delta "movl 8(%%esp), %%edx \n\t" "push %1 \n\t" // safe src "y_dest_copy_again: \n\t" "subl %7, %%edx \n\t" // 8(%%esp) -= y_min "jc y_no_cpy_again \n\t" // while (8(%%esp)>0) "movl 8(%%esp),%%esi \n\t" "movl %2, %%ecx \n\t" "shrl $1,%%ecx \n\t" "cld \n\t" "rep; movsl \n\t" "addl %8, %0 \n\t" // dest += dest_delta "jmp y_dest_copy_again \n\t" "y_no_cpy_again: \n\t" "addl %3, %%edx \n\t" // 8(%%esp) += dest_h "movl %%edx, 12(%%esp) \n\t" "pop %1 \n\t" // get src "decl (%%esp) \n\t" "jnz for_i \n\t" "addl $16, %%esp \n\t" "popal \n\t" : : "D"(dest), "S"(src), "r"(dst_w), "r"(dst_h), "r"(src_w), "r"(src_h), "a"(x_min), "r"(y_min), "r"(dest_delta) :"memory" ); } #else { 
// slow dumb implementation which doesn't work uint16_t* pdest = dest; for (int i = 0; i < src_w && i < dst_h; i++) { for (int j = 0; j < src_h; j++) pdest[j] = *src++; pdest += dst_w; } } #endif } static void zoom_24_bpp(int* dest, const int* src, int dst_w, int dst_h, int src_w, int src_h, int xdim) { int x_maj = src_w/dst_w; int x_min = src_w%dst_w; int y_maj = (src_h/dst_h)*src_w; int y_min = src_h%dst_h; int x_accum = 0; int y_accum = 0; const int *src2=src; int dest_delta = (xdim!=0 ? 3*(xdim-dst_w) : 0); // cout << "(" << dst_w << "," << dst_h << "," << src_w << "," << src_h << ")" << endl; for (int i=0; i=dst_w) { x_accum-=dst_w; src=(const int*)((const char*)src+3); } } dest = (int*) ((char*) dest + dest_delta); src2=(const int*)((const char*)src2+y_maj*3); y_accum+=y_min; if(y_accum>=dst_h) { y_accum-=dst_h; src2=(const int*)((const char*)src2+3*src_w); } src=src2; } } static void zoom_32_bpp(int* dest, const int* src, int dst_w, int dst_h, int src_w, int src_h, int xdim) { int x_maj = src_w/dst_w; int x_min = src_w%dst_w; int y_maj = (src_h/dst_h)*src_w; int y_min = src_h%dst_h; int x_accum = 0; int y_accum = 0; const int *src2=src; int dest_delta = (xdim!=0 ? 
xdim-dst_w : 0); // cout << "(" << dst_w << "," << dst_h << "," << src_w << "," << src_h << ")" << endl; for(int i=0; i=dst_w) { x_accum-=dst_w; src++; } } dest+=dest_delta; src2+=y_maj; y_accum+=y_min; if(y_accum>=dst_h) { y_accum-=dst_h; src2+=src_w; } src=src2; } } void zoom(uint16_t* dest, const uint16_t* src, int dst_w, int dst_h, int src_w, int src_h, int bpp, int xdim) { switch(bpp) { case 15: case 16: return zoom_16_bpp(dest,src,dst_w,dst_h,src_w,src_h,xdim); case 24: return zoom_24_bpp((int*)dest,(const int*)src,dst_w,dst_h,src_w,src_h,xdim); case 32: return zoom_32_bpp((int*)dest,(const int*)src,dst_w,dst_h,src_w,src_h,xdim); } } #endif // WIN32 static void zoom_2_16_nommx(uint16_t* dest, const uint16_t* src, int w, int h) { for(int i=0; i 1) { w -= 2; unsigned q = *((uint32_t *)&s[w]); *((uint32_t *)&d[w]) = (q & 0x7FFF7FFF) + (q & 0x7FE07FE0); } if (w) { uint16_t q = s[0]; q += (q & 0x7FE0); d[0] = q; } } #if !defined WIN32 && defined ARCH_X86 /********************************************* WARNING All MMX code assumes that dest scanline sizes are multiple of 8 bytes. *********************************************/ static void v555to565_mmx(uint16_t* dest, const uint16_t* src, int w, int h) { static uint64_t line __attribute__ ((aligned(8))) = 0xFFE0FFE0FFE0FFE0LL; bool flip=(h<0); if(flip) { h=-h; src+=w*(h-1); } __asm__ __volatile__ ( "movq %0, %%mm2 \n\t" : : "m"(line) ); // dest+=w*(h-1); for(int i=0; i 565. "movq %%mm0, %%mm1 \n\t" "pand %%mm2, %%mm1 \n\t" "paddw %%mm1, %%mm0 \n\t" //Store the result. "movq %%mm0, (%1) \n\t" "addl $8, %%ecx \n\t" "cmpl %0, %2 \n\t" "ja 1b \n\t" : :"r"(src), "r"(dest), "r"(src + 2*w) :"memory" ); src+=2*w; dest+=w/2; } emms_c(); } static void zoom_2_16_mmx(uint16_t *dest, const uint16_t *src, int w, int h) { static uint64_t line2 __attribute__ ((aligned(8))) = 0x0000FFFF0000FFFFLL; __asm__ __volatile__ ( "movq %0, %%mm3 \n\t" : : "m"(line2) ); for(int i=0; i