/* This optimized memcpy requires gcc 3.x to be installed (needs xmmintrin.h) */
/*#define _INTEL */
#if defined (_INTEL)
#include <xmmintrin.h>
#else
#include "xmmintrin.h"
#endif
#include <stdio.h>
/*construct a function that can do well on most bufferalignment */
#define LONGMSGSIZE (2.5*131072) /* Long message size */
/*#define BLOCKSIZE 131072 */
#define BLOCKSIZE 131072 /* Needs to be divisible by 16 */
#define PAGESIZE 4096
#define NUMPERPAGE 512 /* Number of elements fit in a page */
#define ALIGNMENT 16
/* #define P4 */
#if defined(P4)
#define CACHELINE 128 /* on Pentimum 4 */
#else
#define CACHELINE 32 /* on Pentimum 3 */
#endif
#define NTCOPY 1 /* Use nontemporal copy */
#define WACOPY 2 /* Write allocate copy */
#define CBCOPY 3 /*
* mixed copy, small message use
* write allocate copy, and long
* message use nontemporal copy
*/
#define COPY_TYPE CBCOPY
#define small_memcpy(dst,src,n) \
{ register unsigned long int dummy; \
asm volatile ( \
"rep; movsb\n\t" \
:"=&D"(dst), "=&S"(src), "=&c"(dummy) \
:"0" (dst), "1" (src),"2" (n) \
: "memory"); }
extern int myproc;
void ntcopy(void *dst, const void *src, int size);
void memcpy_8(void *destination, const void *source, int nbytes);
void memcpy_16(void *destination, const void *source, int nbytes);
void MP_memcpy(void *dst, const void *src, int nbytes);
int intlog2(int i)
{
float x = i;
return (*(int*)&x >> 23) - 127;
}
/*
* This function optimize the memory copy if number of bytes
* to transfer is not equal to 8
*/
void memcpy_8(void *destination, const void *source, int nbytes)
{
int nb_b4, nb_after;
char *dest = (char *)destination, *src = (char *) source;
nb_b4 = 8 - ((long int)src % 8);
if( nb_b4 != 8 && nb_b4 <= nbytes) { /*
* Copy up to an 8-byte boundary first
* considering that nbytes can be less
* than nb_b4
*/
memcpy( dest, src, nb_b4 );
src += nb_b4;
dest += nb_b4;
nbytes -= nb_b4;
}
nb_after = nbytes % 8;
nbytes -= nb_after;
if( nbytes > 0 ) { /* Copy the main data */
memcpy( dest, src, nbytes );
}
if( nb_after > 0 ) { /* Copy the last few bytes */
src += nbytes;
dest += nbytes;
memcpy( dest, src, nb_after );
}
}
void memcpy_16(void *destination, const void *source, int nbytes)
{
int nb_b4, nb_after;
char *dest = (char *)destination, *src = (char *)source;
nb_b4 = 16 - ((int) dest % 16);
if (nb_b4 != 16 && nb_b4 <= nbytes)
{
memcpy(dest, src, nb_b4);
src += nb_b4;
dest += nb_b4;
nbytes -= nb_b4;
}
/*memcpy(dest, src, nbytes); */
nb_after = nbytes % 16;
nbytes -= nb_after;
if ( nbytes > 0) {
memcpy(dest, src, nbytes);
}
if( nb_after > 0 ) {
src += nbytes;
dest += nbytes;
memcpy( dest, src, nb_after );
}
}
//#if defined(_INTEL)
void ntcopy(void *dst, const void *src, int size)
{
int ii, jj, kk, N, delta, LEFT, blocksize, size1;
double *a, *b;
double temp;
/* copy the first few bytes to make dest divisible by 8 */
if (size <= ALIGNMENT)
{
memcpy(dst, (void *)src, size);
return;
}
delta = ((int)dst) & (ALIGNMENT - 1);
if (delta != 0)
{
delta = ALIGNMENT - delta;
size -= delta;
memcpy(dst, (void *)src, delta);
}
a = (double *)(src + delta);
b = (double *)(dst + delta);
N = 2 * (size / 16); /* number of doubles */
LEFT = size % 16;
blocksize = N;
if (blocksize > BLOCKSIZE / 8)
blocksize = BLOCKSIZE / 8;
for (X3;;èi)
{
if (N < blocksize) blocksize = N;
_mm_prefetch((char*)&a[0], _MM_HINT_NTA);
/* prefetch a block of size blocksize */
for (jj = 0; jj < blocksize; jj += NUMPERPAGE)
{
/* prefetch one page of memory */
if (jj + NUMPERPAGE < blocksize )
{
temp = a[jj + NUMPERPAGE]; /* TLB priming */
}
for (kk = jj + 16; kk < jj + NUMPERPAGE && kk < blocksize; kk += 16) {
_mm_prefetch((char*)&a[kk], _MM_HINT_NTA);
}
}
if ( ((int) a) & (ALIGNMENT - 1) )
{
size1 = blocksize - blocksize % 16;
for (kk = 0; kk < size1; kk += 16)
{
/* copy one cacheline (128 bytes) */
_mm_stream_ps((float*)&b[kk],
_mm_loadu_ps((float*)&a[kk]));
_mm_stream_ps((float*)&b[kk+2],
_mm_loadu_ps((float*)&a[kk+2]));
_mm_stream_ps((float*)&b[kk+4],
_mm_loadu_ps((float*)&a[kk+4]));
_mm_stream_ps((float*)&b[kk+6],
_mm_loadu_ps((float*)&a[kk+6]));
_mm_stream_ps((float*)&b[kk+8],
_mm_loadu_ps((float*)&a[kk+8]));
_mm_stream_ps((float*)&b[kk+10],
_mm_loadu_ps((float*)&a[kk+10]));
_mm_stream_ps((float*)&b[kk+12],
_mm_loadu_ps((float*)&a[kk+12]));
_mm_stream_ps((float*)&b[kk+14],
_mm_loadu_ps((float*)&a[kk+14]));
}
for (kk = size1; kk < blocksize; kk += 2)
{
_mm_stream_ps((float*)&b[kk],
_mm_loadu_ps((float*)&a[kk]));
}
}
else
{
size1 = blocksize - blocksize % 16;
for (kk = 0; kk < size1; kk+=16)
{
_mm_stream_ps((float*)&b[kk],
_mm_load_ps((float*)&a[kk]));
_mm_stream_ps((float*)&b[kk+2],
_mm_load_ps((float*)&a[kk+2]));
_mm_stream_ps((float*)&b[kk+4],
_mm_load_ps((float*)&a[kk+4]));
_mm_stream_ps((float*)&b[kk+6],
_mm_load_ps((float*)&a[kk+6]));
_mm_stream_ps((float*)&b[kk+8],
_mm_load_ps((float*)&a[kk+8]));
_mm_stream_ps((float*)&b[kk+10],
_mm_load_ps((float*)&a[kk+10]));
_mm_stream_ps((float*)&b[kk+12],
_mm_load_ps((float*)&a[kk+12]));
_mm_stream_ps((float*)&b[kk+14],
_mm_load_ps((float*)&a[kk+14]));
}
for (kk = size1; kk < blocksize; kk += 2)
{
_mm_stream_ps((float*)&b[kk],
_mm_load_ps((float*)&a[kk]));
}
}
/* finished copying one block */
a = a + blocksize;
b = b + blocksize;
}
_mm_sfence();
if (LEFT > 0)
{
memcpy((char*)b, (char *)a, LEFT);
}
}
//#endif
void MP_memcpy(void *dst, const void *src, int nbytes)
{
#if COPY_TYPE == WACOPY
memcpy_16(dst, (void *)src, nbytes);
#elif COPY_TYPE == NTCOPY
ntcopy(dst, src, nbytes);
#elif COPY_TYPE == CBCOPY
if (nbytes > LONGMSGSIZE)
ntcopy(dst, src, nbytes);
else
memcpy_16(dst, src, nbytes);
#endif
}
syntax highlighted by Code2HTML, v. 0.9.1