#ifdef TERRAIN
/*!
 * \file
 * \ingroup	display_utils
 * \brief	Normal map calculation using SSE; uses SSE2 if USE_SSE2 is defined
 *		and SSE3 if USE_SSE3 is defined.
 */
#ifndef NORMALS_SSE_H
#define NORMALS_SSE_H

#ifdef __cplusplus
extern "C"
{
#endif

#include "simd.h"
/* Pull in the highest intrinsics header the build asks for. */
#ifdef	USE_SSE3
#include <pmmintrin.h>
#else
#ifdef	USE_SSE2
#include <emmintrin.h>
#else
#include <xmmintrin.h>
#endif
#endif

/*!
 * \ingroup	display_utils
 * \brief	Calculates the normal map of the float terrain height map.
 *
 *	Works in two passes. The first pass walks the height map four samples at a
 *	time and stores two normalised surface normals per sample into a temporary
 *	buffer that is padded with zero vectors around its border. The second pass
 *	sums six neighbouring surface normals for every output element, normalises
 *	the sum with a reciprocal square root estimate and remaps it with
 *	(n + 1) * 0.5 before storing it in the normal map. The normal map is then
 *	handed to build_normal_texures and released.
 *
 *	SSE3 horizontal adds are used if USE_SSE3 is defined, SSE2 integer shuffles
 *	if USE_SSE2 is defined, and the second pass handles four outputs per
 *	iteration if UNROLL4 is defined.
 *
 *	Use this function with care because it frees h_map_f (also on the early
 *	return paths described below).
 *
 *	If h_map_f is a null pointer the function returns immediately. If size_x or
 *	size_y is zero, or if a temporary buffer can not be allocated, h_map_f is
 *	freed and the function returns without calling build_normal_texures.
 *
 * \param	h_map_f The float terrain height map, size_x * size_y floats. It is
 *		read with aligned 16 byte loads and released with _mm_free.
 * \param	size_x The size of the terrain height map in x direction. Rows are read
 *		four floats at a time, so this must be a multiple of four.
 * \param	size_y The size of the terrain height map in y direction.
 *
 * \callgraph
 */
static __inline__ void calc_normal_map_float_sse(float* h_map_f, const unsigned int size_x,
	const unsigned int size_y)
{
	unsigned int i, j, n_index, n_row;
	__m128 h0, h1, h2, h3, h4, h5;
	__m128 t1, t2, t3, t4, t5, t6;
	__m128 v1_x, v1_y, v2_x, v2_y;
	__m128 v1, v2, v3, v4, v5, v6, v7, v8;
#ifdef	UNROLL4
	__m128 v9, v10, v11, v12, v13, v14, v15, v16, v17, v18;
#endif
	__m128 half, zero, one;
	VECTOR4* surface_normals;
	VECTOR4* normal_map;

	if (!h_map_f)
	{
		return;
	}
	/* size_y == 0 would make the unsigned bound size_y-1 below wrap around. */
	if ((size_x == 0) || (size_y == 0))
	{
		_mm_free(h_map_f);
		return;
	}

	/* Two surface normals per sample, plus a one element border. */
	surface_normals = (VECTOR4*)_mm_malloc((size_x+1)*(size_y+1)*2*sizeof(__m128), sizeof(__m128));
	normal_map = (VECTOR4*)_mm_malloc(size_x*size_y*sizeof(__m128), sizeof(__m128));
	if (!surface_normals || !normal_map)
	{
		if (surface_normals)
		{
			_mm_free(surface_normals);
		}
		if (normal_map)
		{
			_mm_free(normal_map);
		}
		/* Keep the ownership contract: the height map is always released. */
		_mm_free(h_map_f);
		return;
	}

	one = _mm_set1_ps(1.0f);
	zero = _mm_set1_ps(0.0f);
	half = _mm_set1_ps(0.5f);

	/* First border row of the surface normal buffer: all zero. */
	n_index = 0;
	for (i = 0; i < size_x+1; i++)
	{
		_mm_stream_ps(surface_normals[n_index+0], zero);
		_mm_stream_ps(surface_normals[n_index+1], zero);
		n_index += 2;
	}

	for (i = 0; i < size_y-1; i++)
	{
		/* Leading border pair of this row. */
		_mm_stream_ps(surface_normals[n_index+0], zero);
		_mm_stream_ps(surface_normals[n_index+1], zero);
		n_index += 2;

		h1 = _mm_load_ps(&h_map_f[i*size_x]);
		h3 = _mm_load_ps(&h_map_f[(i+1)*size_x]);
		for (j = 4; j < size_x; j+=4)
		{
			/* h0/h2: four heights of row i / row i+1, h1/h3: the following four. */
			h0 = h1;
			h1 = _mm_load_ps(&h_map_f[i*size_x+j]);
			h2 = h3;
			h3 = _mm_load_ps(&h_map_f[(i+1)*size_x+j]);

			/* h4/h5: h0/h2 shifted by one sample, last lane taken from h1/h3. */
			h4 = _mm_move_ss(h0, h1);
			h5 = _mm_move_ss(h2, h3);
			h4 = _mm_shuffle_ps(h4, h4, _MM_SHUFFLE(0, 3, 2, 1));
			h5 = _mm_shuffle_ps(h5, h5, _MM_SHUFFLE(0, 3, 2, 1));

			v1_x = _mm_sub_ps(h0, h2);
			v1_y = _mm_sub_ps(h0, h4);
			v2_x = _mm_sub_ps(h4, h5);
			v2_y = _mm_sub_ps(h2, h5);

			/* Squared length of (x, y, 1) for both normals. */
			t1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v1_x, v1_x), _mm_mul_ps(v1_y, v1_y)), one);
			t2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v2_x, v2_x), _mm_mul_ps(v2_y, v2_y)), one);

			t1 = _mm_rsqrt_ps(t1);	// t1 = [1/sqrt(t1.x), 1/sqrt(t1.y), 1/sqrt(t1.z), 1/sqrt(t1.w)]
			t2 = _mm_rsqrt_ps(t2);	// t2 = [1/sqrt(t2.x), 1/sqrt(t2.y), 1/sqrt(t2.z), 1/sqrt(t2.w)]

			v1_x = _mm_mul_ps(v1_x, t1);
			v1_y = _mm_mul_ps(v1_y, t1);
			v2_x = _mm_mul_ps(v2_x, t2);
			v2_y = _mm_mul_ps(v2_y, t2);

			/* Transpose to one {x, y, z, 0} vector per normal; z is 1 * rsqrt. */
			t3 = _mm_unpacklo_ps(t1, zero);		// t3 = {t1.x, 0.0f, t1.y, 0.0f}
			t4 = _mm_unpackhi_ps(t1, zero);		// t4 = {t1.z, 0.0f, t1.w, 0.0f}
			t5 = _mm_unpacklo_ps(t2, zero);		// t5 = {t2.x, 0.0f, t2.y, 0.0f}
			t6 = _mm_unpackhi_ps(t2, zero);		// t6 = {t2.z, 0.0f, t2.w, 0.0f}

			t1 = _mm_unpacklo_ps(v1_x, v1_y);	// t1 = {v1_x.0, v1_y.0, v1_x.1, v1_y.1}
			t2 = _mm_unpackhi_ps(v1_x, v1_y);	// t2 = {v1_x.2, v1_y.2, v1_x.3, v1_y.3}
			v1 = _mm_movelh_ps(t1, t3);		// v1 = {v1_x.0, v1_y.0, t1.x, 0.0f}
			v2 = _mm_movehl_ps(t3, t1);		// v2 = {v1_x.1, v1_y.1, t1.y, 0.0f}
			v3 = _mm_movelh_ps(t2, t4);		// v3 = {v1_x.2, v1_y.2, t1.z, 0.0f}
			v4 = _mm_movehl_ps(t4, t2);		// v4 = {v1_x.3, v1_y.3, t1.w, 0.0f}

			t1 = _mm_unpacklo_ps(v2_x, v2_y);	// t1 = {v2_x.0, v2_y.0, v2_x.1, v2_y.1}
			t2 = _mm_unpackhi_ps(v2_x, v2_y);	// t2 = {v2_x.2, v2_y.2, v2_x.3, v2_y.3}
			v5 = _mm_movelh_ps(t1, t5);		// v5 = {v2_x.0, v2_y.0, t2.x, 0.0f}
			v6 = _mm_movehl_ps(t5, t1);		// v6 = {v2_x.1, v2_y.1, t2.y, 0.0f}
			v7 = _mm_movelh_ps(t2, t6);		// v7 = {v2_x.2, v2_y.2, t2.z, 0.0f}
			v8 = _mm_movehl_ps(t6, t2);		// v8 = {v2_x.3, v2_y.3, t2.w, 0.0f}

			/* The two normals of a sample are stored next to each other. */
			_mm_stream_ps(surface_normals[n_index+0], v1);
			_mm_stream_ps(surface_normals[n_index+2], v2);
			_mm_stream_ps(surface_normals[n_index+4], v3);
			_mm_stream_ps(surface_normals[n_index+6], v4);
			_mm_stream_ps(surface_normals[n_index+1], v5);
			_mm_stream_ps(surface_normals[n_index+3], v6);
			_mm_stream_ps(surface_normals[n_index+5], v7);
			_mm_stream_ps(surface_normals[n_index+7], v8);
			n_index += 8;
		}

		/*
		 * Last four samples of the row: there is no following block, so the
		 * shifted vectors wrap around and the fourth lane is not usable. Only
		 * three normal pairs are computed, the fourth pair is written as zero.
		 */
		h0 = h1;
		h2 = h3;
		h4 = _mm_shuffle_ps(h0, h0, _MM_SHUFFLE(0, 3, 2, 1));
		h5 = _mm_shuffle_ps(h2, h2, _MM_SHUFFLE(0, 3, 2, 1));

		v1_x = _mm_sub_ps(h0, h2);
		v1_y = _mm_sub_ps(h0, h4);
		v2_x = _mm_sub_ps(h4, h5);
		v2_y = _mm_sub_ps(h2, h5);

		t1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v1_x, v1_x), _mm_mul_ps(v1_y, v1_y)), one);
		t2 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(v2_x, v2_x), _mm_mul_ps(v2_y, v2_y)), one);

		t1 = _mm_rsqrt_ps(t1);
		t2 = _mm_rsqrt_ps(t2);

		v1_x = _mm_mul_ps(v1_x, t1);
		v1_y = _mm_mul_ps(v1_y, t1);
		v2_x = _mm_mul_ps(v2_x, t2);
		v2_y = _mm_mul_ps(v2_y, t2);

		t3 = _mm_unpacklo_ps(t1, zero);		// t3 = {t1.x, 0.0f, t1.y, 0.0f}
		t4 = _mm_unpackhi_ps(t1, zero);		// t4 = {t1.z, 0.0f, t1.w, 0.0f}
		t5 = _mm_unpacklo_ps(t2, zero);		// t5 = {t2.x, 0.0f, t2.y, 0.0f}
		t6 = _mm_unpackhi_ps(t2, zero);		// t6 = {t2.z, 0.0f, t2.w, 0.0f}

		t1 = _mm_unpacklo_ps(v1_x, v1_y);	// t1 = {v1_x.0, v1_y.0, v1_x.1, v1_y.1}
		t2 = _mm_unpackhi_ps(v1_x, v1_y);	// t2 = {v1_x.2, v1_y.2, v1_x.3, v1_y.3}
		v1 = _mm_movelh_ps(t1, t3);		// v1 = {v1_x.0, v1_y.0, t1.x, 0.0f}
		v2 = _mm_movehl_ps(t3, t1);		// v2 = {v1_x.1, v1_y.1, t1.y, 0.0f}
		v3 = _mm_movelh_ps(t2, t4);		// v3 = {v1_x.2, v1_y.2, t1.z, 0.0f}

		t1 = _mm_unpacklo_ps(v2_x, v2_y);	// t1 = {v2_x.0, v2_y.0, v2_x.1, v2_y.1}
		t2 = _mm_unpackhi_ps(v2_x, v2_y);	// t2 = {v2_x.2, v2_y.2, v2_x.3, v2_y.3}
		v5 = _mm_movelh_ps(t1, t5);		// v5 = {v2_x.0, v2_y.0, t2.x, 0.0f}
		v6 = _mm_movehl_ps(t5, t1);		// v6 = {v2_x.1, v2_y.1, t2.y, 0.0f}
		v7 = _mm_movelh_ps(t2, t6);		// v7 = {v2_x.2, v2_y.2, t2.z, 0.0f}

		_mm_stream_ps(surface_normals[n_index+0], v1);
		_mm_stream_ps(surface_normals[n_index+2], v2);
		_mm_stream_ps(surface_normals[n_index+4], v3);
		_mm_stream_ps(surface_normals[n_index+1], v5);
		_mm_stream_ps(surface_normals[n_index+3], v6);
		_mm_stream_ps(surface_normals[n_index+5], v7);
		_mm_stream_ps(surface_normals[n_index+6], zero);
		_mm_stream_ps(surface_normals[n_index+7], zero);
		n_index += 8;
	}

	/* Last border row of the surface normal buffer: all zero. */
	for (i = 0; i < size_x+1; i++)
	{
		_mm_stream_ps(surface_normals[n_index+0], zero);
		_mm_stream_ps(surface_normals[n_index+1], zero);
		n_index += 2;
	}

	/* The height map is not needed any more. */
	_mm_free(h_map_f);

	/*
	 * Second pass. n_row is the offset from an element of the current window
	 * to the matching element of the window in the next buffer row.
	 */
	n_row = ((size_x+1)*2)-1;
	n_index = 1;
	for (i = 0; i < size_y; i++)
	{
#ifdef	UNROLL4
		/* v9/v18 carry the last loaded vector of each row into the next step. */
		v9 = _mm_load_ps(surface_normals[n_index]);
		v18 = _mm_load_ps(surface_normals[n_index+n_row]);
		for (j = 0; j < size_x; j += 4)
		{
			v1 = v9;
			v2 = _mm_load_ps(surface_normals[n_index+1]);
			v3 = _mm_load_ps(surface_normals[n_index+2]);
			v4 = _mm_load_ps(surface_normals[n_index+3]);
			v5 = _mm_load_ps(surface_normals[n_index+4]);
			v6 = _mm_load_ps(surface_normals[n_index+5]);
			v7 = _mm_load_ps(surface_normals[n_index+6]);
			v8 = _mm_load_ps(surface_normals[n_index+7]);
			v9 = _mm_load_ps(surface_normals[n_index+8]);
			v10 = v18;
			v11 = _mm_load_ps(surface_normals[n_index+n_row+1]);
			v12 = _mm_load_ps(surface_normals[n_index+n_row+2]);
			v13 = _mm_load_ps(surface_normals[n_index+n_row+3]);
			v14 = _mm_load_ps(surface_normals[n_index+n_row+4]);
			v15 = _mm_load_ps(surface_normals[n_index+n_row+5]);
			v16 = _mm_load_ps(surface_normals[n_index+n_row+6]);
			v17 = _mm_load_ps(surface_normals[n_index+n_row+7]);
			v18 = _mm_load_ps(surface_normals[n_index+n_row+8]);

			/* Each output sums three vectors of this row and three of the next. */
			t1 = _mm_add_ps(v1, v2);
			t1 = _mm_add_ps(t1, v3);
			t1 = _mm_add_ps(t1, v10);
			t1 = _mm_add_ps(t1, v11);
			t1 = _mm_add_ps(t1, v12);

			t2 = _mm_add_ps(v3, v4);
			t2 = _mm_add_ps(t2, v5);
			t2 = _mm_add_ps(t2, v12);
			t2 = _mm_add_ps(t2, v13);
			t2 = _mm_add_ps(t2, v14);

			t3 = _mm_add_ps(v5, v6);
			t3 = _mm_add_ps(t3, v7);
			t3 = _mm_add_ps(t3, v14);
			t3 = _mm_add_ps(t3, v15);
			t3 = _mm_add_ps(t3, v16);

			t4 = _mm_add_ps(v7, v8);
			t4 = _mm_add_ps(t4, v9);
			t4 = _mm_add_ps(t4, v16);
			t4 = _mm_add_ps(t4, v17);
			t4 = _mm_add_ps(t4, v18);

			h1 = _mm_mul_ps(t1, t1);
			h2 = _mm_mul_ps(t2, t2);
			h3 = _mm_mul_ps(t3, t3);
			h4 = _mm_mul_ps(t4, t4);

			/* Reduce to h1 = {|t1|^2, |t2|^2, |t3|^2, |t4|^2}. */
#ifdef	USE_SSE3
			v1 = _mm_hadd_ps(h1, h2);
			v2 = _mm_hadd_ps(h3, h4);
			h1 = _mm_hadd_ps(v1, v2);
#else
			v1 = _mm_unpacklo_ps(h1, h2);		// v1 = {h1.x, h2.x, h1.y, h2.y}
			v2 = _mm_unpackhi_ps(h1, h2);		// v2 = {h1.z, h2.z, h1.w, h2.w}
			v3 = _mm_unpacklo_ps(h3, h4);		// v3 = {h3.x, h4.x, h3.y, h4.y}
			v4 = _mm_unpackhi_ps(h3, h4);		// v4 = {h3.z, h4.z, h3.w, h4.w}
			h1 = _mm_movelh_ps(v1, v3);
			h2 = _mm_movehl_ps(v3, v1);
			h3 = _mm_movelh_ps(v2, v4);
			h4 = _mm_movehl_ps(v4, v2);
			h1 = _mm_add_ps(h1, h2);
			h1 = _mm_add_ps(h1, h3);
			h1 = _mm_add_ps(h1, h4);
#endif
			h5 = _mm_rsqrt_ps(h1);	// h5 = [1/sqrt(h1.x), 1/sqrt(h1.y), 1/sqrt(h1.z), 1/sqrt(h1.w)]

			/* Broadcast each scale factor to all four lanes. */
#ifdef	USE_SSE2
			h1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(h5), _MM_SHUFFLE(0, 0, 0, 0)));
			h2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(h5), _MM_SHUFFLE(1, 1, 1, 1)));
			h3 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(h5), _MM_SHUFFLE(2, 2, 2, 2)));
			h4 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(h5), _MM_SHUFFLE(3, 3, 3, 3)));
#else
			h1 = _mm_shuffle_ps(h5, h5, _MM_SHUFFLE(0, 0, 0, 0));
			h2 = _mm_shuffle_ps(h5, h5, _MM_SHUFFLE(1, 1, 1, 1));
			h3 = _mm_shuffle_ps(h5, h5, _MM_SHUFFLE(2, 2, 2, 2));
			h4 = _mm_shuffle_ps(h5, h5, _MM_SHUFFLE(3, 3, 3, 3));
#endif
			t1 = _mm_mul_ps(t1, h1);
			t2 = _mm_mul_ps(t2, h2);
			t3 = _mm_mul_ps(t3, h3);
			t4 = _mm_mul_ps(t4, h4);

			/* Remap with (n + 1) * 0.5. */
			t1 = _mm_add_ps(t1, one);
			t1 = _mm_mul_ps(t1, half);
			t2 = _mm_add_ps(t2, one);
			t2 = _mm_mul_ps(t2, half);
			t3 = _mm_add_ps(t3, one);
			t3 = _mm_mul_ps(t3, half);
			t4 = _mm_add_ps(t4, one);
			t4 = _mm_mul_ps(t4, half);

			_mm_stream_ps(normal_map[i*size_x+j+0], t1);
			_mm_stream_ps(normal_map[i*size_x+j+1], t2);
			_mm_stream_ps(normal_map[i*size_x+j+2], t3);
			_mm_stream_ps(normal_map[i*size_x+j+3], t4);
			n_index += 8;
		}
		n_index += 2;
#else
		/* v3/v6 carry the last loaded vector of each row into the next step. */
		v3 = _mm_load_ps(surface_normals[n_index]);
		v6 = _mm_load_ps(surface_normals[n_index+n_row]);
		for (j = 0; j < size_x; j++)
		{
			v1 = v3;
			v2 = _mm_load_ps(surface_normals[n_index+1]);
			v3 = _mm_load_ps(surface_normals[n_index+2]);
			v4 = v6;
			v5 = _mm_load_ps(surface_normals[n_index+n_row+1]);
			v6 = _mm_load_ps(surface_normals[n_index+n_row+2]);

			/* Sum three vectors of this row and three of the next. */
			v1 = _mm_add_ps(v1, v2);
			v1 = _mm_add_ps(v1, v3);
			v1 = _mm_add_ps(v1, v4);
			v1 = _mm_add_ps(v1, v5);
			v1 = _mm_add_ps(v1, v6);

			/* Squared length of the sum, accumulated into the low lane of t1. */
			t1 = _mm_mul_ps(v1, v1);
#ifdef	USE_SSE3
			t1 = _mm_hadd_ps(t1, t1);
			t1 = _mm_hadd_ps(t1, t1);
#else
			/* Only the low lane of t2 (t1.z) is used below. */
			t2 = _mm_movehl_ps(t1, t1);
#ifdef	USE_SSE2
			t3 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(t1), _MM_SHUFFLE(1, 1, 1, 1)));
#else
			t3 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(1, 1, 1, 1));
#endif
			t1 = _mm_add_ss(t1, t2);
			t1 = _mm_add_ss(t1, t3);
#endif
			t1 = _mm_rsqrt_ss(t1);	// low lane = 1/sqrt(t1.x), upper lanes are passed through
			t1 = _mm_shuffle_ps(t1, t1, _MM_SHUFFLE(0, 0, 0, 0));	// t1 = [t1.x, t1.x, t1.x, t1.x]

			v1 = _mm_mul_ps(v1, t1);

			/* Remap with (n + 1) * 0.5. */
			v1 = _mm_add_ps(v1, one);
			v1 = _mm_mul_ps(v1, half);

			_mm_stream_ps(normal_map[i*size_x+j], v1);
			n_index += 2;
		}
		n_index += 2;
#endif
	}
	_mm_free(surface_normals);
	build_normal_texures(normal_map, size_x, size_y);
	_mm_free(normal_map);
}

/*!
 * \ingroup	display_utils
 * \brief	Calculates the normal map of the terrain height map.
 *
 *	Converts the unsigned short terrain height map to a float terrain height
 *	map, multiplies every height by h_scale and calls
 *	calc_normal_map_float_sse, which takes ownership of the float height map
 *	and frees it. SSE2 integer unpacking is used if USE_SSE2 is defined,
 *	otherwise the MMX based conversion _mm_cvtpu16_ps is used.
 *
 *	If the float height map can not be allocated the function returns without
 *	doing anything.
 *
 * \param	h_map The terrain height map. With USE_SSE2 it is read with aligned
 *		16 byte loads, so the address must be 16 byte aligned.
 * \param	size_x The size of the terrain height map in x direction. Heights are
 *		converted eight at a time in both code paths, so this should be a
 *		multiple of eight.
 * \param	size_y The size of the terrain height map in y direction.
 * \param	h_scale The scale of the terrain height (z direction).
 *
 * \callgraph
 */
static __inline__ void calc_normal_map_sse(unsigned short* h_map, unsigned int size_x,
	unsigned int size_y, float h_scale)
{
	unsigned int i, j, index;
	float* h_map_f;
	__m128 vf1, vf2, v_scale;
#ifdef	USE_SSE2
	__m128i vi0, vi1, vi2, zero;

	zero = _mm_setzero_si128();
#else
	__m64 vi1, vi2;
#endif
	v_scale = _mm_set1_ps(h_scale);

	/*
	 * NOTE(review): this reserves sizeof(__m128) bytes per height although one
	 * float is stored per height. The size is kept as it was - confirm before
	 * shrinking, the slack also absorbs a size_x that is not a multiple of eight.
	 */
	h_map_f = (float*)_mm_malloc(size_x*size_y*sizeof(__m128), sizeof(__m128));
	if (!h_map_f)
	{
		return;
	}

	index = 0;
	for (i = 0; i < size_y; i++)
	{
		for (j = 0; j < size_x; j += 8)
		{
#ifdef	USE_SSE2
			/* Zero extend eight 16 bit heights to two vectors of 32 bit integers. */
			vi0 = _mm_load_si128((__m128i*)&h_map[index]);
			vi1 = _mm_unpacklo_epi16(vi0, zero);
			vi2 = _mm_unpackhi_epi16(vi0, zero);
			vf1 = _mm_cvtepi32_ps(vi1);
			vf2 = _mm_cvtepi32_ps(vi2);
#else
			vi1 = *((__m64*)&h_map[index]);
			vi2 = *((__m64*)&h_map[index+4]);
			vf1 = _mm_cvtpu16_ps(vi1);
			vf2 = _mm_cvtpu16_ps(vi2);
#endif
			vf1 = _mm_mul_ps(vf1, v_scale);
			vf2 = _mm_mul_ps(vf2, v_scale);
			_mm_stream_ps(&h_map_f[index], vf1);
			_mm_stream_ps(&h_map_f[index+4], vf2);
			index += 8;
		}
	}
#ifndef	USE_SSE2
	/* Leave MMX state after the __m64 conversions. */
	_mm_empty();
#endif
	calc_normal_map_float_sse(h_map_f, size_x, size_y);
}

#ifdef __cplusplus
} // extern "C"
#endif

#endif
#endif