11 #include <emmintrin.h> 12 #include <smmintrin.h> 18 inline void* gpu_memcpy(
void* d,
const void* s,
size_t size)
20 static const size_t regsInLoop =
sizeof(size_t) * 2;
22 if (d ==
nullptr || s ==
nullptr)
return nullptr;
25 bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0;
28 return memcpy(d, s, size);
31 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
33 __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
36 size_t reminder = size & (regsInLoop *
sizeof(xmm0) - 1);
39 __m128i* pTrg = (__m128i*)d;
40 __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4);
41 __m128i* pSrc = (__m128i*)s;
46 while (pTrg < pTrgEnd)
50 xmm0 = _mm_stream_load_si128(pSrc);
51 xmm1 = _mm_stream_load_si128(pSrc + 1);
52 xmm2 = _mm_stream_load_si128(pSrc + 2);
53 xmm3 = _mm_stream_load_si128(pSrc + 3);
54 xmm4 = _mm_stream_load_si128(pSrc + 4);
55 xmm5 = _mm_stream_load_si128(pSrc + 5);
56 xmm6 = _mm_stream_load_si128(pSrc + 6);
57 xmm7 = _mm_stream_load_si128(pSrc + 7);
58 #ifdef _M_X64 // Use all 16 xmm registers 59 xmm8 = _mm_stream_load_si128(pSrc + 8);
60 xmm9 = _mm_stream_load_si128(pSrc + 9);
61 xmm10 = _mm_stream_load_si128(pSrc + 10);
62 xmm11 = _mm_stream_load_si128(pSrc + 11);
63 xmm12 = _mm_stream_load_si128(pSrc + 12);
64 xmm13 = _mm_stream_load_si128(pSrc + 13);
65 xmm14 = _mm_stream_load_si128(pSrc + 14);
66 xmm15 = _mm_stream_load_si128(pSrc + 15);
70 _mm_store_si128(pTrg , xmm0);
71 _mm_store_si128(pTrg + 1, xmm1);
72 _mm_store_si128(pTrg + 2, xmm2);
73 _mm_store_si128(pTrg + 3, xmm3);
74 _mm_store_si128(pTrg + 4, xmm4);
75 _mm_store_si128(pTrg + 5, xmm5);
76 _mm_store_si128(pTrg + 6, xmm6);
77 _mm_store_si128(pTrg + 7, xmm7);
78 #ifdef _M_X64 // Use all 16 xmm registers 79 _mm_store_si128(pTrg + 8, xmm8);
80 _mm_store_si128(pTrg + 9, xmm9);
81 _mm_store_si128(pTrg + 10, xmm10);
82 _mm_store_si128(pTrg + 11, xmm11);
83 _mm_store_si128(pTrg + 12, xmm12);
84 _mm_store_si128(pTrg + 13, xmm13);
85 _mm_store_si128(pTrg + 14, xmm14);
86 _mm_store_si128(pTrg + 15, xmm15);
97 for (
size_t i = 0; i < end; ++i)
99 pTrg[i] = _mm_stream_load_si128(pSrc + i);
106 __m128i temp = _mm_stream_load_si128(pSrc + end);
108 char* ps = (
char*)(&temp);
109 char* pt = (
char*)(pTrg + end);
111 for (
size_t i = 0; i < reminder; ++i)