xbmc
gpu_memcpy_sse4.h
1 /*
2  * Copyright (C) 2011-2015 Hendrik Leppkes
3  * http://www.1f0.de
4  *
5  * SPDX-License-Identifier: GPL-2.0-or-later
6  * See LICENSES/README.md for more information.
7  */
8 
9 #pragma once
10 
11 #include <emmintrin.h>
12 #include <smmintrin.h>
13 
14 // gpu_memcpy is a memcpy style function that copied data very fast from a
15 // GPU tiled memory (write back)
16 // Performance tip: page offset (12 lsb) of both addresses should be different
17 // optimally use a 2K offset between them.
18 inline void* gpu_memcpy(void* d, const void* s, size_t size)
19 {
20  static const size_t regsInLoop = sizeof(size_t) * 2; // 8 or 16
21 
22  if (d == nullptr || s == nullptr) return nullptr;
23 
24  // If memory is not aligned, use memcpy
25  bool isAligned = (((size_t)(s) | (size_t)(d)) & 0xF) == 0;
26  if (!isAligned)
27  {
28  return memcpy(d, s, size);
29  }
30 
31  __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
32 #ifdef _M_X64
33  __m128i xmm8, xmm9, xmm10, xmm11, xmm12, xmm13, xmm14, xmm15;
34 #endif
35 
36  size_t reminder = size & (regsInLoop * sizeof(xmm0) - 1); // Copy 128 or 256 bytes every loop
37  size_t end = 0;
38 
39  __m128i* pTrg = (__m128i*)d;
40  __m128i* pTrgEnd = pTrg + ((size - reminder) >> 4);
41  __m128i* pSrc = (__m128i*)s;
42 
43  // Make sure source is synced - doesn't hurt if not needed.
44  _mm_sfence();
45 
46  while (pTrg < pTrgEnd)
47  {
48  // _mm_stream_load_si128 emits the Streaming SIMD Extensions 4 (SSE4.1) instruction MOVNTDQA
49  // Fastest method for copying GPU RAM. Available since Penryn (45nm Core 2 Duo/Quad)
50  xmm0 = _mm_stream_load_si128(pSrc);
51  xmm1 = _mm_stream_load_si128(pSrc + 1);
52  xmm2 = _mm_stream_load_si128(pSrc + 2);
53  xmm3 = _mm_stream_load_si128(pSrc + 3);
54  xmm4 = _mm_stream_load_si128(pSrc + 4);
55  xmm5 = _mm_stream_load_si128(pSrc + 5);
56  xmm6 = _mm_stream_load_si128(pSrc + 6);
57  xmm7 = _mm_stream_load_si128(pSrc + 7);
58 #ifdef _M_X64 // Use all 16 xmm registers
59  xmm8 = _mm_stream_load_si128(pSrc + 8);
60  xmm9 = _mm_stream_load_si128(pSrc + 9);
61  xmm10 = _mm_stream_load_si128(pSrc + 10);
62  xmm11 = _mm_stream_load_si128(pSrc + 11);
63  xmm12 = _mm_stream_load_si128(pSrc + 12);
64  xmm13 = _mm_stream_load_si128(pSrc + 13);
65  xmm14 = _mm_stream_load_si128(pSrc + 14);
66  xmm15 = _mm_stream_load_si128(pSrc + 15);
67 #endif
68  pSrc += regsInLoop;
69  // _mm_store_si128 emit the SSE2 instruction MOVDQA (aligned store)
70  _mm_store_si128(pTrg , xmm0);
71  _mm_store_si128(pTrg + 1, xmm1);
72  _mm_store_si128(pTrg + 2, xmm2);
73  _mm_store_si128(pTrg + 3, xmm3);
74  _mm_store_si128(pTrg + 4, xmm4);
75  _mm_store_si128(pTrg + 5, xmm5);
76  _mm_store_si128(pTrg + 6, xmm6);
77  _mm_store_si128(pTrg + 7, xmm7);
78 #ifdef _M_X64 // Use all 16 xmm registers
79  _mm_store_si128(pTrg + 8, xmm8);
80  _mm_store_si128(pTrg + 9, xmm9);
81  _mm_store_si128(pTrg + 10, xmm10);
82  _mm_store_si128(pTrg + 11, xmm11);
83  _mm_store_si128(pTrg + 12, xmm12);
84  _mm_store_si128(pTrg + 13, xmm13);
85  _mm_store_si128(pTrg + 14, xmm14);
86  _mm_store_si128(pTrg + 15, xmm15);
87 #endif
88  pTrg += regsInLoop;
89  }
90 
91  // Copy in 16 byte steps
92  if (reminder >= 16)
93  {
94  size = reminder;
95  reminder = size & 15;
96  end = size >> 4;
97  for (size_t i = 0; i < end; ++i)
98  {
99  pTrg[i] = _mm_stream_load_si128(pSrc + i);
100  }
101  }
102 
103  // Copy last bytes - shouldn't happen as strides are modulo 16
104  if (reminder)
105  {
106  __m128i temp = _mm_stream_load_si128(pSrc + end);
107 
108  char* ps = (char*)(&temp);
109  char* pt = (char*)(pTrg + end);
110 
111  for (size_t i = 0; i < reminder; ++i)
112  {
113  pt[i] = ps[i];
114  }
115  }
116 
117  return d;
118 }