SH4ZAM! 0.7.0
Fast math library for the Sega Dreamcast's SH4 CPU
Loading...
Searching...
No Matches
shz_mem.h
Go to the documentation of this file.
1/*! \file
2 * \brief Memory API
3 * \ingroup memory
4 *
5 * API built around copying, assigning, and working with memory.
6 *
7 * \todo
8 * - shz_macw()
9 * - shz_memset2()
10 * - shz_memset4()
11 * - shz_memset32()
12 * - shz_memset()
13 * - shz_memmoveN()
14 *
15 * \author 2025, 2026 Falco Girgis
16 * \author 2020 MoopTheHedgehog
17 *
18 * \copyright MIT License
19 */
20
21#ifndef SHZ_MEM_H
22#define SHZ_MEM_H
23
24#include "shz_cdefs.h"
25
26#include <stdbool.h>
27#include <stddef.h>
28
29/*! \defgroup memory Memory
30 \brief Routines for managing memory.
31
32 This API provides the following types of memory routines:
33 - special instruction intrinsics
34 - cache operations
35 - memcpy()-type routines
36
37 \note
38 memcpy()-like routines will typically check for
39 proper alignment and size increments of parameters using
40 assert(), so make sure to build a release build (-DNDEBUG)
41 for maximal gainz, when not debugging.
42 */
43
44SHZ_DECLS_BEGIN
45
46/*! \name C stdlib Replacements
47 \brief Routines replacing the C standard library copy/set API.
48 @{
49*/
50
51/*! Generic drop-in fast memcpy() replacement.
52
53 Copies \p bytes from \p src to \p dst, determining the most efficient
54 specialization to call into at run-time, returning \p dst.
55
56 There are no alignment or size requirements for this routine.
57
58 \note
59 When you know of and can control the \p src and \p dst alignments and
60 batch sizes, you can micro-optimize by calling into the most specific
61 memcpy() specialization for your given scenario, over just using this
62 generic implementation, which must choose which one to use at run-time.
63
64 \warning
65 \p dst and \p src buffers should not be overlapping.
66
67 \sa shz_memcpy1(), shz_memcpy2(), shz_memcpy4(), shz_memcpy8(), shz_memcpy32(),
68 shz_memcpy64(), shz_memcpy128()
69*/
70SHZ_INLINE void* shz_memcpy( void* SHZ_RESTRICT dst,
71 const void* SHZ_RESTRICT src,
72 size_t bytes) SHZ_NOEXCEPT;
73
74/*! Generic drop-in fast memmove() replacement.
75
76 Copies \p bytes from \p src to \p dst, determining the most efficient
77 specialization to call into at run-time, returning \p dst. The source and
78 destination buffers are allowed to overlap, making this routine slightly
79 less efficient, but more versatile than shz_memcpy().
80
81 There is no alignment or size requirement for this routine.
82
83 \sa shz_memcpy()
84*/
85SHZ_INLINE void* shz_memmove(void* dst, const void* src, size_t bytes) SHZ_NOEXCEPT;
86
87//! @}
88
89/*! \name Specializations
90 \brief Specialized routines for specific sizes + alignments.
91 @{
92*/
93
94/*! Copies an unaligned buffer to another one byte at a time.
95
96 The \p dst pointer is returned.
97
98 \note
99 Typically, unless you know you are copying a tiny number of
100 definitely unaligned bytes, you want to use shz_memcpy(),
101 which automatically handles arbitrary alignment for you,
102 potentially more efficiently than copying byte-by-byte.
103
104 \warning
105 \p dst and \p src buffers should not be overlapping.
106
107 \sa shz_memcpy()
108*/
109SHZ_INLINE void* shz_memcpy1( void* SHZ_RESTRICT dst,
110 const void* SHZ_RESTRICT src,
111 size_t bytes) SHZ_NOEXCEPT;
112
113/*! Copies from one 2-byte aligned buffer to another two bytes at a time.
114
115 The \p dst pointer is returned.
116
117 \warning
118 \p dst and \p src must both be aligned by at least 2 bytes, and \p bytes
119 must be a multiple of 2.
120
121 \warning
122 \p dst and \p src buffers should not be overlapping.
123*/
124SHZ_INLINE void* shz_memcpy2( void* SHZ_RESTRICT dst,
125 const void* SHZ_RESTRICT src,
126 size_t bytes) SHZ_NOEXCEPT;
127
128/*! Copies from one 4-byte aligned buffer to another 4 bytes at a time.
129
130 The \p dst buffer is returned.
131
132 \warning
133 \p dst and \p src must both be aligned by at least 4 bytes, and
134 \p bytes must be a multiple of 4.
135
136 \warning
137 \p dst and \p src buffers should not be overlapping.
138*/
139SHZ_INLINE void* shz_memcpy4( void* SHZ_RESTRICT dst,
140 const void* SHZ_RESTRICT src,
141 size_t bytes) SHZ_NOEXCEPT;
142
143/*! Copies from one 8-byte aligned buffer to another 8 bytes at a time.
144
145 The \p dst buffer is returned.
146
147 \warning
148 \p dst and \p src must both be aligned by at least 8 bytes, and
149 \p bytes must be a multiple of 8.
150
151 \warning
152 \p src and \p dst should not overlap.
153*/
154SHZ_INLINE void* shz_memcpy8( void* SHZ_RESTRICT dst,
155 const void* SHZ_RESTRICT src,
156 size_t bytes) SHZ_NOEXCEPT;
157
158/*! Assigns the given 8-byte \p value to the \p bytes in \p dst.
159
160 \warning
161 \p dst should be at least 8-byte aligned, and \p bytes should be
162 a multiple of 8!
163*/
164SHZ_INLINE void* shz_memset8(void* dst, uint64_t value, size_t bytes) SHZ_NOEXCEPT;
165
166/*! Copies \p bytes from the \p src to the \p dst buffer in 32-byte chunks.
167
168 Transfers from 8-byte aligned buffer, \p src to 32-byte aligned buffer, \p dst,
169 32 bytes at a time. Returns the \p dst address.
170
171 \warning
172 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned. \p bytes must
173 be a multiple of 32.
174
175 \warning
176 \p src and \p dst buffers must not overlap.
177
178 \note
179 This is the quickest way to move 32-byte chunks of data around *within memory*, but
180 shz_sq_memcpy32() will be faster when writing through the cache to external memory.
181
182 \sa shz_sq_memcpy32()
183*/
184SHZ_INLINE void* shz_memcpy32( void* SHZ_RESTRICT dst,
185 const void* SHZ_RESTRICT src,
186 size_t bytes) SHZ_NOEXCEPT;
187
188/*! Copies \p bytes from \p src to \p dst in 32-byte chunks, using the Store Queues.
189
190 Transfers from 8-byte aligned buffer, \p src to 4-byte aligned address, \p dst,
191 32 bytes at a time, writing through the cache, using the SH4's Store Queues.
192 Returns the \p dst address.
193
194 \warning
195 \p src must be at least 8-byte aligned, while \p dst can be only 4-byte aligned.
196 \p bytes must be a multiple of 32.
197
198 \note
199 This is the quickest way to move 32-byte chunks of data to *external memory*.
200 When copying to cached memory, you must invalidate the cache lines containing
201 \p dst before initiating the copy... Which means this routine becomes slower
202 than doing memory-to-memory copies with shz_memcpy32().
203
204 \sa shz_memcpy32(), shz_sq_memcpy32_1()
205*/
206SHZ_INLINE void* shz_sq_memcpy32( void* SHZ_RESTRICT dst,
207 const void* SHZ_RESTRICT src,
208 size_t bytes) SHZ_NOEXCEPT;
209
210/*! Copies \p bytes from \p src to \p dst in 32-byte chunks, using the Store Queues and XMTRX.
211
212 Equivalent to shz_sq_memcpy32(), except copying is done through XMTRX.
213
214 \warning
215 This routine clobbers XMTRX.
216*/
217SHZ_INLINE void* shz_sq_memcpy32_xmtrx( void* SHZ_RESTRICT dst,
218 const void* SHZ_RESTRICT src,
219 size_t bytes) SHZ_NOEXCEPT;
220
221/*! Specialized memcpy() variant for copying multiples of 64 bytes.
222
223 Copies from an 8-byte aligned buffer to a 32-byte aligned buffer, 64 bytes at a time.
224 Returns the \p dst address.
225
226 \warning
227 \p src and \p dst buffers must not overlap.
228
229 \warning
230 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned. \p bytes must
231 be a multiple of 64.
232*/
233SHZ_INLINE void* shz_memcpy64( void* SHZ_RESTRICT dst,
234 const void* SHZ_RESTRICT src,
235 size_t bytes) SHZ_NOEXCEPT;
236
237/*! Specialized memcpy() variant for copying multiples of 128 bytes.
238
239 Copies from an 8-byte aligned buffer to a 32-byte aligned buffer, 128 bytes at a time.
240 Returns the \p dst address.
241
242 \warning
243 \p src and \p dst buffers must not overlap.
244
245 \warning
246 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned. \p bytes must
247 be a multiple of 128.
248*/
249SHZ_INLINE void* shz_memcpy128( void* SHZ_RESTRICT dst,
250 const void* SHZ_RESTRICT src,
251 size_t bytes) SHZ_NOEXCEPT;
252
253//! @}
254
255/*! \name Constant-sized Operations
256 \brief Specialized routines for operating on statically sized buffers.
257 @{
258*/
259
260/*! Copies 8 shorts from \p src to \p dst.
261
262 \warning
263 \p src and \p dst buffers must not overlap.
264
265 \warning
266 \p dst and \p src must both be aligned by at least two bytes.
267*/
268SHZ_INLINE void shz_memcpy2_8( void* SHZ_RESTRICT dst,
269 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
270
271/*! Copies 16 shorts from \p src to \p dst.
272
273 \warning
274 \p src and \p dst buffers must not overlap.
275
276 \warning
277 \p dst and \p src must both be aligned by at least two bytes.
278*/
279SHZ_INLINE void shz_memcpy2_16( void* SHZ_RESTRICT dst,
280 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
281
282/*! Sets the values of the 16 shorts pointed to by \p dst to the given \p value.
283
284 \warning
285 \p dst must be aligned by at least two bytes.
286*/
287SHZ_INLINE void shz_memset2_16(void* dst, uint16_t value) SHZ_NOEXCEPT;
288
289/*! Copies 16 4-byte, long values from \p src to \p dst.
290
291 \warning
292 \p src and \p dst buffers must not overlap.
293
294 \warning
295 The \p src and \p dst buffers must both be at least 4-byte aligned.
296*/
297SHZ_INLINE void shz_memcpy4_16( void* SHZ_RESTRICT dst,
298 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
299
300/*! Copies 32 bytes from \p src to \p dst as a single chunk.
301
302 \warning
303 \p dst must be 32-byte aligned, while \p src can be only 8-byte aligned.
304*/
305SHZ_INLINE void shz_memcpy32_1( void* SHZ_RESTRICT dst,
306 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
307
308/*! Swaps the values within the given 32-byte buffers.
309
310 \warning
311 \p p1 and \p p2 must be at least 8-byte aligned.
312*/
313SHZ_INLINE void shz_memswap32_1(void* SHZ_RESTRICT p1,
314 void* SHZ_RESTRICT p2) SHZ_NOEXCEPT;
315
316/*! Swaps the values within the given 32-byte buffers, using XMTRX.
317
318 Equivalent to shz_memswap32_1(), except swapping is done through XMTRX.
319
320 \warning
321 This routine clobbers XMTRX!
322*/
323SHZ_INLINE void shz_memswap32_1_xmtrx(void* SHZ_RESTRICT p1,
324 void* SHZ_RESTRICT p2) SHZ_NOEXCEPT;
325
326/*! Copies \p src to \p dst in a single 32-byte transaction using the Store Queues.
327
328 \note
329 The Store Queues bypass the SH4's data-cache! They are typically used to
330 transfer to *external memory* and are slower for memory-to-memory transactions.
331
332 \warning
333 \p dst must be at least 4-byte aligned, while \p src must be at least 8-byte aligned.
334
335 \sa shz_memcpy32()
336*/
337SHZ_INLINE void* shz_sq_memcpy32_1( void* SHZ_RESTRICT dst,
338 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
339
340
341/*! Copies \p src to \p dst in a single 32-byte transaction using the Store Queues and XMTRX.
342
343 Equivalent to shz_sq_memcpy32_1(), except copying is done through XMTRX.
344
345 \warning
346 This routine clobbers XMTRX.
347
348 \sa shz_memcpy32()
349*/
350SHZ_INLINE void* shz_sq_memcpy32_1_xmtrx( void* SHZ_RESTRICT dst,
351 const void* SHZ_RESTRICT src) SHZ_NOEXCEPT;
352
353/*! Intrinsic around the SH4 `MOVCA.L` instruction.
354
355 Preallocates the cache-line containing \p src.
356
357 Zero-initializes all 32-bytes within the \p src cache-line,
358 setting the valid bit to `1`.
359*/
360SHZ_INLINE void shz_dcache_alloc_line(void* src) SHZ_NOEXCEPT;
361
362//! @}
363
364#include "inline/shz_mem.inl.h"
365
366SHZ_DECLS_END
367
368#endif
void * shz_memset8(void *dst, uint64_t value, size_t bytes) SHZ_NOEXCEPT
Assigns the given 8-byte value to the bytes in dst.
void * shz_memcpy4(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies from one 4-byte aligned buffer to another 4 bytes at a time.
void * shz_sq_memcpy32_xmtrx(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies bytes from src to dst in 32-byte chunks, using the Store Queues and XMTRX.
void * shz_sq_memcpy32(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies bytes from src to dst in 32-byte chunks, using the Store Queues.
void shz_memcpy2_16(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies 16 shorts from src to dst.
void shz_memswap32_1_xmtrx(void *SHZ_RESTRICT p1, void *SHZ_RESTRICT p2) SHZ_NOEXCEPT
Swaps the values within the given 32-byte buffers, using XMTRX.
void * shz_sq_memcpy32_1(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies src to dst in a single 32-byte transaction using the Store Queues.
void shz_memcpy2_8(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies 8 shorts from src to dst.
void shz_memswap32_1(void *SHZ_RESTRICT p1, void *SHZ_RESTRICT p2) SHZ_NOEXCEPT
Swaps the values within the given 32-byte buffers.
void shz_dcache_alloc_line(void *src) SHZ_NOEXCEPT
Intrinsic around the SH4 MOVCA.L instruction.
void * shz_memcpy2(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies from one 2-byte aligned buffer to another two bytes at a time.
void * shz_sq_memcpy32_1_xmtrx(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies src to dst in a single 32-byte transaction using the Store Queues and XMTRX.
void * shz_memmove(void *dst, const void *src, size_t bytes) SHZ_NOEXCEPT
Generic drop-in fast memmove() replacement.
void * shz_memcpy64(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Specialized memcpy() variant for copying multiples of 64 bytes.
void * shz_memcpy1(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies an unaligned buffer to another one byte at a time.
void * shz_memcpy(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Generic drop-in fast memcpy() replacement.
void shz_memcpy32_1(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies 32 bytes from src to dst as a single chunk.
void shz_memcpy4_16(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src) SHZ_NOEXCEPT
Copies 16 4-byte, long values from src to dst.
void * shz_memcpy32(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies bytes from the src to the dst buffer in 32-byte chunks.
void shz_memset2_16(void *dst, uint16_t value) SHZ_NOEXCEPT
Sets the values of the 16 shorts pointed to by dst to the given value.
void * shz_memcpy8(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Copies from one 8-byte aligned buffer to another 8 bytes at a time.
void * shz_memcpy128(void *SHZ_RESTRICT dst, const void *SHZ_RESTRICT src, size_t bytes) SHZ_NOEXCEPT
Specialized memcpy() variant for copying multiples of 128 bytes.