/*
 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the arm64 architecture.
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 * All three copy n successive bytes from source to destination.  Memmove and
 * memcpy return destination, whereas bcopy has no return value.  Copying
 * takes place as if it were through a temporary buffer -- after return the
 * destination contains exactly the bytes from source, even if the buffers
 * overlap (this is not required of memcpy by the C standard; its behavior is
 * undefined if the buffers overlap, but we are holding ourselves to the
 * historical behavior of this function on MacOS).
 */

#include "asm.h"

        .globl _bcopy
        .globl _ovbcopy
        .globl _memcpy
        .globl _memmove

/*****************************************************************************
 * Macros                                                                    *
 *****************************************************************************/

#define kSmallCopy 64
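// Buffers shorter than kSmallCopy bytes take the quadword-at-a-time small
// copy paths below instead of the 32-byte streaming loops.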

/*****************************************************************************
 * Entrypoints                                                               *
 *****************************************************************************/

        .text
        .align 5
_bcopy:
_ovbcopy:
// Translate bcopy into memcpy by swapping the first and second arguments.
        mov     x3, x0
        mov     x0, x1
        mov     x1, x3
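// x0 and x1 now hold destination and source in memcpy argument order, and we
// fall through into _memcpy/_memmove below.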

        .align 4
_memcpy:
_memmove:
// Our preference is to copy the data in ascending address order, but if the
// buffers overlap such that the beginning of the destination buffer aliases
// the end of the source buffer, we need to copy in descending address order
// instead to preserve the memmove semantics.  We detect this case with the
// test:
//
//      destination - source < length (unsigned compare)
//
// If the address of the source buffer is higher than the address of the
// destination buffer, this arithmetic can overflow, but the overflowed value
// can only be smaller than length if the buffers do not overlap, so we don't
// need to worry about false positives due to the overflow (they happen, but
// only in cases where copying in either order is correct).
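//
// As an illustrative C-level sketch of this dispatch (not part of the source;
// the helper names are hypothetical), the test above amounts to:
//
//      if ((uintptr_t)dst - (uintptr_t)src < length)
//              copy_descending();      // dst aliases the tail of src
//      else
//              copy_ascending();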
        ARM64_STACK_PROLOG
        PUSH_FRAME
        sub     x3, x0, x1
        cmp     x3, x2
        b.cc    L_reverse
        mov     x3, x0                  // copy destination pointer
        cmp     x2, #(kSmallCopy)
        b.cc    L_forwardSmallCopy

/*****************************************************************************
 * Forward large copy                                                        *
 *****************************************************************************/

// Load the first 32 bytes from src, and compute the number of bytes to the
// first 32-byte aligned location in dst.  Even though we are going to copy
// 32 bytes, only those preceding that 32-byte location "count" towards
// reducing the length of the buffer or advancing the pointers.  We will need
// to issue the first load from the advanced src pointer BEFORE the store to
// the unmodified dst pointer.
        add     x3, x3, #32
        and     x3, x3, #-32            // aligned dst
        ldp     x12,x13,[x1]
        ldp     x14,x15,[x1, #16]
        sub     x5, x3, x0              // bytes between original dst and aligned dst
        add     x1, x1, x5              // update src pointer

// At this point, data in the following registers is in flight:
//
//      x0      original dst pointer
//      x1      corresponding location in src buffer.
//      x2      length from aligned location in dst to end of buffer.  This is
//              guaranteed to be >= (64 - 32).
//      x3      aligned location in dst buffer.
//      x12:x15 first 32 bytes of src buffer.
//
// We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x3.  The
// store *may* overlap the first 32 bytes of the load, so in order to get
// correct memmove semantics, the first 32 byte load must occur before the
// store.
//
// After loading these 32 bytes, we advance x1, and decrement the length by
// 64.  If the remaining length of the buffer was less than 64, then we jump
// directly to the cleanup path.
        ldp     x8, x9, [x1]
        ldp     x10,x11,[x1, #16]
        add     x1, x1, #32
        sub     x2, x2, x5              // update length
        stp     x12,x13,[x0]            // initial unaligned store
        stp     x14,x15,[x0, #16]       // initial unaligned store
        subs    x2, x2, #64
        b.ls    L_forwardCleanup

L_forwardCopyLoop:
// Main copy loop:
//
//   1. store the 32 bytes loaded in the previous loop iteration
//   2. advance the destination pointer
//   3. load the next 32 bytes
//   4. advance the source pointer
//   5. subtract 32 from the length
//
// The loop is terminated when 32 or fewer bytes remain to be loaded.  Those
// trailing 1-32 bytes will be copied in the loop cleanup.
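//
// The loop body uses the non-temporal pair instructions (ldnp/stnp), which
// hint to the memory system that the copied data need not be kept resident
// in the caches -- a good fit for large bulk copies.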
        stnp    x8, x9, [x3]
        stnp    x10,x11,[x3, #16]
        add     x3, x3, #32
        ldnp    x8, x9, [x1]
        ldnp    x10,x11,[x1, #16]
        add     x1, x1, #32
        subs    x2, x2, #32
        b.hi    L_forwardCopyLoop

L_forwardCleanup:
// There are 32 bytes in x8-x11 that were loaded in the previous loop
// iteration, which need to be stored to [x3,x3+32).  In addition, between
// 0 and 32 more bytes need to be copied from x1 to x3 + 32.  The exact
// number of bytes to copy is x2 + 32.  Instead of using smaller conditional
// copies, we simply copy 32 unaligned bytes from x1+x2 to 32+x3+x2, so that
// the copy ends exactly at the end of the buffer (64+x3+x2).
// This copy may overlap with the first store, so the loads must come before
// the store of the data from the previous loop iteration.
        add     x1, x1, x2
        ldp     x12,x13,[x1]
        ldp     x14,x15,[x1, #16]
        stp     x8, x9, [x3]
        stp     x10,x11,[x3, #16]
        add     x3, x3, x2
        stp     x12,x13,[x3, #32]
        stp     x14,x15,[x3, #48]
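// x0 has not been modified since entry, so it still holds the original
// destination pointer that memcpy and memmove return.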
        POP_FRAME
        ARM64_STACK_EPILOG

/*****************************************************************************
 * Forward small copy                                                        *
 *****************************************************************************/

// Copy one quadword at a time until less than 8 bytes remain to be copied.
// At the point of entry to L_forwardSmallCopy, the "calling convention"
// is as follows:
//
//      x0      pointer to first byte of destination
//      x1      pointer to first byte of source
//      x2      length of buffers
//      x3      pointer to first byte of destination
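//
// The loop is entered at L_forwardSmallCopy, below its body: the subs/b.cs
// pair copies one quadword per backwards branch to 0:, so nothing is copied
// once fewer than 8 bytes remain; any trailing 1-7 bytes are then copied one
// at a time.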
0:      ldr     x6, [x1],#8
        str     x6, [x3],#8
L_forwardSmallCopy:
        subs    x2, x2, #8
        b.cs    0b
        adds    x2, x2, #8
        b.eq    2f
1:      ldrb    w6, [x1],#1
        strb    w6, [x3],#1
        subs    x2, x2, #1
        b.ne    1b
2:      POP_FRAME
        ARM64_STACK_EPILOG

/*****************************************************************************
 * Reverse copy engines                                                      *
 *****************************************************************************/

// The reverse copy engines are identical in every way to the forward copy
// engines, except in that they do everything backwards.  For this reason, they
// are somewhat more sparsely commented than the forward copy loops.  I have
// tried to only comment things that might be somewhat surprising in how they
// differ from the forward implementation.
//
// The one important thing to note is that (almost without fail), x1 and x3
// will point to ONE BYTE BEYOND the "right-hand edge" of the active buffer
// throughout these copy loops.  They are initially advanced to that position
// in the L_reverse jump island.  Because of this, whereas the forward copy
// loops generally follow a "copy data, then advance pointers" scheme, in the
// reverse copy loops, we advance the pointers, then copy the data.

L_reverse:
// As a minor optimization, we early out if dst == src.
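// (x3 still holds dst - src from the overlap test above, so it is zero
// exactly when the two buffers are identical.)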
        cbz     x3, L_return
// advance both pointers to the ends of their respective buffers before
// jumping into the appropriate reverse copy loop.
        add     x4, x0, x2
        add     x1, x1, x2
        cmp     x2, #(kSmallCopy)
        b.cc    L_reverseSmallCopy

/*****************************************************************************
 * Reverse large copy                                                        *
 *****************************************************************************/

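// Mirror of the forward large copy: load the last 32 bytes of src, find the
// last 32-byte aligned location strictly below the end of dst, and then
// stream 32 bytes per iteration moving towards the start of the buffers.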
        ldp     x12,x13,[x1, #-16]
        ldp     x14,x15,[x1, #-32]
        sub     x3, x4, #1              // In the forward copy, we used dst+32 & -32
        and     x3, x3, #-32            // to find an aligned location in the dest
        sub     x5, x4, x3              // buffer.  Here we use dst-1 & -32 instead,
        sub     x1, x1, x5              // because we are going backwards.
        sub     x2, x2, x5
        ldp     x8, x9, [x1, #-16]
        ldp     x10,x11,[x1, #-32]
        stp     x12,x13,[x4, #-16]
        stp     x14,x15,[x4, #-32]
        sub     x1, x1, #32
        subs    x2, x2, #64
        b.ls    L_reverseCleanup

L_reverseCopyLoop:
        stnp    x8, x9, [x3, #-16]
        stnp    x10,x11,[x3, #-32]
        sub     x3, x3, #32
        ldnp    x8, x9, [x1, #-16]
        ldnp    x10,x11,[x1, #-32]
        sub     x1, x1, #32
        subs    x2, x2, #32
        b.hi    L_reverseCopyLoop

L_reverseCleanup:
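// Mirror of L_forwardCleanup: x8-x11 still need to be stored to [x3-32, x3),
// and the remaining x2 + 32 bytes are handled by one unaligned 32-byte copy
// of the first 32 bytes of the buffer.  That copy may overlap the register
// store, so (as in the forward version) the loads come first.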
        sub     x1, x1, x2
        ldp     x12,x13,[x1, #-16]
        ldp     x14,x15,[x1, #-32]
        stp     x8, x9, [x3, #-16]
        stp     x10,x11,[x3, #-32]
        stp     x12,x13,[x0, #16]       // In the forward copy, we need to compute the
        stp     x14,x15,[x0]            // address of these stores, but here we already
        POP_FRAME                       // have a pointer to the start of the buffer.
        ARM64_STACK_EPILOG

/*****************************************************************************
 * Reverse small copy                                                        *
 *****************************************************************************/

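// Mirror of the forward small copy.  x1 and x4 point one byte beyond the end
// of the remaining source and destination, and the pre-indexed addressing
// modes walk them backwards through the buffers.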
0:      ldr     x6, [x1,#-8]!
        str     x6, [x4,#-8]!
L_reverseSmallCopy:
        subs    x2, x2, #8
        b.cs    0b
        adds    x2, x2, #8
        b.eq    2f
1:      ldrb    w6, [x1,#-1]!
        strb    w6, [x4,#-1]!
        subs    x2, x2, #1
        b.ne    1b
2:      POP_FRAME
        ARM64_STACK_EPILOG


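// Early-out path for dst == src: nothing needs to be copied, and x0 already
// holds the destination pointer that memcpy and memmove return.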
L_return:
        POP_FRAME
        ARM64_STACK_EPILOG