]>
Commit | Line | Data |
---|---|---|
1 | /* | |
2 | * Copyright (c) 2012 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | * | |
28 | * This file implements the following functions for the arm64 architecture. | |
29 | * | |
30 | * void bcopy(const void * source, | |
31 | * void * destination, | |
32 | * size_t length); | |
33 | * | |
34 | * void *memmove(void * destination, | |
35 | * const void * source, | |
36 | * size_t n); | |
37 | * | |
38 | * void *memcpy(void * restrict destination, | |
39 | * const void * restrict source, | |
40 | * size_t n); | |
41 | * | |
42 | * All copy n successive bytes from source to destination. Memmove and memcpy | |
43 | * return destination, whereas bcopy has no return value. Copying takes place | |
44 | * as if it were through a temporary buffer -- after return destination | |
45 | * contains exactly the bytes from source, even if the buffers overlap (this is | |
46 | * not required of memcpy by the C standard; its behavior is undefined if the | |
47 | * buffers overlap, but we are holding ourselves to the historical behavior of | |
48 | * this function on MacOS). | |
49 | */ | |
50 | ||
51 | #include "asm.h" | |
52 | ||
53 | .globl _bcopy // void bcopy(const void *source, void *destination, size_t length) | |
54 | .globl _ovbcopy // defined on the same address as _bcopy | |
55 | .globl _memcpy // void *memcpy(void *destination, const void *source, size_t n) | |
56 | .globl _memmove // defined on the same address as _memcpy | |
57 | ||
58 | /***************************************************************************** | |
59 | * Macros * | |
60 | *****************************************************************************/ | |
61 | ||
62 | #define kSmallCopy 64 /* lengths below this many bytes take the 8-byte/1-byte small-copy loops */ | |
63 | ||
64 | /***************************************************************************** | |
65 | * Entrypoints * | |
66 | *****************************************************************************/ | |
67 | ||
68 | .text | |
69 | .align 5 | |
70 | _bcopy: // in: x0 = source, x1 = destination, x2 = length | |
71 | _ovbcopy: // same entry point as _bcopy | |
72 | // Translate bcopy into memcpy by swapping the first and second arguments. | |
73 | mov x3, x0 // x3 = source (scratch; x3 is recomputed after the fall-through) | |
74 | mov x0, x1 // x0 = destination | |
75 | mov x1, x3 // x1 = source; no branch follows, so execution falls through into _memcpy | |
76 | ||
77 | .align 4 | |
78 | _memcpy: // in: x0 = destination, x1 = source, x2 = length; x0 is never written, so it is returned as-is | |
79 | _memmove: // same entry point as _memcpy | |
80 | // Our preference is to copy the data in ascending address order, but if the | |
81 | // buffers overlap such that the beginning of the destination buffer aliases | |
82 | // the end of the source buffer, we need to copy in descending address order | |
83 | // instead to preserve the memmove semantics. We detect this case with the | |
84 | // test: | |
85 | // | |
86 | // destination - source < length (unsigned compare) | |
87 | // | |
88 | // If the address of the source buffer is higher than the address of the | |
89 | // destination buffer, this arithmetic can overflow, but the overflowed value | |
90 | // can only be smaller than length if the buffers do not overlap, so we don't | |
91 | // need to worry about false positives due to the overflow (they happen, but | |
92 | // only in cases where copying in either order is correct). | |
93 | ARM64_STACK_PROLOG // NOTE(review): macro not defined in this file; presumably from asm.h | |
94 | PUSH_FRAME // NOTE(review): macro not defined in this file; every exit path pairs it with POP_FRAME | |
95 | sub x3, x0, x1 // x3 = destination - source (wraps when source > destination) | |
96 | cmp x3, x2 | |
97 | b.cc L_reverse // unsigned (destination - source) < length: copy in descending order | |
98 | mov x3, x0 // copy destination pointer | |
99 | cmp x2, #(kSmallCopy) | |
100 | b.cc L_forwardSmallCopy // length < kSmallCopy: simple 8-byte/1-byte loops | |
101 | ||
102 | /***************************************************************************** | |
103 | * Forward large copy * | |
104 | *****************************************************************************/ | |
105 | ||
106 | // Load the first 32 bytes from src, and compute the number of bytes to the | |
107 | // first 32-byte aligned location in dst. Even though we are going to copy | |
108 | // 32 bytes, only those preceding that 32-byte location "count" towards | |
109 | // reducing the length of the buffer or advancing the pointers. We will need | |
110 | // to issue the first load from the advanced src pointer BEFORE the store to | |
111 | // the unmodified dst pointer. | |
112 | add x3, x3, #32 // x3 = dst + 32 (x3 was set to dst at entry) | |
113 | and x3, x3, #-32 // aligned dst | |
114 | ldp x12,x13,[x1] // first 32 bytes of src | |
115 | ldp x14,x15,[x1, #16] | |
116 | sub x5, x3, x0 // bytes between original dst and aligned dst (1..32) | |
117 | add x1, x1, x5 // update src pointer | |
118 | ||
119 | // At this point, data in the following registers is in flight: | |
120 | // | |
121 | // x0 original dst pointer | |
122 | // x1 corresponding location in src buffer. | |
123 | // x2 length from aligned location in dst to end of buffer. This is | |
124 | // guaranteed to be >= (64 - 32). (Applied by `sub x2, x2, x5` below.) | |
125 | // x3 aligned location in dst buffer. | |
126 | // x12:x15 first 32 bytes of src buffer. | |
127 | // | |
128 | // We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x0. The | |
129 | // store *may* overlap the first 32 bytes of the load, so in order to get | |
130 | // correct memmove semantics, the first 32 byte load must occur before the | |
131 | // store. | |
132 | // | |
133 | // After loading these 32 bytes, we advance x1, and decrement the length by | |
134 | // 64. If the remaining length of the buffer was 64 or less, then we jump | |
135 | // directly to the cleanup path. | |
136 | ldp x8, x9, [x1] | |
137 | ldp x10,x11,[x1, #16] | |
138 | add x1, x1, #32 | |
139 | sub x2, x2, x5 // update length | |
140 | stp x12,x13,[x0] // initial unaligned store | |
141 | stp x14,x15,[x0, #16] // initial unaligned store | |
142 | subs x2, x2, #64 | |
143 | b.ls L_forwardCleanup // unsigned lower-or-same: 64 or fewer bytes remained | |
144 | ||
145 | L_forwardCopyLoop: | |
146 | // Main copy loop: | |
147 | // | |
148 | // 1. store the 32 bytes loaded in the previous loop iteration | |
149 | // 2. advance the destination pointer | |
150 | // 3. load the next 32 bytes | |
151 | // 4. advance the source pointer | |
152 | // 5. subtract 32 from the length | |
153 | // | |
154 | // The loop is terminated when 32 or fewer bytes remain to be loaded. Those | |
155 | // trailing 1-32 bytes will be copied in the loop cleanup. | |
156 | stnp x8, x9, [x3] // x3 is 32-byte aligned here (aligned in the setup, stepped by 32) | |
157 | stnp x10,x11,[x3, #16] | |
158 | add x3, x3, #32 | |
159 | ldnp x8, x9, [x1] // x1 carries no alignment guarantee | |
160 | ldnp x10,x11,[x1, #16] | |
161 | add x1, x1, #32 | |
162 | subs x2, x2, #32 | |
163 | b.hi L_forwardCopyLoop // unsigned higher: more than 32 bytes were still unloaded | |
164 | ||
165 | L_forwardCleanup: | |
166 | // There are 32 bytes in x8-x11 that were loaded in the previous loop | |
167 | // iteration, which need to be stored to [x3,x3+32). In addition, between | |
168 | // 0 and 32 more bytes need to be copied from x1 to x3 + 32. The exact | |
169 | // number of bytes to copy is x2 + 32. Instead of using smaller conditional | |
170 | // copies, we simply copy the final 32 unaligned bytes, from x1+x2 to | |
171 | // 32+x3+x2 (ending at 64+x3+x2). This copy may overlap with the first store, | |
172 | // so the loads must come before the store of the data from the previous loop iteration. | |
173 | add x1, x1, x2 // x2 is zero or negative here: x1 = 32 bytes before the end of src | |
174 | ldp x12,x13,[x1] | |
175 | ldp x14,x15,[x1, #16] | |
176 | stp x8, x9, [x3] // pending 32 bytes, stored only after the loads above | |
177 | stp x10,x11,[x3, #16] | |
178 | add x3, x3, x2 | |
179 | stp x12,x13,[x3, #32] // final 32 bytes of dst: [x3+32, x3+64) | |
180 | stp x14,x15,[x3, #48] | |
181 | POP_FRAME | |
182 | ARM64_STACK_EPILOG // NOTE(review): no explicit ret follows, so this macro is expected to return; confirm in asm.h | |
183 | ||
184 | /***************************************************************************** | |
185 | * forward small copy * | |
186 | *****************************************************************************/ | |
187 | ||
188 | // Copy 8 bytes at a time until less than 8 bytes remain to be copied, then | |
189 | // finish one byte at a time. At the point of entry to L_forwardSmallCopy, | |
190 | // the "calling convention" is as follows: | |
191 | // | |
192 | // x0 pointer to first byte of destination (left untouched) | |
193 | // x1 pointer to first byte of source | |
194 | // x2 length of buffers | |
195 | // x3 pointer to first byte of destination (working copy, advanced) | |
196 | 0: ldr x6, [x1],#8 // load 8 bytes, then x1 += 8 | |
197 | str x6, [x3],#8 // store 8 bytes, then x3 += 8 | |
198 | L_forwardSmallCopy: // entry point; the 8-byte loop body sits above this label | |
199 | subs x2, x2, #8 | |
200 | b.cs 0b // no borrow: at least 8 bytes remained | |
201 | adds x2, x2, #8 // undo the last subtraction: x2 = 0..7 leftover bytes | |
202 | b.eq 2f // nothing left to copy | |
203 | 1: ldrb w6, [x1],#1 // byte-at-a-time tail | |
204 | strb w6, [x3],#1 | |
205 | subs x2, x2, #1 | |
206 | b.ne 1b | |
207 | 2: POP_FRAME | |
208 | ARM64_STACK_EPILOG | |
209 | ||
210 | /***************************************************************************** | |
211 | * Reverse copy engines * | |
212 | *****************************************************************************/ | |
213 | ||
214 | // The reverse copy engines are identical in every way to the forward copy | |
215 | // engines, except in that they do everything backwards. For this reason, they | |
216 | // are somewhat more sparsely commented than the forward copy loops. I have | |
217 | // tried to only comment things that might be somewhat surprising in how they | |
218 | // differ from the forward implementation. | |
219 | // | |
220 | // The one important thing to note is that (almost without fail), x1 and the | |
221 | // destination cursor (x4 in the small copy, x3 in the large copy loop) will | |
222 | // point to ONE BYTE BEYOND the "right-hand edge" of the active buffer | |
223 | // throughout these copy loops. x1 and x4 are initially advanced to that position | |
224 | // in the L_reverse jump island. Because of this, whereas the forward copy | |
225 | // loops generally follow a "copy data, then advance pointers" scheme, in the reverse copy loops, we advance the pointers, then copy the data. | |
226 | ||
227 | L_reverse: // in: x3 = destination - source, already known to be < length (unsigned) | |
228 | // As a minor optimization, we early out if dst == src. | |
229 | cbz x3, L_return | |
230 | // advance both pointers to the ends of their respective buffers before | |
231 | // jumping into the appropriate reverse copy loop. | |
232 | add x4, x0, x2 // x4 = one past the end of dst (x0 itself is left untouched) | |
233 | add x1, x1, x2 // x1 = one past the end of src | |
234 | cmp x2, #(kSmallCopy) | |
235 | b.cc L_reverseSmallCopy // length < kSmallCopy | |
236 | ||
237 | /***************************************************************************** | |
238 | * Reverse large copy * | |
239 | *****************************************************************************/ | |
240 | ||
241 | ldp x12,x13,[x1, #-16] // last 32 bytes of src | |
242 | ldp x14,x15,[x1, #-32] | |
243 | sub x3, x4, #1 // In the forward copy, we used dst+32 & -32 | |
244 | and x3, x3, #-32 // to find an aligned location in the dest | |
245 | sub x5, x4, x3 // buffer. Here we use (end of dst)-1 & -32 | |
246 | sub x1, x1, x5 // instead, because we are going backwards. | |
247 | sub x2, x2, x5 // x5 = 1..32 bytes between aligned x3 and the end of dst | |
248 | ldp x8, x9, [x1, #-16] // next 32 bytes; loaded before the stores below, which may overlap them | |
249 | ldp x10,x11,[x1, #-32] | |
250 | stp x12,x13,[x4, #-16] // initial unaligned store of the last 32 bytes of dst | |
251 | stp x14,x15,[x4, #-32] | |
252 | sub x1, x1, #32 | |
253 | subs x2, x2, #64 | |
254 | b.ls L_reverseCleanup // unsigned lower-or-same: 64 or fewer bytes remained | |
255 | ||
256 | L_reverseCopyLoop: // mirror of L_forwardCopyLoop, walking down from the 32-byte aligned x3 | |
257 | stnp x8, x9, [x3, #-16] // store the 32 bytes loaded on the previous pass to [x3-32, x3) | |
258 | stnp x10,x11,[x3, #-32] | |
259 | sub x3, x3, #32 | |
260 | ldnp x8, x9, [x1, #-16] // load the next 32 bytes from [x1-32, x1) | |
261 | ldnp x10,x11,[x1, #-32] | |
262 | sub x1, x1, #32 | |
263 | subs x2, x2, #32 | |
264 | b.hi L_reverseCopyLoop // unsigned higher: more than 32 bytes were still unloaded | |
265 | ||
266 | L_reverseCleanup: // x8-x11 still hold 32 unstored bytes; x2 + 32 further bytes remain at the start | |
267 | sub x1, x1, x2 // x2 is zero or negative here: x1 = start of src + 32 | |
268 | ldp x12,x13,[x1, #-16] // bytes 16..31 of src | |
269 | ldp x14,x15,[x1, #-32] // bytes 0..15 of src | |
270 | stp x8, x9, [x3, #-16] // pending 32 bytes, stored only after the loads above | |
271 | stp x10,x11,[x3, #-32] | |
272 | stp x12,x13,[x0, #16] // In the forward copy, we need to compute the | |
273 | stp x14,x15,[x0] // address of these stores, but here we already | |
274 | POP_FRAME // have a pointer to the start of the buffer. | |
275 | ARM64_STACK_EPILOG | |
276 | ||
277 | /***************************************************************************** | |
278 | * reverse small copy * | |
279 | *****************************************************************************/ | |
280 | ||
281 | 0: ldr x6, [x1,#-8]! // x1 -= 8, then load 8 bytes | |
282 | str x6, [x4,#-8]! // x4 -= 8, then store 8 bytes | |
283 | L_reverseSmallCopy: // entry point; x1/x4 = one past the end of src/dst, x2 = length | |
284 | subs x2, x2, #8 | |
285 | b.cs 0b // no borrow: at least 8 bytes remained | |
286 | adds x2, x2, #8 // undo the last subtraction: x2 = 0..7 leftover bytes | |
287 | b.eq 2f // nothing left to copy | |
288 | 1: ldrb w6, [x1,#-1]! // byte-at-a-time tail, still descending | |
289 | strb w6, [x4,#-1]! | |
290 | subs x2, x2, #1 | |
291 | b.ne 1b | |
292 | 2: POP_FRAME | |
293 | ARM64_STACK_EPILOG | |
294 | ||
295 | ||
296 | L_return: // reached from L_reverse when dst == src: nothing to copy | |
297 | POP_FRAME | |
298 | ARM64_STACK_EPILOG | |