/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include "platfunc.h"

/*
 * The bcopy/memcpy loops, tuned for Nehalem.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort  80                      // too short to bother with SSE (must be >=80)
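// (Rationale: the SSE path is taken only when length > kShort, and at most 15
// bytes are spent aligning the destination, so at least 81-15 = 66 bytes remain
// and the 64-byte inner loops always run at least once.)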


// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START(bcopy, sse42, 64, 5)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // xchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort
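
// Note on the overlap test above: the compare is unsigned, so when the
// destination lies below the source, (dest - source) wraps to a huge value and
// the forward path is taken.  The reverse path is used only when the
// destination starts inside the source buffer (source <= dest < source+length).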

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//

PLATFUNC_FUNCTION_START(memcpy, sse42, 64, 0)   // void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START(memmove, sse42, 64, 0)  // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      3f
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
3:                                      // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret
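
// (On the bcopy path %r11 is never initialized, but that is harmless: bcopy
// returns void, so callers ignore whatever ends up in %rax here.)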


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%rsi),%al
        inc     %rsi
        movb    %al,(%rdi)
        inc     %rdi
        dec     %ecx
        jnz     1b


// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        negq    %rcx                    // now generate offset to 1st byte to be copied
        testl   $15,%esi                // source also aligned?
        jnz     LUnalignedLoop
        jmp     LAlignedLoop
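
// (Negative-index trick: rsi/rdi point just past the 64-byte-aligned portion
// and rcx counts up from -(bytes to move) toward zero, so each iteration of the
// inner loops needs only one addq, whose flags also drive the jnz.)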


// Forward loop for aligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LAlignedLoop:                           // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LAlignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LUnalignedLoop:                         // loop over 64-byte chunks
        movdqu  (%rsi,%rcx),%xmm0
        movdqu  16(%rsi,%rcx),%xmm1
        movdqu  32(%rsi,%rcx),%xmm2
        movdqu  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LUnalignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done
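
// (The two forward loops differ only in their loads; the stores always use
// MOVDQA, since the destination was 16-byte aligned above.)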


// Reverse moves.  These are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr
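// Copying from the top down reads each source byte before the copy overwrites
// it, which is what makes the source <= dest < source+length case safe.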

LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        jz      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no
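
// (Unlike the forward loops, rcx here is positive and counts down to zero;
// rsi+rcx and rdi+rcx point one byte past the 64-byte chunk being moved, and
// the -16..-64 displacements pick it up from the top down.)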

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


PLATFUNC_DESCRIPTOR(bcopy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memcpy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memmove,sse42,kHasSSE4_2,0)
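
// The descriptors above register these sse42 variants with the platfunc
// dispatcher (see platfunc.h); presumably each is selected at runtime only
// when the CPU reports the kHasSSE4_2 capability bit from cpu_capabilities.h,
// with the trailing 0 naming no forbidden capabilities.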