/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <machine/cpu_capabilities.h>
#include "platfunc.h"			// PLATFUNC_FUNCTION_START / PLATFUNC_DESCRIPTOR macros
/*
 * The bcopy/memcpy loops, tuned for Nehalem.
 *
 * The following #defines are tightly coupled to the micro-architecture:
 */
#define kShort	80			// too short to bother with SSE (must be >=80)
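// Note: the SSE path is taken only for lengths > kShort.  Aligning the destination
// consumes at most 15 bytes, so at least 64 bytes remain and the 64-byte inner
// loops below always execute at least once.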
// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START(bcopy, sse42, 32, 5)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	movl	8(%ebp),%esi		// get source ptr
	movl	12(%ebp),%edi		// get dest ptr
	movl	16(%ebp),%ecx		// get length
	movl	%edi,%edx		// copy dest ptr
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	jb	LReverseIsland		// reverse move required
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	jbe	Lshort			// no, use the short-copy loops
	jmp	LNotShort		// yes, use the SSE path
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);

PLATFUNC_FUNCTION_START(memcpy, sse42, 32, 0)	// void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START(memmove, sse42, 32, 0)	// void *memmove(void *dst, const void *src, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	movl	8(%ebp),%edi		// get dest ptr
	movl	12(%ebp),%esi		// get source ptr
	movl	16(%ebp),%ecx		// get length
	movl	%edi,%edx		// copy dest ptr
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	jb	LReverseIsland		// reverse move required
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	ja	LNotShort		// yes, use the SSE path
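// Note on the overlap test above: (dest - source) and the length are compared as
// unsigned values, so the single "jb" catches exactly the case where the
// destination starts inside the source buffer, the only case that forces a
// reverse (descending) copy.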
// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

Lshort:
	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// get #doublewords
	jz	3f			// no whole doublewords to move
2:					// loop copying doublewords
	movl	(%esi),%eax
	addl	$4,%esi
	movl	%eax,(%edi)
	addl	$4,%edi
	dec	%ecx
	jnz	2b
3:					// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	5f			// no, done
4:					// loop copying bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	4b
5:
	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
	popl	%edi
	popl	%esi
	popl	%ebp
	ret
LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		//   ...because reverse moves are uncommon
// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
	movl	%edi,%edx		// copy destination
	negl	%edx
	andl	$15,%edx		// get #bytes to align destination
	jz	LDestAligned		// already aligned
	subl	%edx,%ecx		// decrement length
1:					// loop copying 1..15 bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	1b
// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
	movl	%ecx,%edx		// copy length
	andl	$63,%ecx		// get remaining bytes for Lshort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	addl	%edx,%esi		// point to 1st byte not copied
	addl	%edx,%edi
	negl	%edx			// now generate offset to 1st byte to be copied
	testl	$15,%esi		// source also aligned?
	jnz	LUnalignedLoop		// no, use unaligned loads
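// Note: edx holds the negated byte count for the inner loops, so (%esi,%edx) and
// (%edi,%edx) address the data while edx counts up toward zero; the single
// "addl $64,%edx" both advances the offset and provides the loop-termination test.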
// Forward loop for aligned operands.

	.align	4,0x90			// 16-byte align inner loops
LAlignedLoop:				// loop over 64-byte chunks
	movdqa	(%esi,%edx),%xmm0
	movdqa	16(%esi,%edx),%xmm1
	movdqa	32(%esi,%edx),%xmm2
	movdqa	48(%esi,%edx),%xmm3

	movdqa	%xmm0,(%edi,%edx)
	movdqa	%xmm1,16(%edi,%edx)
	movdqa	%xmm2,32(%edi,%edx)
	movdqa	%xmm3,48(%edi,%edx)

	addl	$64,%edx		// advance offset (counts up toward 0)
	jne	LAlignedLoop

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Forward loop for unaligned operands.
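// Only the loads are unaligned here; the stores still use MOVDQA because the
// destination was 16-byte aligned above.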
	.align	4,0x90			// 16-byte align inner loops
LUnalignedLoop:				// loop over 64-byte chunks
	movdqu	(%esi,%edx),%xmm0
	movdqu	16(%esi,%edx),%xmm1
	movdqu	32(%esi,%edx),%xmm2
	movdqu	48(%esi,%edx),%xmm3

	movdqa	%xmm0,(%edi,%edx)
	movdqa	%xmm1,16(%edi,%edx)
	movdqa	%xmm2,32(%edi,%edx)
	movdqa	%xmm3,48(%edi,%edx)

	addl	$64,%edx		// advance offset (counts up toward 0)
	jne	LUnalignedLoop

	jmp	Lshort			// copy remaining 0..63 bytes and done
// Reverse moves.  They are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

LReverse:
	addl	%ecx,%esi		// point to end of strings
	addl	%ecx,%edi
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	ja	LReverseNotShort	// yes
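// Note: destructive overlap means the destination begins inside the source, so a
// forward copy would overwrite source bytes before they are read.  Copying from
// the high addresses downward avoids that.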
// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// #words
	jz	3f			// no whole words to move
1:					// loop copying words backwards
	subl	$4,%esi
	movl	(%esi),%eax
	subl	$4,%edi
	movl	%eax,(%edi)
	dec	%ecx
	jnz	1b
3:					// handle leftover bytes (0..3)
	andl	$3,%edx			// bytes?
	jz	5f			// no, done
4:					// loop copying bytes backwards
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	4b
5:
	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
	popl	%edi
	popl	%esi
	popl	%ebp
	ret
// Handle a reverse move long enough to justify using SSE.
//      ecx = length (> kShort)
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
	movl	%edi,%edx		// copy destination
	andl	$15,%edx		// get #bytes to align destination
	je	LReverseDestAligned	// already aligned
	subl	%edx,%ecx		// adjust length
1:					// loop copying 1..15 bytes
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	1b
// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
	movl	%ecx,%edx		// copy length
	andl	$63,%ecx		// get remaining bytes for LReverseShort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	subl	%edx,%esi		// point to endpoint of copy
	subl	%edx,%edi
	testl	$15,%esi		// is source aligned too?
	jnz	LReverseUnalignedLoop	// no, use unaligned loads
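// When we enter the reverse copy loops:
//      ecx = residual length (0..63), finished by LReverseShort
//      edx = bytes to move in the inner loop, a multiple of 64, counted down to 0
//      esi = ptr to lowest source byte the inner loop will move
//      edi = ptr to lowest dest byte the inner loop will move (16-byte aligned)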
LReverseAlignedLoop:			// loop over 64-byte chunks
	movdqa	-16(%esi,%edx),%xmm0
	movdqa	-32(%esi,%edx),%xmm1
	movdqa	-48(%esi,%edx),%xmm2
	movdqa	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	subl	$64,%edx		// retreat toward the start of the buffers
	jne	LReverseAlignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done
// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines, so plain MOVDQU
// loads are used.
LReverseUnalignedLoop:			// loop over 64-byte chunks
	movdqu	-16(%esi,%edx),%xmm0
	movdqu	-32(%esi,%edx),%xmm1
	movdqu	-48(%esi,%edx),%xmm2
	movdqu	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	subl	$64,%edx		// retreat toward the start of the buffers
	jne	LReverseUnalignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done
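// The descriptors below register these SSE4.2 variants with the platfunc
// dispatcher so they can be selected at runtime when the CPU reports the
// kHasSSE4_2 capability bit (see cpu_capabilities.h).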
PLATFUNC_DESCRIPTOR(bcopy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memcpy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memmove,sse42,kHasSSE4_2,0)