/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Nehalem.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort  80                      // too short to bother with SSE (must be >=80)
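
// In outline, the routines below do the following (dst/src/len given in
// memcpy argument order):
//
//      if ((unsigned)(dst - src) < len)        // destructive overlap
//              copy backward, from the high addresses down
//      else if (len <= kShort)
//              copy forward with simple doubleword/byte loops
//      else
//              16-byte align the destination, move 64 bytes per iteration
//              with SSE, then finish the 0..63 residual bytes in the short
//              loops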

// void bcopy(const void *src, void *dst, size_t len);

COMMPAGE_FUNCTION_START(bcopy_sse42, 32, 5)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort
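
// Note: the unsigned (dest - source) compare above does double duty.  If the
// destination starts below the source, the subtraction wraps to a large
// unsigned value, the "jb" is not taken, and the (safe) forward path is used;
// the reverse path is entered only for genuinely destructive overlap.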

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//
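// (The commpage lays its entry points out at fixed addresses -- see the
// _COMM_PAGE_* definitions in the included cpu_capabilities.h -- so the
// memcpy/memmove entry must sit exactly 32 bytes past the bcopy entry; the
// .align directive below enforces that spacing.)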

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr
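// For example, a 50-byte copy runs the doubleword loop 50>>2 = 12 times (48
// bytes) and the byte loop 50&3 = 2 times.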

Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      3f
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
3:                                      // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        movl    %edi,%edx               // copy destination
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)
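//
// For example, with 200 bytes left and the destination already aligned, the
// setup below leaves ecx = 200&63 = 8, edx = -(200&~63) = -192, and esi/edi
// advanced by 192, so the inner loop steps edx from -192 up to 0 in three
// 64-byte iterations before jumping to Lshort for the last 8 bytes.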

LDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        testl   $15,%esi                // is source aligned too?
        jnz     LUnalignedLoop          // no


// Forward loop for aligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LAlignedLoop:                           // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx                // advance to next 64-byte chunk
        jnz     LAlignedLoop            // loop until edx reaches 0

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.
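// Only the loads need MOVDQU here: the destination was 16-byte aligned above,
// so the stores can stay MOVDQA; only the source may be misaligned.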

        .align  4,0x90                  // 16-byte align inner loops
LUnalignedLoop:                         // loop over 64-byte chunks
        movdqu  (%esi,%edx),%xmm0
        movdqu  16(%esi,%edx),%xmm1
        movdqu  32(%esi,%edx),%xmm2
        movdqu  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx                // advance to next 64-byte chunk
        jnz     LUnalignedLoop          // loop until edx reaches 0

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves.  They are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr
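//
// ("Destructive" means the destination overlaps the tail of the source, e.g.
// memmove(p+1, p, n) within one buffer: a forward copy would overwrite source
// bytes before they had been read, so the copy must run from the top down.)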

LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        jz      3f
1:                                      // loop copying words, backwards
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:                                      // loop copying bytes, backwards
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for LReverseShort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx                // back up to the next 64-byte chunk
        jne     LReverseAlignedLoop     // loop until edx reaches 0

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.
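// (LDDQU was a win for unaligned loads on some earlier CPUs; on Nehalem it
// behaves identically to MOVDQU, so the plain MOVDQU form is used below.)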

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx                // back up to the next 64-byte chunk
        jne     LReverseUnalignedLoop   // loop until edx reaches 0

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


COMMPAGE_DESCRIPTOR(bcopy_sse42,_COMM_PAGE_BCOPY,kHasSSE4_2,0)
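
// The descriptor above registers this routine as the commpage bcopy/memcpy
// variant for processors that report the kHasSSE4_2 capability bit (i.e.
// Nehalem-class parts); the commpage setup code picks among the bcopy
// variants at boot based on these capability bits.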