/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include "platfunc.h"

/*
 * The bcopy/memcpy loops, tuned for Nehalem.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort  80                      // too short to bother with SSE (must be >=80)

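/*
 * Rough C-level sketch of the strategy below (illustrative only; the helper
 * names are placeholders, not symbols defined in this file):
 *
 *      if ((size_t)(dst - src) < len)          // forward copy would overwrite src
 *              copy_backward(dst, src, len);   // see LReverse
 *      else if (len <= kShort)
 *              copy_forward_bytewise(dst, src, len);   // see LShort
 *      else
 *              copy_forward_sse(dst, src, len);        // 64 bytes per iteration
 */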

// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START(bcopy, sse42, 64, 5)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // exchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//

PLATFUNC_FUNCTION_START(memcpy, sse42, 64, 0)   // void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START(memmove, sse42, 64, 0)  // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

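// The single unsigned compare above folds both overlap tests into one: when
// dst is below src, (dest - source) wraps around to a huge unsigned value that
// is never smaller than the length, so the copy proceeds forward (safe for
// that layout); when dst is above src, the difference is smaller than the
// length exactly when a forward copy would overwrite source bytes that have
// not been read yet.  Example: src=0x1000, dst=0x1010, len=0x20 gives
// (dest - source)=0x10 < 0x20, so the move is done in reverse.
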
// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

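// For example, len=13 copies three doublewords (12 bytes) in the first loop
// below, then one leftover byte in the second.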
LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      3f
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
3:                                      // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

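// The negl/andl pair below computes (-dst) & 15, i.e. the number of bytes from
// dst up to the next 16-byte boundary (0 if already aligned).  For example, a
// dst ending in ...9 yields 7 bytes of byte-at-a-time copying before the SSE
// loop takes over.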
LNotShort:
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%rsi),%al
        inc     %rsi
        movb    %al,(%rdi)
        inc     %rdi
        dec     %ecx
        jnz     1b


// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

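/*
 * The inner loops below use a negative index that counts up toward zero, so a
 * single addq both advances the offset and terminates the loop.  Roughly, in C
 * terms (names are illustrative, not defined in this file):
 *
 *      for (offset = -chunk_bytes; offset != 0; offset += 64)
 *              copy 64 bytes from (src_end + offset) to (dst_end + offset);
 *
 * where src_end/dst_end already point just past the last chunk.
 */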
LDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        negq    %rcx                    // now generate offset to 1st byte to be copied
        testl   $15,%esi                // source also aligned?
        jnz     LUnalignedLoop
        jmp     LAlignedLoop


// Forward loop for aligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LAlignedLoop:                           // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LAlignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LUnalignedLoop:                         // loop over 64-byte chunks
        movdqu  (%rsi,%rcx),%xmm0
        movdqu  16(%rsi,%rcx),%xmm1
        movdqu  32(%rsi,%rcx),%xmm2
        movdqu  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LUnalignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr

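// Both pointers are first advanced to one byte past the end of their buffers,
// and the copy then walks downward so that overlapping source bytes are read
// before the destination overwrites them.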
LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

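// Note the alignment count here is (rdi & 15), not (-rdi) & 15 as in the
// forward path: the destination pointer moves downward, so rdi & 15 is the
// number of trailing bytes to peel off before it lands on a 16-byte boundary.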
LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        jz      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


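// The descriptors below register these sse42 variants with the platfunc
// dispatch mechanism (see platfunc.h); the kHasSSE4_2 capability bit from
// <machine/cpu_capabilities.h> gates their selection at runtime.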
PLATFUNC_DESCRIPTOR(bcopy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memcpy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memmove,sse42,kHasSSE4_2,0)