/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Nehalem.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort	80			// too short to bother with SSE (must be >=80)


// void bcopy(const void *src, void *dst, size_t len);

	.text
	.code64
	.align	5, 0x90
Lbcopy_sse42_64:			// void bcopy(const void *src, void *dst, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rsi,%rax		// copy dest ptr
	movq	%rdi,%rsi		// exchange source and dest ptrs
	movq	%rax,%rdi
	subq	%rsi,%rax		// (dest - source)
	cmpq	%rdx,%rax		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	jbe	LShort			// no
	jmp	LNotShort

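//
// A single unsigned compare picks the copy direction: if (dest - source),
// viewed as an unsigned 64-bit value, is smaller than the length, a forward
// copy would clobber source bytes before they are read, so the reverse path
// is used.  Roughly, in C (illustrative sketch only, never assembled; the
// helper names are hypothetical):
//
//	void *copy_dispatch(void *dst, const void *src, size_t len) {
//		uintptr_t delta = (uintptr_t)dst - (uintptr_t)src;
//		if (delta < len)		// dst starts inside [src, src+len)
//			return copy_backward(dst, src, len);
//		if (len <= kShort)		// too small to set up SSE
//			return copy_forward_short(dst, src, len);
//		return copy_forward_sse(dst, src, len);
//	}
//
// Because the subtraction wraps modulo 2^64, the same test also sends the
// dst < src case down the forward path (delta is then very large).
//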
//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

	.align	5, 0x90
Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushq	%rbp			// set up a frame for backtraces
	movq	%rsp,%rbp
	movq	%rdi,%r11		// save return value here
	movq	%rdi,%rax
	subq	%rsi,%rax		// (dest - source)
	cmpq	%rdx,%rax		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	ja	LNotShort		// yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//	rdx = length (<= kShort)
//	rsi = source ptr
//	rdi = dest ptr

LShort:
	movl	%edx,%ecx		// copy length using 32-bit operation
	shrl	$2,%ecx			// get #doublewords
	jz	3f
2:					// loop copying doublewords
	movl	(%rsi),%eax
	addq	$4,%rsi
	movl	%eax,(%rdi)
	addq	$4,%rdi
	decl	%ecx
	jnz	2b
3:					// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	5f
4:					// loop copying bytes
	movb	(%rsi),%al
	incq	%rsi
	movb	%al,(%rdi)
	incq	%rdi
	decl	%edx
	jnz	4b
5:
	movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
	ret


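//
// The short forward path above is roughly the following C (illustrative
// sketch only, never assembled; the function name is hypothetical).  The
// assembly simply issues 32-bit loads and stores, relying on x86's support
// for unaligned accesses; memcpy() stands in for those here to keep the
// sketch portable C:
//
//	static void copy_forward_short(uint8_t *d, const uint8_t *s, size_t len) {
//		for (size_t n = len >> 2; n != 0; n--) {	// whole doublewords
//			uint32_t w;
//			memcpy(&w, s, 4);  s += 4;		// 32-bit load
//			memcpy(d, &w, 4);  d += 4;		// 32-bit store
//		}
//		for (size_t n = len & 3; n != 0; n--)		// 0..3 leftover bytes
//			*d++ = *s++;
//	}
//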
LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//	rdx = length (> kShort)
//	rsi = source ptr
//	rdi = dest ptr

LNotShort:
	movl	%edi,%ecx		// copy low half of destination ptr
	negl	%ecx
	andl	$15,%ecx		// get #bytes to align destination
	jz	LDestAligned		// already aligned
	subl	%ecx,%edx		// decrement length
1:					// loop copying 1..15 bytes
	movb	(%rsi),%al
	inc	%rsi
	movb	%al,(%rdi)
	inc	%rdi
	dec	%ecx
	jnz	1b


// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//	rdx = residual length (0..63)
//	rcx = -(length to move), a multiple of 64 less than 2GB
//	rsi = ptr to 1st source byte not to move (unaligned)
//	rdi = ptr to 1st dest byte not to move (aligned)

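//
// In other words, the loops below walk a negative offset up toward zero, so
// one register (rcx) serves as both index and loop counter.  Roughly, in C
// (illustrative sketch only, never assembled; "chunk" here stands for the
// multiple-of-64 part of the length):
//
//	uint8_t *send = src + chunk;		// 1st source byte not to move
//	uint8_t *dend = dst + chunk;		// 1st dest byte not to move
//	for (ptrdiff_t off = -(ptrdiff_t)chunk; off != 0; off += 64)
//		memcpy(dend + off, send + off, 64);	// done 16 bytes at a time below
//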
LDestAligned:
	movq	%rdx,%rcx		// copy length
	andl	$63,%edx		// get remaining bytes for LShort
	andq	$-64,%rcx		// get number of bytes we will copy in inner loop
	addq	%rcx,%rsi		// point to 1st byte not copied
	addq	%rcx,%rdi
	negq	%rcx			// now generate offset to 1st byte to be copied
	testl	$15,%esi		// source also aligned?
	jnz	LUnalignedLoop
	jmp	LAlignedLoop


// Forward loop for aligned operands.

	.align	4,0x90			// 16-byte align inner loops
LAlignedLoop:				// loop over 64-byte chunks
	movdqa	(%rsi,%rcx),%xmm0
	movdqa	16(%rsi,%rcx),%xmm1
	movdqa	32(%rsi,%rcx),%xmm2
	movdqa	48(%rsi,%rcx),%xmm3

	movdqa	%xmm0,(%rdi,%rcx)
	movdqa	%xmm1,16(%rdi,%rcx)
	movdqa	%xmm2,32(%rdi,%rcx)
	movdqa	%xmm3,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	LAlignedLoop

	jmp	LShort			// copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.

	.align	4,0x90			// 16-byte align inner loops
LUnalignedLoop:				// loop over 64-byte chunks
	movdqu	(%rsi,%rcx),%xmm0
	movdqu	16(%rsi,%rcx),%xmm1
	movdqu	32(%rsi,%rcx),%xmm2
	movdqu	48(%rsi,%rcx),%xmm3

	movdqa	%xmm0,(%rdi,%rcx)
	movdqa	%xmm1,16(%rdi,%rcx)
	movdqa	%xmm2,32(%rdi,%rcx)
	movdqa	%xmm3,48(%rdi,%rcx)

	addq	$64,%rcx
	jnz	LUnalignedLoop

	jmp	LShort			// copy remaining 0..63 bytes and done


// Reverse moves.  These are only used with destructive overlap.
//	rdx = length
//	rsi = source ptr
//	rdi = dest ptr

LReverse:
	addq	%rdx,%rsi		// point to end of strings
	addq	%rdx,%rdi
	cmpq	$(kShort),%rdx		// long enough to bother with SSE?
	ja	LReverseNotShort	// yes

// Handle reverse short copies.
//	edx = length (<= kShort)
//	rsi = one byte past end of source
//	rdi = one byte past end of dest

LReverseShort:
	movl	%edx,%ecx		// copy length
	shrl	$3,%ecx			// #quadwords
	jz	3f
1:
	subq	$8,%rsi
	movq	(%rsi),%rax
	subq	$8,%rdi
	movq	%rax,(%rdi)
	decl	%ecx
	jnz	1b
3:
	andl	$7,%edx			// bytes?
	jz	5f
4:
	decq	%rsi
	movb	(%rsi),%al
	decq	%rdi
	movb	%al,(%rdi)
	decl	%edx
	jnz	4b
5:
	movq	%r11,%rax		// get return value (dst ptr) for memcpy/memmove
	popq	%rbp
	ret

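//
// Back-to-front copying is what makes the destructive-overlap case safe.  The
// reverse short path above is roughly the following C (illustrative sketch
// only, never assembled; the function name is hypothetical, and memcpy()
// stands in for the plain 64-bit load/store).  Note that LReverse has already
// advanced both pointers to one byte past the ends:
//
//	static void copy_backward_short(uint8_t *dend, const uint8_t *send, size_t len) {
//		for (size_t n = len >> 3; n != 0; n--) {	// whole quadwords
//			uint64_t w;
//			send -= 8;  dend -= 8;
//			memcpy(&w, send, 8);			// 64-bit load
//			memcpy(dend, &w, 8);			// 64-bit store
//		}
//		for (size_t n = len & 7; n != 0; n--)		// 0..7 leftover bytes
//			*--dend = *--send;
//	}
//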
// Handle a reverse move long enough to justify using SSE.
//	rdx = length (> kShort)
//	rsi = one byte past end of source
//	rdi = one byte past end of dest

LReverseNotShort:
	movl	%edi,%ecx		// copy destination
	andl	$15,%ecx		// get #bytes to align destination
	jz	LReverseDestAligned	// already aligned
	subq	%rcx,%rdx		// adjust length
1:					// loop copying 1..15 bytes
	decq	%rsi
	movb	(%rsi),%al
	decq	%rdi
	movb	%al,(%rdi)
	decl	%ecx
	jnz	1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
	movq	%rdx,%rcx		// copy length
	andl	$63,%edx		// get remaining bytes for LReverseShort
	andq	$-64,%rcx		// get number of bytes we will copy in inner loop
	subq	%rcx,%rsi		// point to endpoint of copy
	subq	%rcx,%rdi
	testl	$15,%esi		// is source aligned too?
	jnz	LReverseUnalignedLoop	// no

LReverseAlignedLoop:			// loop over 64-byte chunks
	movdqa	-16(%rsi,%rcx),%xmm0
	movdqa	-32(%rsi,%rcx),%xmm1
	movdqa	-48(%rsi,%rcx),%xmm2
	movdqa	-64(%rsi,%rcx),%xmm3

	movdqa	%xmm0,-16(%rdi,%rcx)
	movdqa	%xmm1,-32(%rdi,%rcx)
	movdqa	%xmm2,-48(%rdi,%rcx)
	movdqa	%xmm3,-64(%rdi,%rcx)

	subq	$64,%rcx
	jne	LReverseAlignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:			// loop over 64-byte chunks
	movdqu	-16(%rsi,%rcx),%xmm0
	movdqu	-32(%rsi,%rcx),%xmm1
	movdqu	-48(%rsi,%rcx),%xmm2
	movdqu	-64(%rsi,%rcx),%xmm3

	movdqa	%xmm0,-16(%rdi,%rcx)
	movdqa	%xmm1,-32(%rdi,%rcx)
	movdqa	%xmm2,-48(%rdi,%rcx)
	movdqa	%xmm3,-64(%rdi,%rcx)

	subq	$64,%rcx
	jne	LReverseUnalignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done

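// The descriptor below publishes this routine to the commpage: it fills the
// _COMM_PAGE_BCOPY slot and is selected only on processors whose capability
// bits include kHasSSE4_2 (the final 0 indicates there are no capability bits
// that must be absent).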
	COMMPAGE_DESCRIPTOR(bcopy_sse42_64,_COMM_PAGE_BCOPY,kHasSSE4_2,0)