apple/xnu (xnu-1504.3.12): osfmk/i386/commpage/bcopy_sse42_64.s
/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Nehalem. This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort  80                      // too short to bother with SSE (must be >=80)

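// A rough C sketch of the dispatch implemented below (illustrative only; the
// helper name is hypothetical and nothing here is assembled).  The single
// unsigned compare relies on wraparound: (dest - source) < length exactly when
// the destination starts inside [source, source + length), assuming the source
// buffer does not wrap the address space.
//
//      static void *copy_dispatch(void *dst, const void *src, size_t len)
//      {
//          if ((uintptr_t)dst - (uintptr_t)src < len) {
//              /* destructive overlap: copy backwards (LReverse) */
//          } else if (len <= kShort) {
//              /* short forward copy, plain loads/stores (LShort) */
//          } else {
//              /* 16-byte-align the destination, then 64-byte SSE chunks (LNotShort) */
//          }
//          return dst;                 /* memcpy/memmove return their dst argument */
//      }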

// void bcopy(const void *src, void *dst, size_t len);

COMMPAGE_FUNCTION_START(bcopy_sse42_64, 64, 5)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // exchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

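// The bcopy() entry above just rotates its (src, dst, len) argument order into
// the (dst, src, len) order the shared code expects and falls into the same
// forward/reverse dispatch.  Roughly, as an illustrative C equivalent:
//
//      void bcopy(const void *src, void *dst, size_t len)
//      {
//          memmove(dst, src, len);
//      }
//
// Note that only the memcpy/memmove entry below saves the destination in %r11;
// bcopy returns void, so the %rax loaded from %r11 in the shared epilogue is
// simply ignored by bcopy's callers.
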
//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

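// As a rough C sketch of this path (illustrative only; d and s stand for the
// dest and source byte pointers, and the casts gloss over C alignment/aliasing
// rules that do not constrain the assembly):
//
//      size_t n = len >> 2;            // whole 32-bit words first
//      while (n--) { *(uint32_t *)d = *(const uint32_t *)s; d += 4; s += 4; }
//      len &= 3;                       // then the 0..3 leftover bytes
//      while (len--) *d++ = *s++;
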
LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      3f
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
3:                                      // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

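// The alignment count computed below is, in C terms (illustrative only; d and s
// are the dest and source byte pointers):
//
//      size_t head = (size_t)(-(uintptr_t)d) & 15;     // 0..15 bytes to the next 16-byte boundary
//      len -= head;
//      while (head--) *d++ = *s++;                     // byte-copy up to the boundary
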
LNotShort:
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%rsi),%al
        inc     %rsi
        movb    %al,(%rdi)
        inc     %rdi
        dec     %ecx
        jnz     1b


// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

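// The negative-index trick below lets one add both advance the offset and set
// the flags that end the loop.  A rough C sketch (illustrative only):
//
//      size_t chunk = len & ~(size_t)63;       // bytes for the 64-byte inner loop
//      len &= 63;                              // residual, finished by LShort
//      s += chunk;  d += chunk;                // point just past the chunked region
//      for (ptrdiff_t i = -(ptrdiff_t)chunk; i != 0; i += 64)
//          memcpy(d + i, s + i, 64);           // four 16-byte SSE moves in the real loop
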
LDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        negq    %rcx                    // now generate offset to 1st byte to be copied
        testl   $15,%esi                // source also aligned?
        jnz     LUnalignedLoop
        jmp     LAlignedLoop


// Forward loop for aligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LAlignedLoop:                           // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LAlignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LUnalignedLoop:                         // loop over 64-byte chunks
        movdqu  (%rsi,%rcx),%xmm0
        movdqu  16(%rsi,%rcx),%xmm1
        movdqu  32(%rsi,%rcx),%xmm2
        movdqu  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     LUnalignedLoop

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr

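// Because this path is taken only when the destination starts inside the source
// buffer, copying from the high addresses downward never overwrites source
// bytes that have not been read yet.  A rough C sketch of the setup
// (illustrative only):
//
//      s += len;  d += len;            // both pointers now one byte past the end
//      if (len > kShort)  { /* descending 64-byte SSE loop (LReverseNotShort) */ }
//      else               { /* descending quadword/byte loop (LReverseShort)  */ }
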
LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

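// Roughly, in C (illustrative only; the casts again gloss over C alignment
// rules that do not constrain the assembly):
//
//      size_t n = len >> 3;                    // whole 64-bit words, highest first
//      while (n--) { s -= 8; d -= 8; *(uint64_t *)d = *(const uint64_t *)s; }
//      len &= 7;                               // then 0..7 remaining bytes
//      while (len--) *--d = *--s;
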
LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

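// Here the destination is aligned from the top: the low four bits of the end
// pointer give the bytes sitting above the previous 16-byte boundary.  A rough
// C sketch (illustrative only):
//
//      size_t tail = (uintptr_t)d & 15;        // bytes above the previous 16-byte boundary
//      len -= tail;
//      while (tail--) *--d = *--s;             // after this, d is 16-byte aligned
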
LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        jz      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


COMMPAGE_DESCRIPTOR(bcopy_sse42_64,_COMM_PAGE_BCOPY,kHasSSE4_2,0)