/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Nehalem.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort	80			// too short to bother with SSE (must be >=80)
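
//
// Why kShort must be at least 80: the long-copy paths below first align the
// destination by copying up to 15 bytes one at a time.  With a length
// greater than kShort that leaves at least 81 - 15 = 66 bytes, so the
// "andl $-64" in LDestAligned (and LReverseDestAligned) is never zero and
// the 64-byte inner loops always execute at least once.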


// void bcopy(const void *src, void *dst, size_t len);

COMMPAGE_FUNCTION_START(bcopy_sse42, 32, 5)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	movl	8(%ebp),%esi		// get source ptr
	movl	12(%ebp),%edi		// get dest ptr
	movl	16(%ebp),%ecx		// get length
	movl	%edi,%edx
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	jbe	Lshort			// no
	jmp	LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//
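// (The ".align 5, 0x90" below pads to the next 32-byte boundary, 2^5 = 32,
// so the bcopy entry and its dispatch code above have to fit in 32 bytes.
// User code appears to reach memcpy/memmove through a fixed commpage
// address 32 bytes past the bcopy slot, which is why neither entry point
// can move.)
//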

	.align	5, 0x90
Lmemcpy:				// void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:				// void *memmove(void *dst, const void *src, size_t len)
	pushl	%ebp			// set up a frame for backtraces
	movl	%esp,%ebp
	pushl	%esi
	pushl	%edi
	movl	8(%ebp),%edi		// get dest ptr
	movl	12(%ebp),%esi		// get source ptr
	movl	16(%ebp),%ecx		// get length
	movl	%edi,%edx
	subl	%esi,%edx		// (dest - source)
	cmpl	%ecx,%edx		// must move in reverse if (dest - source) < length
	jb	LReverseIsland
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	ja	LNotShort		// yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//	ecx = length (<= kShort)
//	esi = source ptr
//	edi = dest ptr
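//
// The split is simple: length/4 doublewords, then length&3 trailing bytes.
// A 10-byte copy, for instance, does two iterations of loop 2 below and two
// of loop 4.  The same code also finishes off the 0..63 residual bytes left
// when the SSE loops jump back to Lshort.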

Lshort:
	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// get #doublewords
	jz	3f
2:					// loop copying doublewords
	movl	(%esi),%eax
	addl	$4,%esi
	movl	%eax,(%edi)
	addl	$4,%edi
	dec	%ecx
	jnz	2b
3:					// handle leftover bytes (0..3) in last word
	andl	$3,%edx			// any leftover bytes?
	jz	Lexit
4:					// loop copying bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	4b
Lexit:
	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
	popl	%edi
	popl	%esi
	popl	%ebp
	ret


LReverseIsland:				// keep the "jb" above a short branch...
	jmp	LReverse		// ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//	ecx = length (> kShort)
//	esi = source ptr
//	edi = dest ptr

LNotShort:
	movl	%edi,%edx		// copy destination
	negl	%edx
	andl	$15,%edx		// get #bytes to align destination
	jz	LDestAligned		// already aligned
	subl	%edx,%ecx		// decrement length
1:					// loop copying 1..15 bytes
	movb	(%esi),%al
	inc	%esi
	movb	%al,(%edi)
	inc	%edi
	dec	%edx
	jnz	1b

// Destination is now aligned.  Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts.  Since kShort>=80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//	ecx = residual length (0..63)
//	edx = -(length to move), a multiple of 64
//	esi = ptr to 1st source byte not to move (unaligned)
//	edi = ptr to 1st dest byte not to move (aligned)
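//
// Worked example: with 200 bytes left after aligning the destination,
// edx = 200 & -64 = 192 and ecx = 200 & 63 = 8; esi/edi are advanced by 192
// and edx is negated to -192, so the loops below index upward from -192 to 0
// (three 64-byte chunks) and then fall into Lshort for the last 8 bytes.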

LDestAligned:
	movl	%ecx,%edx		// copy length
	andl	$63,%ecx		// get remaining bytes for Lshort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	addl	%edx,%esi		// point to 1st byte not copied
	addl	%edx,%edi
	negl	%edx			// now generate offset to 1st byte to be copied
	testl	$15,%esi		// source also aligned?
	jnz	LUnalignedLoop
	jmp	LAlignedLoop


// Forward loop for aligned operands.
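//
// Both loops use a negative index that counts up toward zero: esi/edi point
// just past the 64-byte chunks and edx runs from -(chunk bytes) to 0 in
// steps of 64, so the single "addl $64,%edx" both advances the index and
// sets the zero flag that ends the loop; no separate compare is needed.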

	.align	4,0x90			// 16-byte align inner loops
LAlignedLoop:				// loop over 64-byte chunks
	movdqa	(%esi,%edx),%xmm0
	movdqa	16(%esi,%edx),%xmm1
	movdqa	32(%esi,%edx),%xmm2
	movdqa	48(%esi,%edx),%xmm3

	movdqa	%xmm0,(%edi,%edx)
	movdqa	%xmm1,16(%edi,%edx)
	movdqa	%xmm2,32(%edi,%edx)
	movdqa	%xmm3,48(%edi,%edx)

	addl	$64,%edx
	jnz	LAlignedLoop

	jmp	Lshort			// copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.
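//
// Only the loads differ from the aligned loop: the destination was 16-byte
// aligned above, so the stores can stay MOVDQA; only the source may be
// misaligned, and Nehalem handles the MOVDQU loads with little penalty.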

	.align	4,0x90			// 16-byte align inner loops
LUnalignedLoop:				// loop over 64-byte chunks
	movdqu	(%esi,%edx),%xmm0
	movdqu	16(%esi,%edx),%xmm1
	movdqu	32(%esi,%edx),%xmm2
	movdqu	48(%esi,%edx),%xmm3

	movdqa	%xmm0,(%edi,%edx)
	movdqa	%xmm1,16(%edi,%edx)
	movdqa	%xmm2,32(%edi,%edx)
	movdqa	%xmm3,48(%edi,%edx)

	addl	$64,%edx
	jnz	LUnalignedLoop

	jmp	Lshort			// copy remaining 0..63 bytes and done


// Reverse moves.  They are only used with destructive overlap.
//	ecx = length
//	esi = source ptr
//	edi = dest ptr
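//
// The dispatch above treats (dest - source) as unsigned: when dest lies
// inside [source, source+length) the difference is below the length, so a
// forward copy would overwrite source bytes before they are read and we
// come here instead.  For example, source=0x1000, dest=0x1008, length=16
// gives a difference of 8 < 16.  When dest is below source the subtraction
// wraps to a huge unsigned value and the (always safe) forward path is used.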

LReverse:
	addl	%ecx,%esi		// point to end of strings
	addl	%ecx,%edi
	cmpl	$(kShort),%ecx		// long enough to bother with SSE?
	ja	LReverseNotShort	// yes

// Handle reverse short copies.
//	ecx = length
//	esi = one byte past end of source
//	edi = one byte past end of dest

LReverseShort:
	movl	%ecx,%edx		// copy length
	shrl	$2,%ecx			// get #doublewords
	jz	3f
1:
	subl	$4,%esi
	movl	(%esi),%eax
	subl	$4,%edi
	movl	%eax,(%edi)
	dec	%ecx
	jnz	1b
3:
	andl	$3,%edx			// any leftover bytes?
	jz	5f
4:
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	4b
5:
	movl	8(%ebp),%eax		// get return value (dst ptr) for memcpy/memmove
	popl	%edi
	popl	%esi
	popl	%ebp
	ret

// Handle a reverse move long enough to justify using SSE.
//	ecx = length
//	esi = one byte past end of source
//	edi = one byte past end of dest
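//
// "Aligning the destination" here means aligning its end pointer downward:
// edi & 15 is the number of bytes above the last 16-byte boundary, and those
// are copied one at a time first so that the MOVDQA stores in the loops
// below always land on 16-byte boundaries.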

LReverseNotShort:
	movl	%edi,%edx		// copy destination
	andl	$15,%edx		// get #bytes to align destination
	je	LReverseDestAligned	// already aligned
	subl	%edx,%ecx		// adjust length
1:					// loop copying 1..15 bytes
	dec	%esi
	movb	(%esi),%al
	dec	%edi
	movb	%al,(%edi)
	dec	%edx
	jnz	1b

// Destination is now aligned.  Prepare for reverse loops.
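//
// Below, edx becomes the number of bytes the inner loops will move (a
// multiple of 64) and esi/edi are pulled back by that amount to the low end
// of the chunked region, so the -16..-64 displacements walk down through
// each chunk while edx counts down to zero.  With 200 bytes left, for
// instance, edx = 192 and ecx = 8: the loop runs three times and
// LReverseShort finishes the last 8 bytes.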

LReverseDestAligned:
	movl	%ecx,%edx		// copy length
	andl	$63,%ecx		// get remaining bytes for LReverseShort
	andl	$-64,%edx		// get number of bytes we will copy in inner loop
	subl	%edx,%esi		// point to endpoint of copy
	subl	%edx,%edi
	testl	$15,%esi		// is source aligned too?
	jnz	LReverseUnalignedLoop	// no

LReverseAlignedLoop:			// loop over 64-byte chunks
	movdqa	-16(%esi,%edx),%xmm0
	movdqa	-32(%esi,%edx),%xmm1
	movdqa	-48(%esi,%edx),%xmm2
	movdqa	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	subl	$64,%edx
	jne	LReverseAlignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.
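//
// LDDQU was introduced with SSE3 to soften the cache-line-split penalty of
// unaligned loads, but on Nehalem a plain MOVDQU is reportedly just as fast,
// so the loop below simply uses MOVDQU for the loads.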

LReverseUnalignedLoop:			// loop over 64-byte chunks
	movdqu	-16(%esi,%edx),%xmm0
	movdqu	-32(%esi,%edx),%xmm1
	movdqu	-48(%esi,%edx),%xmm2
	movdqu	-64(%esi,%edx),%xmm3

	movdqa	%xmm0,-16(%edi,%edx)
	movdqa	%xmm1,-32(%edi,%edx)
	movdqa	%xmm2,-48(%edi,%edx)
	movdqa	%xmm3,-64(%edi,%edx)

	subl	$64,%edx
	jne	LReverseUnalignedLoop

	jmp	LReverseShort		// copy remaining 0..63 bytes and done


COMMPAGE_DESCRIPTOR(bcopy_sse42,_COMM_PAGE_BCOPY,kHasSSE4_2,0)
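
//
// The descriptor above ties this routine to the commpage bcopy slot and
// marks it as usable only when the CPU advertises SSE4.2 (kHasSSE4_2); at
// boot the kernel picks, from the bcopy variants it was built with, the best
// one the hardware supports and installs it at _COMM_PAGE_BCOPY, with
// memcpy/memmove entered 32 bytes later.  A minimal user-space sketch of a
// call through that slot (hypothetical, for illustration only; the supported
// interface is the libc bcopy/memcpy/memmove stubs, and the +32 offset is
// assumed from the layout note near Lmemcpy):
//
//	#include <machine/cpu_capabilities.h>
//	#include <stddef.h>
//	#include <stdint.h>
//
//	typedef void *(*memcpy_fn)(void *, const void *, size_t);
//
//	static void copy_via_commpage(void *dst, const void *src, size_t len)
//	{
//		/* memcpy entry assumed to sit 32 bytes past the bcopy slot */
//		memcpy_fn f = (memcpy_fn)(uintptr_t)(_COMM_PAGE_BCOPY + 32);
//		f(dst, src, len);	/* returns dst, ignored here */
//	}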