/*
 * Copyright (c) 2008 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <platfunc.h>

/*
 * The bcopy/memcpy loops, tuned for Nehalem.
 *
 * The following #defines are tightly coupled to the microarchitecture:
 */

#define kShort  80                      // too short to bother with SSE (must be >= 80)
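// (kShort must be >= 80 so that a "long" copy, after spending up to 15 bytes
//  aligning the destination, is still left with at least one full 64-byte chunk
//  for the SSE loops, which assume a non-zero chunk count)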


// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START(bcopy, sse42, 32, 5)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
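        // A single unsigned compare handles the overlap test: if dest is below
        // source, (dest - source) wraps to a huge unsigned value, so the "jb"
        // below falls through to the forward path; the reverse path is taken
        // only when dest lands within the first "length" bytes of the source.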
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort
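        // (an explicit jump is needed here: the memcpy/memmove entry point follows
        //  immediately, so bcopy cannot simply fall through to the LNotShort code)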

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//

PLATFUNC_FUNCTION_START(memcpy, sse42, 32, 0)   // void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START(memmove, sse42, 32, 0)  // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies. As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      3f
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
3:                                      // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      Lexit
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
Lexit:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
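        // (bcopy exits through here as well; since bcopy returns void, the value
        //  loaded into %eax, which is bcopy's source argument, is simply ignored)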
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        movl    %edi,%edx               // copy destination
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned. Nehalem does a great job with unaligned SSE loads,
// so we use MOVDQU rather than aligned loads and shifts. Since kShort >= 80, we
// know there is at least one 64-byte chunk to move.
// When we enter the copy loops, the following registers are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)
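// In the loops below, %edx walks from -(the 64-byte-chunked length) up to 0, so
// (%esi,%edx) starts at the first byte to copy and the same register serves as
// both the offset and the loop counter.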

LDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        testl   $15,%esi                // source also aligned?
        jnz     LUnalignedLoop
        jmp     LAlignedLoop


// Forward loop for aligned operands.

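// (with Darwin's assembler, .align takes a power of two: 4 means a 16-byte
//  boundary, padded with 0x90 NOP bytes)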
        .align  4,0x90                  // 16-byte align inner loops
LAlignedLoop:                           // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     LAlignedLoop

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for unaligned operands.

        .align  4,0x90                  // 16-byte align inner loops
LUnalignedLoop:                         // loop over 64-byte chunks
        movdqu  (%esi,%edx),%xmm0
        movdqu  16(%esi,%edx),%xmm1
        movdqu  32(%esi,%edx),%xmm2
        movdqu  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     LUnalignedLoop

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves. They are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      3f
1:                                      // loop copying doublewords
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:                                      // handle leftover bytes (0..3)
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for LReverseShort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no
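        // In both reverse loops, %edx starts at the chunked byte count and counts
        // down to 0, so (%esi,%edx) and (%edi,%edx) begin at the old end pointers
        // and the negative displacements walk each 64-byte chunk backwards.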

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


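// Register these variants with the platfunc dispatcher: each descriptor appears
// to name the capability bits that must be set (kHasSSE4_2) and those that must
// be clear (none), so this code is selected only on CPUs reporting SSE4.2.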
PLATFUNC_DESCRIPTOR(bcopy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memcpy,sse42,kHasSSE4_2,0)
PLATFUNC_DESCRIPTOR(memmove,sse42,kHasSSE4_2,0)