/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <platfunc.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80                  // too short to bother with SSE (must be >= 80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)      // cutoff for microcode fastpath for "rep/movsl"

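/*
 * For orientation, the forward-copy dispatch below boils down to a few
 * length and overlap tests.  A minimal C sketch (the enum and function name
 * are illustrative only, not part of this file):
 *
 *      enum copy_path { PATH_REVERSE, PATH_SHORT, PATH_VERYLONG, PATH_SSE };
 *
 *      static enum copy_path pick_path(char *dst, const char *src, unsigned long len) {
 *          if ((unsigned long)(dst - src) < len) return PATH_REVERSE;  // destructive overlap
 *          if (len <= kShort)                    return PATH_SHORT;    // 4-byte + 1-byte loops
 *          if (len >= kVeryLong)                 return PATH_VERYLONG; // _longcopy, non-temporal
 *          return PATH_SSE;                      // align dst to 16, then one of 16 SSE loops;
 *                                                // LMod0/LMod8 use "rep/movsl" at kFastUCode
 *      }
 */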
// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START(bcopy, sse3x, 32, 5)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        pushl   %ebx
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//

PLATFUNC_FUNCTION_START(memcpy, sse3x, 32, 0)   // void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START(memmove, sse3x, 32, 0)  // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        pushl   %ebx
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      Lexit
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
Lexit:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %ebx
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

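/*
 * In C terms, the Lshort/LLeftovers path above is roughly the following
 * (sketch only; the helper name is illustrative, and as in the assembly the
 * 4-byte accesses may be unaligned, which x86 permits):
 *
 *      static void copy_short(char *dst, const char *src, unsigned long len) {
 *          unsigned long words = len >> 2;
 *          while (words--) {                               // doubleword loop ("2:" above)
 *              *(unsigned int *)dst = *(const unsigned int *)src;
 *              src += 4;  dst += 4;
 *          }
 *          len &= 3;                                       // 0..3 leftover bytes (LLeftovers)
 *          while (len--)
 *              *dst++ = *src++;
 *      }
 */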

LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort >= 80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        call    1f                      // get our runtime address for the table lookup
1:
        popl    %ebx
        movl    (LTable-1b)(%ebx,%eax,4), %eax  // load jump table entry (an offset relative to LTable)
        leal    (LTable-1b)(%ebx,%eax,1), %eax  // convert the offset into an absolute address
        jmp     *%eax

        .align  2
LTable:                                 // table of copy loop addresses
        .long   LMod0  - LTable
        .long   LMod1  - LTable
        .long   LMod2  - LTable
        .long   LMod3  - LTable
        .long   LMod4  - LTable
        .long   LMod5  - LTable
        .long   LMod6  - LTable
        .long   LMod7  - LTable
        .long   LMod8  - LTable
        .long   LMod9  - LTable
        .long   LMod10 - LTable
        .long   LMod11 - LTable
        .long   LMod12 - LTable
        .long   LMod13 - LTable
        .long   LMod14 - LTable
        .long   LMod15 - LTable

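/*
 * The dispatch above is position independent: each table entry is a loop's
 * offset from LTable, the call/pop pair recovers the table's runtime address,
 * and the leal turns the offset back into an absolute address.  The selection
 * itself is simply an index on the low four bits of the source address; a C
 * sketch (the typedef and array name are illustrative only):
 *
 *      typedef void (*chunk_loop)(char *dst, const char *src, unsigned long chunked);
 *      extern chunk_loop loop_for_alignment[16];           // one loop per (src & 15)
 *
 *      static void copy_chunks(char *dst, const char *src, unsigned long len) {
 *          unsigned long chunked  = len & ~63UL;           // moved by the inner loop
 *          unsigned long residual = len & 63UL;            // 0..63 bytes left for Lshort
 *          loop_for_alignment[(unsigned long)src & 15](dst, src, chunked);
 *          (void)residual;                                 // finished by the Lshort code
 *      }
 */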

// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmarks.  There isn't enough room for them in the
// platfunc area reserved for bcopy, so we put them elsewhere.  We call the
// longcopy routine using the normal ABI.

LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        call    _longcopy
        addl    $12,%esp                // pop off our parameters
        jmp     Lexit

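/*
 * _longcopy is defined elsewhere.  The usual technique for operands this large
 * is non-temporal stores, which avoid displacing useful cache lines; a minimal
 * illustrative sketch (not the actual _longcopy implementation; dst, src and
 * len are assumed to be multiples of 16 here):
 *
 *      #include <emmintrin.h>
 *      static void stream_copy_aligned(char *dst, const char *src, unsigned long len) {
 *          for (unsigned long i = 0; i < len; i += 16) {
 *              __m128i v = _mm_load_si128((const __m128i *)(src + i));
 *              _mm_stream_si128((__m128i *)(dst + i), v);  // non-temporal store
 *          }
 *          _mm_sfence();                                   // order the streaming stores
 *      }
 */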

// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes

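/*
 * The same string-move microcode can be reached from C via inline assembly.
 * A hedged sketch (the helper name is illustrative; it moves whole doublewords
 * only, so the 0..3 tail bytes must be finished separately, as LLeftovers does):
 *
 *      static void copy_words_rep_movsl(void *dst, const void *src, unsigned long nbytes) {
 *          unsigned long nwords = nbytes >> 2;
 *          __asm__ volatile ("cld\n\trep movsl"
 *                            : "+S" (src), "+D" (dst), "+c" (nwords)
 *                            :
 *                            : "memory", "cc");
 *      }
 */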

// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done

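/*
 * LMod0 is the mutually 16-byte aligned case, so the inner loop needs no
 * repacking at all.  In intrinsics, one 64-byte chunk is just four aligned
 * loads and four aligned stores (sketch only; dst and src 16-byte aligned,
 * len a multiple of 64):
 *
 *      #include <emmintrin.h>
 *      static void copy_chunks_mod0(char *dst, const char *src, unsigned long len) {
 *          for (unsigned long i = 0; i < len; i += 64) {
 *              __m128i a = _mm_load_si128((const __m128i *)(src + i));
 *              __m128i b = _mm_load_si128((const __m128i *)(src + i + 16));
 *              __m128i c = _mm_load_si128((const __m128i *)(src + i + 32));
 *              __m128i d = _mm_load_si128((const __m128i *)(src + i + 48));
 *              _mm_store_si128((__m128i *)(dst + i),      a);
 *              _mm_store_si128((__m128i *)(dst + i + 16), b);
 *              _mm_store_si128((__m128i *)(dst + i + 32), c);
 *              _mm_store_si128((__m128i *)(dst + i + 48), d);
 *          }
 *      }
 */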

// Forward loop for medium length operands in which low four bits of %esi == 0001

LMod1:
        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done

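/*
 * LMod1 through LMod15 all use the same SSSE3 trick: do only aligned loads,
 * then use palignr to splice each pair of adjacent vectors back into the 16
 * bytes the caller actually asked for.  An intrinsics sketch of the 1-byte
 * case (function name illustrative; dst 16-byte aligned, src 1 byte past
 * alignment, len a multiple of 64; like the assembly, the aligned loads may
 * touch a few bytes outside [src, src+len), but never cross into another
 * page because they are 16-byte aligned):
 *
 *      #include <tmmintrin.h>
 *      static void copy_chunks_mod1(char *dst, const char *src, unsigned long len) {
 *          __m128i prev = _mm_load_si128((const __m128i *)(src - 1));
 *          for (unsigned long i = 0; i < len; i += 64) {
 *              __m128i a = _mm_load_si128((const __m128i *)(src - 1 + i + 16));
 *              __m128i b = _mm_load_si128((const __m128i *)(src - 1 + i + 32));
 *              __m128i c = _mm_load_si128((const __m128i *)(src - 1 + i + 48));
 *              __m128i d = _mm_load_si128((const __m128i *)(src - 1 + i + 64));
 *              // each palignr rebuilds 16 true source bytes from two aligned loads
 *              _mm_store_si128((__m128i *)(dst + i),      _mm_alignr_epi8(a, prev, 1));
 *              _mm_store_si128((__m128i *)(dst + i + 16), _mm_alignr_epi8(b, a, 1));
 *              _mm_store_si128((__m128i *)(dst + i + 32), _mm_alignr_epi8(c, b, 1));
 *              _mm_store_si128((__m128i *)(dst + i + 48), _mm_alignr_epi8(d, c, 1));
 *              prev = d;
 *          }
 *      }
 */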

// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

LMod4:
        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done

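/*
 * LMod4 (and LMod12) work in the float domain: they merge the low doubleword
 * of the next aligned vector with movss and rotate it into place with pshufd.
 * An intrinsics sketch of the idea, one 16-byte vector per iteration (function
 * name illustrative; dst 16-byte aligned, src 4 bytes past alignment, len a
 * multiple of 16):
 *
 *      #include <xmmintrin.h>
 *      static void copy_chunks_mod4(char *dst, const char *src, unsigned long len) {
 *          __m128 prev = _mm_load_ps((const float *)(src - 4));    // aligned, straddles src
 *          for (unsigned long i = 0; i < len; i += 16) {
 *              __m128 next = _mm_load_ps((const float *)(src - 4 + i + 16));
 *              __m128 v = _mm_move_ss(prev, next);                 // low 4 bytes <- next
 *              v = _mm_shuffle_ps(v, v, 0x39);                     // rotate right 4 bytes
 *              _mm_store_ps((float *)(dst + i), v);                // 16 true source bytes
 *              prev = next;
 *          }
 *      }
 */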

// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

LMod8:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done

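/*
 * LMod8 works in the double domain: with the source 8 bytes off alignment,
 * one shufpd per vector glues the high half of the previous aligned load to
 * the low half of the next one.  An intrinsics sketch, one 16-byte vector per
 * iteration (function name illustrative; dst 16-byte aligned, src 8 bytes past
 * alignment, len a multiple of 16):
 *
 *      #include <emmintrin.h>
 *      static void copy_chunks_mod8(char *dst, const char *src, unsigned long len) {
 *          __m128d prev = _mm_load_pd((const double *)(src - 8));  // aligned, straddles src
 *          for (unsigned long i = 0; i < len; i += 16) {
 *              __m128d next = _mm_load_pd((const double *)(src - 8 + i + 16));
 *              __m128d v = _mm_shuffle_pd(prev, next, 1);          // { high(prev), low(next) }
 *              _mm_store_pd((double *)(dst + i), v);               // 16 true source bytes
 *              prev = next;
 *          }
 *      }
 */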

// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

LMod12:
        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        jz      3f
1:
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %ebx
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

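/*
 * In C terms, the reverse short copy above walks both pointers down from one
 * past the end, doublewords first and then the 0..3 leftover bytes, so an
 * overlapping region is never clobbered before it is read (sketch only; the
 * helper name is illustrative):
 *
 *      static void copy_backward(char *dst_end, const char *src_end, unsigned long len) {
 *          unsigned long words = len >> 2;
 *          while (words--) {                               // 4 bytes at a time, high to low
 *              src_end -= 4;  dst_end -= 4;
 *              *(unsigned int *)dst_end = *(const unsigned int *)src_end;
 *          }
 *          len &= 3;                                       // 0..3 leftover bytes
 *          while (len--)
 *              *--dst_end = *--src_end;
 *      }
 */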
// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU == MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

PLATFUNC_DESCRIPTOR(bcopy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memcpy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memmove,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)