/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80                  // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)      // cutoff for microcode fastpath for "rep/movsl"

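// Strategy overview, in C-like terms (comments only, summarizing the paths below):
//
//      if (overlap, i.e. (unsigned)(dst - src) < len)   copy in reverse (LReverse)
//      else if (len <= kShort)                          scalar doubleword/byte loops (Lshort)
//      else if (len >= kVeryLong)                       call _COMM_PAGE_LONGCOPY (non-temporal stores)
//      else if (8-byte aligned operands && len >= kFastUCode)   "rep/movsl" microcode fastpath
//      else                                             64-byte SSE loops, selected by (source & 15)
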
// void bcopy(const void *src, void *dst, size_t len);

COMMPAGE_FUNCTION_START(bcopy_sse3x, 32, 5)
LZero:
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

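// For example (derived from the loop below): len = 30 copies 30>>2 = 7 doublewords,
// then 30 & 3 = 2 leftover bytes, and falls through to Lexit.
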
Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      Lexit
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
Lexit:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

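// Worked example (numbers for illustration only): a 200-byte copy whose destination
// needed 5 alignment bytes arrives here with ecx = 195.  Below, ecx becomes 195 & 63 = 3,
// edx becomes 195 & -64 = 192, esi/edi are advanced by 192, and edx is negated to -192
// so the inner loops can count up toward zero.
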
LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        .set    LTableOffset, LTable - LZero
        leal    (LTableOffset)(,%eax,4), %eax   // load jump table entry address, relative to LZero
        movl    _COMM_PAGE_BCOPY(%eax), %eax    // load jump table entry
        addl    $(_COMM_PAGE_BCOPY), %eax       // add runtime address of LZero to get final function
        jmp     *%eax

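// In C-like terms (comment only), the dispatch above is roughly:
//      goto *(void *)(_COMM_PAGE_BCOPY + LTable[source & 15]);
// where LTable stores each LModN entry point as an offset from LZero, and
// _COMM_PAGE_BCOPY is the runtime address of LZero in the commpage.
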
        .align  2
LTable:                                 // table of copy loop addresses
// force generation of assembly-time constants.  Otherwise assembler
// creates subtractor relocations relative to first external symbol,
// and this file has none
        .set    LMod0Offset, LMod0 - LZero
        .set    LMod1Offset, LMod1 - LZero
        .set    LMod2Offset, LMod2 - LZero
        .set    LMod3Offset, LMod3 - LZero
        .set    LMod4Offset, LMod4 - LZero
        .set    LMod5Offset, LMod5 - LZero
        .set    LMod6Offset, LMod6 - LZero
        .set    LMod7Offset, LMod7 - LZero
        .set    LMod8Offset, LMod8 - LZero
        .set    LMod9Offset, LMod9 - LZero
        .set    LMod10Offset, LMod10 - LZero
        .set    LMod11Offset, LMod11 - LZero
        .set    LMod12Offset, LMod12 - LZero
        .set    LMod13Offset, LMod13 - LZero
        .set    LMod14Offset, LMod14 - LZero
        .set    LMod15Offset, LMod15 - LZero
        .long   LMod0Offset
        .long   LMod1Offset
        .long   LMod2Offset
        .long   LMod3Offset
        .long   LMod4Offset
        .long   LMod5Offset
        .long   LMod6Offset
        .long   LMod7Offset
        .long   LMod8Offset
        .long   LMod9Offset
        .long   LMod10Offset
        .long   LMod11Offset
        .long   LMod12Offset
        .long   LMod13Offset
        .long   LMod14Offset
        .long   LMod15Offset


// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI.

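// The pushes below pass (dest, source, length) as a normal C-style argument list, so the
// call is effectively longcopy(dst, src, len)  (name illustrative; the routine lives at
// _COMM_PAGE_LONGCOPY).
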
LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        movl    $(_COMM_PAGE_LONGCOPY),%eax
        call    *%eax                   // do the long copy
        addl    $12,%esp                // pop off our parameters
        jmp     Lexit


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

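// Note on the register juggling below: on entry edx = -(bytes assigned to the 64-byte
// loops) and ecx = residual length (0..63).  Because -edx is a multiple of 64, its low
// six bits are zero, so after the negate "orl %edx,%ecx" reassembles the total length
// (e.g. 0x1c0 | 0x27 = 0x1e7).
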
Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0001

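// How the palignr loops below work (comment only): all loads are 16-byte aligned, one
// byte "early" relative to the data actually wanted.  "palignr $N,src,dst" concatenates
// dst:src (dst in the high half), shifts the 32-byte value right by N bytes, and keeps
// the low 16 bytes in dst.  So in LMod1, palignr $1 applied to the aligned blocks at
// offsets -1 and 15 (relative to the current copy position) yields source bytes 0..15,
// which are then stored with an aligned movdqa.  xmm0 carries the last block of each
// iteration into the next, so every 16 source bytes is loaded only once.
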
LMod1:
        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

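// How this variant works (comment only): the source is 4 bytes past 16-byte alignment,
// so each aligned block ends with the first 12 bytes of the next store.  "movss" merges
// the low 4 bytes of the following aligned block into the current one, and
// "pshufd $0x39" then rotates the four dwords so the 16 bytes come out in source order.
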
LMod4:
        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

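// Here (comment only) "shufpd $01,src,dst" builds dst from { low qword = old dst high
// qword, high qword = src low qword }, i.e. the 16 wanted bytes straddling two aligned
// blocks.  This case is also 8-byte mutually aligned with the destination, so like
// LMod0 it diverts to the "rep/movsl" fastpath once the length reaches kFastUCode.
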
LMod8:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

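// Mirror of LMod4 (comment only): here the aligned loads are done with "pshufd" straight
// from memory, which rotates each block by one dword as it is read, and "movss" then
// patches in the 4 bytes carried over from the previous (lower-addressed) block.
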
LMod12:
        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

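// The forward entry points branch here when (unsigned)(dest - source) < length, i.e.
// when the destination starts inside the source and an ascending copy would overwrite
// source bytes before they are read.  Copying from the top down avoids that.
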
LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        jz      3f
1:
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

COMMPAGE_DESCRIPTOR(bcopy_sse3x,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,kHasSSE4_2)