]> git.saurik.com Git - apple/libc.git/blame - i386/string/bcopy_sse3x.s
Libc-763.11.tar.gz
[apple/libc.git] / i386 / string / bcopy_sse3x.s
CommitLineData
1f2f436a
A
1/*
2 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <machine/cpu_capabilities.h>
30#include <platfunc.h>
31
32/*
33 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
34 * Supplemental SSE3 and 64-byte cache lines.
35 *
36 * The following #defines are tightly coupled to the u-architecture:
37 */
38
// kShort:     lengths <= kShort are copied with the scalar doubleword/byte loops.
// kVeryLong:  forward copies of at least this many bytes are handed to _longcopy.
// kFastUCode: in LMod0 and LMod8 (source 16- or 8-byte aligned once the destination
//             is aligned), chunked lengths of at least this many bytes use
//             "rep/movsl" (Lfastpath) instead of the SSE loop.
39#define kShort 80 // too short to bother with SSE (must be >=80)
40#define kVeryLong (500*1024) // large enough for non-temporal stores (must be >= 8192)
41#define kFastUCode ((16*1024)-15) // cutoff for microcode fastpath for "rep/movsl"
42
43// void bcopy(const void *src, void *dst, size_t len);
//
// Entry point for bcopy. Arguments are read from the stack frame:
// 8(%ebp) = src, 12(%ebp) = dst, 16(%ebp) = len. They are loaded into the register
// roles shared with memcpy/memmove (esi = source, edi = dest, ecx = length) and
// control then joins the common paths: LReverse (via LReverseIsland) for destructive
// overlap, Lshort for len <= kShort, LNotShort otherwise.
// NOTE(review): the numeric arguments to PLATFUNC_FUNCTION_START come from
// <platfunc.h>, which is not visible here -- confirm their meaning there.
44
45PLATFUNC_FUNCTION_START(bcopy, sse3x, 32, 5)
46 pushl %ebp // set up a frame for backtraces
47 movl %esp,%ebp
48 pushl %esi // esi, edi and ebx are saved here and restored at Lexit / LReverseShort
49 pushl %edi
50 pushl %ebx
51 movl 8(%ebp),%esi // get source ptr
52 movl 12(%ebp),%edi // get dest ptr
53 movl 16(%ebp),%ecx // get length
54 movl %edi,%edx
55 subl %esi,%edx // (dest - source)
56 cmpl %ecx,%edx // must move in reverse if (dest - source) < length
57 jb LReverseIsland // unsigned test: dest below source wraps to a large value, so it goes forward
58 cmpl $(kShort),%ecx // long enough to bother with SSE?
59 jbe Lshort // no
60 jmp LNotShort
61
62//
63// void *memcpy(void *dst, const void *src, size_t len);
64// void *memmove(void *dst, const void *src, size_t len);
65//
66
67PLATFUNC_FUNCTION_START(memcpy, sse3x, 32, 0) // void *memcpy(void *dst, const void *src, size_t len)
68PLATFUNC_FUNCTION_START(memmove, sse3x, 32, 0) // void *memmove(void *dst, const void *src, size_t len)
// Both macro invocations precede the same code, so presumably memcpy and memmove
// are one routine here (the macro expansion lives in <platfunc.h> -- confirm there).
// Arguments: 8(%ebp) = dst, 12(%ebp) = src, 16(%ebp) = len. The same overlap test as
// bcopy selects the reverse path; otherwise long copies go to LNotShort and short
// ones fall through into Lshort.
69 pushl %ebp // set up a frame for backtraces
70 movl %esp,%ebp
71 pushl %esi // esi, edi and ebx are saved here and restored at Lexit / LReverseShort
72 pushl %edi
73 pushl %ebx
74 movl 8(%ebp),%edi // get dest ptr
75 movl 12(%ebp),%esi // get source ptr
76 movl 16(%ebp),%ecx // get length
77 movl %edi,%edx
78 subl %esi,%edx // (dest - source)
79 cmpl %ecx,%edx // must move in reverse if (dest - source) < length
80 jb LReverseIsland // unsigned test: dest below source wraps to a large value, so it goes forward
81 cmpl $(kShort),%ecx // long enough to bother with SSE?
82 ja LNotShort // yes
83
84// Handle short forward copies. As the most common case, this is the fall-through path.
85// ecx = length (<= kShort)
86// esi = source ptr
87// edi = dest ptr
// The SSE loops also jump here to copy their 0..63 residual bytes, and Lfastpath
// jumps to LLeftovers for its last 0..3 bytes. The routine copies ecx/4 doublewords,
// then the remaining (length & 3) bytes, then restores the saved registers and returns.
88
89Lshort:
90 movl %ecx,%edx // copy length
91 shrl $2,%ecx // get #doublewords
92 jz LLeftovers
932: // loop copying doublewords
94 movl (%esi),%eax
95 addl $4,%esi
96 movl %eax,(%edi)
97 addl $4,%edi
98 dec %ecx
99 jnz 2b
100LLeftovers: // handle leftover bytes (0..3) in last word
101 andl $3,%edx // any leftover bytes? (edx must still hold the byte length here)
102 jz Lexit
1034: // loop copying bytes
104 movb (%esi),%al
105 inc %esi
106 movb %al,(%edi)
107 inc %edi
108 dec %edx
109 jnz 4b
110Lexit:
// Common forward-path epilogue; LVeryLong also jumps here after _longcopy returns.
// For the bcopy entry 8(%ebp) is the source pointer rather than the destination;
// bcopy is declared void in its prototype comment, so presumably eax is ignored there.
111 movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove
112 popl %ebx // pops mirror the pushes done at the entry points
113 popl %edi
114 popl %esi
115 popl %ebp
116 ret
117
118
// Trampoline: both entry points branch here when the operands overlap destructively,
// and this forwards to the reverse-copy code (LReverse) with registers untouched.
119LReverseIsland: // keep the "jb" above a short branch...
120 jmp LReverse // ...because reverse moves are uncommon
121
122
123// Handle forward moves that are long enough to justify use of SSE3.
124// First, 16-byte align the destination.
125// ecx = length (> kShort)
126// esi = source ptr
127// edi = dest ptr
128
129LNotShort:
130 cmpl $(kVeryLong),%ecx // long enough to justify heavyweight loops?
131 movl %edi,%edx // copy destination (movl leaves the flags from cmpl intact)
132 jae LVeryLong // use very-long-operand path
133 negl %edx
134 andl $15,%edx // get #bytes to align destination: (-dest) & 15
135 jz LDestAligned // already aligned
136 subl %edx,%ecx // decrement length
1371: // loop copying 1..15 bytes
138 movb (%esi),%al
139 inc %esi
140 movb %al,(%edi)
141 inc %edi
142 dec %edx
143 jnz 1b
// Falls through into LDestAligned with edi now 16-byte aligned.
144
145// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
146// based on the alignment of the source. All vector loads and stores are aligned.
147// Even though this means we have to shift and repack vectors, doing so is much faster
148// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
149// there is at least one chunk. When we enter the copy loops, the following registers
150// are set up:
151// ecx = residual length (0..63)
152// edx = -(length to move), a multiple of 64
153// esi = ptr to 1st source byte not to move (unaligned)
154// edi = ptr to 1st dest byte not to move (aligned)
155
156LDestAligned:
// Split the length into a multiple-of-64 part (edx) and a 0..63 residual (ecx),
// advance both pointers past the chunked region, and dispatch through LTable on the
// low four bits of the source address. The table holds offsets rather than absolute
// addresses, so the target is rebuilt from the runtime address obtained with call/pop.
157 movl %ecx,%edx // copy length
158 movl %esi,%eax // copy source address
159 andl $63,%ecx // get remaining bytes for Lshort
160 andl $-64,%edx // get number of bytes we will copy in inner loop
161 andl $15,%eax // mask to low 4 bits of source address
162 addl %edx,%esi // point to 1st byte not copied
163 addl %edx,%edi
164 negl %edx // now generate offset to 1st byte to be copied
165 call 1f // pushes the runtime address of local label 1
1661:
167 popl %ebx // ebx = runtime address of local label 1
168 movl (LTable-1b)(%ebx,%eax,4), %eax // load jump table entry: offset of the selected loop relative to LTable
169 leal (LTable-1b)(%ebx,%eax,1), %eax // runtime address of LTable + offset = address of the loop
170 jmp *%eax
171
// Dispatch table used by LDestAligned. Entry i is the 32-bit offset of LMod<i> from
// LTable, where i is (source address & 15) after the destination has been aligned.
172 .align 2
173LTable: // table of copy loop addresses
174 .long LMod0 -LTable
175 .long LMod1 -LTable
176 .long LMod2 -LTable
177 .long LMod3 -LTable
178 .long LMod4 -LTable
179 .long LMod5 -LTable
180 .long LMod6 -LTable
181 .long LMod7 -LTable
182 .long LMod8 -LTable
183 .long LMod9 -LTable
184 .long LMod10 -LTable
185 .long LMod11 -LTable
186 .long LMod12 -LTable
187 .long LMod13 -LTable
188 .long LMod14 -LTable
189 .long LMod15 -LTable
190
191
192// Very long forward moves. These are at least several pages. They are special cased
193// and aggressively optimized, not so much because they are common or useful, but
194// because they are subject to benchmark. There isn't enough room for them in the
195// area reserved on the platfunc for bcopy, so we put them elsewhere. We call
196// the longcopy routine using the normal ABI.
197
198LVeryLong:
// Reached from LNotShort before any bytes have been copied, so esi, edi and ecx still
// describe the whole operation. The three pushes place dest lowest on the stack,
// i.e. the call is _longcopy(dest, source, length); _longcopy is defined elsewhere.
199 pushl %ecx // length (>= kVeryLong)
200 pushl %esi // source ptr
201 pushl %edi // dest ptr
202 call _longcopy
203 addl $12,%esp // pop off our parameters
204 jmp Lexit // shared epilogue: load return value, restore registers, return
205
206
207// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
208// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
209// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
210// avoids having to read destination cache lines that will be completely overwritten.
211// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
212// we do not know if the destination is in cache or not.
213
214Lfastpath:
// Entered from LMod0 / LMod8 with the register setup made by LDestAligned:
// edx = -(chunked length), esi/edi = pointers just past the chunked region,
// ecx = residual length (0..63). Undo that setup and copy everything with rep/movsl.
215 addl %edx,%esi // restore ptrs to 1st byte of source and dest
216 addl %edx,%edi
217 negl %edx // make length positive
218 orl %edx,%ecx // restore total #bytes remaining to move (edx is a multiple of 64 and ecx < 64, so OR acts as ADD)
219 cld // we'll move forward
220 movl %ecx,%edx // copy total length to move
221 shrl $2,%ecx // compute #words to move
222 rep // the u-code will optimize this
223 movsl
224 jmp LLeftovers // handle 0..3 leftover bytes (LLeftovers uses edx & 3)
225
226
227// Forward loop for medium length operands in which low four bits of %esi == 0000
// Source and destination are both 16-byte aligned, so vectors are copied straight
// through with no repacking. Register setup is the one made by LDestAligned:
// edx = -(chunked length), esi/edi point just past the chunked region, ecx = residual.
228
229LMod0:
230 cmpl $(-kFastUCode),%edx // %edx == -length, where (length < kVeryLong)
231 jle Lfastpath // long enough for fastpath in microcode (signed test: both operands are negative)
232 jmp 1f // skip over the alignment padding
233 .align 4,0x90 // 16-byte align inner loops
2341: // loop over 64-byte chunks
235 movdqa (%esi,%edx),%xmm0
236 movdqa 16(%esi,%edx),%xmm1
237 movdqa 32(%esi,%edx),%xmm2
238 movdqa 48(%esi,%edx),%xmm3
239
240 movdqa %xmm0,(%edi,%edx)
241 movdqa %xmm1,16(%edi,%edx)
242 movdqa %xmm2,32(%edi,%edx)
243 movdqa %xmm3,48(%edi,%edx)
244
245 addl $64,%edx // edx counts up from -(chunked length) to zero
246 jnz 1b
247
248 jmp Lshort // copy remaining 0..63 bytes and done
249
250
251// Forward loop for medium length operands in which low four bits of %esi == 0001
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $1.
252
253LMod1:
254 movdqa -1(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
2551: // loop over 64-byte chunks
256 movdqa 15(%esi,%edx),%xmm1
257 movdqa 31(%esi,%edx),%xmm2
258 movdqa 47(%esi,%edx),%xmm3
259 movdqa 63(%esi,%edx),%xmm4
260
261 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
262 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
263
264 palignr $1,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
265 palignr $1,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
266 palignr $1,%xmm1,%xmm2
267 palignr $1,%xmm5,%xmm1
268
269 movdqa %xmm1,(%edi,%edx)
270 movdqa %xmm2,16(%edi,%edx)
271 movdqa %xmm3,32(%edi,%edx)
272 movdqa %xmm4,48(%edi,%edx)
273
274 addl $64,%edx // edx counts up from -(chunked length) to zero
275 jnz 1b
276
277 jmp Lshort // copy remaining 0..63 bytes and done
278
279
280// Forward loop for medium length operands in which low four bits of %esi == 0010
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $2.
281
282LMod2:
283 movdqa -2(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
2841: // loop over 64-byte chunks
285 movdqa 14(%esi,%edx),%xmm1
286 movdqa 30(%esi,%edx),%xmm2
287 movdqa 46(%esi,%edx),%xmm3
288 movdqa 62(%esi,%edx),%xmm4
289
290 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
291 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
292
293 palignr $2,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
294 palignr $2,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
295 palignr $2,%xmm1,%xmm2
296 palignr $2,%xmm5,%xmm1
297
298 movdqa %xmm1,(%edi,%edx)
299 movdqa %xmm2,16(%edi,%edx)
300 movdqa %xmm3,32(%edi,%edx)
301 movdqa %xmm4,48(%edi,%edx)
302
303 addl $64,%edx // edx counts up from -(chunked length) to zero
304 jnz 1b
305
306 jmp Lshort // copy remaining 0..63 bytes and done
307
308
309// Forward loop for medium length operands in which low four bits of %esi == 0011
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $3.
310
311LMod3:
312 movdqa -3(%esi,%edx),%xmm0 // prime the loop by loading 1st source dq
3131: // loop over 64-byte chunks
314 movdqa 13(%esi,%edx),%xmm1
315 movdqa 29(%esi,%edx),%xmm2
316 movdqa 45(%esi,%edx),%xmm3
317 movdqa 61(%esi,%edx),%xmm4
318
319 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
320 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
321
322 palignr $3,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
323 palignr $3,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
324 palignr $3,%xmm1,%xmm2
325 palignr $3,%xmm5,%xmm1
326
327 movdqa %xmm1,(%edi,%edx)
328 movdqa %xmm2,16(%edi,%edx)
329 movdqa %xmm3,32(%edi,%edx)
330 movdqa %xmm4,48(%edi,%edx)
331
332 addl $64,%edx // edx counts up from -(chunked length) to zero
333 jnz 1b
334
335 jmp Lshort // copy remaining 0..63 bytes and done
336
337
338// Forward loop for medium length operands in which low four bits of %esi == 0100
339// We use the float single data type in order to use "movss" to merge vectors.
// Register setup is the one made by LDestAligned. For each pair of adjacent aligned
// source blocks, movss drops the low dword of the newer block into the low dword of
// the older one, and pshufd $0x39 then rotates that dword to the top, giving the 16
// source bytes that start 4 bytes into the older block. Loads, merges and stores are
// interleaved, so statement order matters.
340
341LMod4:
342 movaps -4(%esi,%edx),%xmm0 // 4-byte aligned: prime the loop
343 jmp 1f // skip over the alignment padding
344 .align 4,0x90
3451: // loop over 64-byte chunks
346 movaps 12(%esi,%edx),%xmm1
347 movaps 28(%esi,%edx),%xmm2
348 movss %xmm1,%xmm0 // copy low 4 bytes of source into destination
349 pshufd $(0x39),%xmm0,%xmm0 // rotate right 4 bytes (mask -- 00 11 10 01)
350 movaps 44(%esi,%edx),%xmm3
351 movss %xmm2,%xmm1
352 pshufd $(0x39),%xmm1,%xmm1
353 movaps 60(%esi,%edx),%xmm4
354 movss %xmm3,%xmm2
355 pshufd $(0x39),%xmm2,%xmm2
356
357 movaps %xmm0,(%edi,%edx)
358 movss %xmm4,%xmm3
359 pshufd $(0x39),%xmm3,%xmm3
360 movaps %xmm1,16(%edi,%edx)
361 movaps %xmm2,32(%edi,%edx)
362 movaps %xmm4,%xmm0 // carry the newest (unmodified) block into the next pass
363 movaps %xmm3,48(%edi,%edx)
364
365 addl $64,%edx // edx counts up from -(chunked length) to zero
366 jnz 1b
367
368 jmp Lshort // copy remaining 0..63 bytes and done
369
370
371// Forward loop for medium length operands in which low four bits of %esi == 0101
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $5.
372
373LMod5:
374 movdqa -5(%esi,%edx),%xmm0// prime the loop by loading 1st source dq
3751: // loop over 64-byte chunks
376 movdqa 11(%esi,%edx),%xmm1
377 movdqa 27(%esi,%edx),%xmm2
378 movdqa 43(%esi,%edx),%xmm3
379 movdqa 59(%esi,%edx),%xmm4
380
381 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
382 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
383
384 palignr $5,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
385 palignr $5,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
386 palignr $5,%xmm1,%xmm2
387 palignr $5,%xmm5,%xmm1
388
389 movdqa %xmm1,(%edi,%edx)
390 movdqa %xmm2,16(%edi,%edx)
391 movdqa %xmm3,32(%edi,%edx)
392 movdqa %xmm4,48(%edi,%edx)
393
394 addl $64,%edx // edx counts up from -(chunked length) to zero
395 jnz 1b
396
397 jmp Lshort // copy remaining 0..63 bytes and done
398
399
400// Forward loop for medium length operands in which low four bits of %esi == 0110
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $6.
401
402LMod6:
403 movdqa -6(%esi,%edx),%xmm0// prime the loop by loading 1st source dq
4041: // loop over 64-byte chunks
405 movdqa 10(%esi,%edx),%xmm1
406 movdqa 26(%esi,%edx),%xmm2
407 movdqa 42(%esi,%edx),%xmm3
408 movdqa 58(%esi,%edx),%xmm4
409
410 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
411 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
412
413 palignr $6,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
414 palignr $6,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
415 palignr $6,%xmm1,%xmm2
416 palignr $6,%xmm5,%xmm1
417
418 movdqa %xmm1,(%edi,%edx)
419 movdqa %xmm2,16(%edi,%edx)
420 movdqa %xmm3,32(%edi,%edx)
421 movdqa %xmm4,48(%edi,%edx)
422
423 addl $64,%edx // edx counts up from -(chunked length) to zero
424 jnz 1b
425
426 jmp Lshort // copy remaining 0..63 bytes and done
427
428
429// Forward loop for medium length operands in which low four bits of %esi == 0111
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $7.
430
431LMod7:
432 movdqa -7(%esi,%edx),%xmm0// prime the loop by loading 1st source dq
4331: // loop over 64-byte chunks
434 movdqa 9(%esi,%edx),%xmm1
435 movdqa 25(%esi,%edx),%xmm2
436 movdqa 41(%esi,%edx),%xmm3
437 movdqa 57(%esi,%edx),%xmm4
438
439 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
440 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
441
442 palignr $7,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
443 palignr $7,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
444 palignr $7,%xmm1,%xmm2
445 palignr $7,%xmm5,%xmm1
446
447 movdqa %xmm1,(%edi,%edx)
448 movdqa %xmm2,16(%edi,%edx)
449 movdqa %xmm3,32(%edi,%edx)
450 movdqa %xmm4,48(%edi,%edx)
451
452 addl $64,%edx // edx counts up from -(chunked length) to zero
453 jnz 1b
454
455 jmp Lshort // copy remaining 0..63 bytes and done
456
457
458// Forward loop for medium length operands in which low four bits of %esi == 1000
459// We use the float double data type in order to use "shufpd" to shift by 8 bytes.
// Register setup is the one made by LDestAligned. Long operands are diverted to
// Lfastpath first. In the loop, shufpd $01 replaces the older block with
// (its high 8 bytes, low 8 bytes of the newer block), which is the 16 source bytes
// starting 8 bytes into the older block. Loads, shuffles and stores are interleaved,
// so statement order matters.
460
461LMod8:
462 cmpl $(-kFastUCode),%edx// %edx == -length, where (length < kVeryLong)
463 jle Lfastpath // long enough for fastpath in microcode (signed test: both operands are negative)
464 movapd -8(%esi,%edx),%xmm0// 8-byte aligned: prime the loop
465 jmp 1f // skip over the alignment padding
466 .align 4,0x90
4671: // loop over 64-byte chunks
468 movapd 8(%esi,%edx),%xmm1
469 movapd 24(%esi,%edx),%xmm2
470 shufpd $01,%xmm1,%xmm0 // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
471 movapd 40(%esi,%edx),%xmm3
472 shufpd $01,%xmm2,%xmm1
473 movapd 56(%esi,%edx),%xmm4
474 shufpd $01,%xmm3,%xmm2
475
476 movapd %xmm0,(%edi,%edx)
477 shufpd $01,%xmm4,%xmm3
478 movapd %xmm1,16(%edi,%edx)
479 movapd %xmm2,32(%edi,%edx)
480 movapd %xmm4,%xmm0 // carry the newest (unmodified) block into the next pass
481 movapd %xmm3,48(%edi,%edx)
482
483 addl $64,%edx // edx counts up from -(chunked length) to zero
484 jnz 1b
485
486 jmp Lshort // copy remaining 0..63 bytes and done
487
488
489// Forward loop for medium length operands in which low four bits of %esi == 1001
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $9.
490
491LMod9:
492 movdqa -9(%esi,%edx),%xmm0// prime the loop by loading 1st source dq
4931: // loop over 64-byte chunks
494 movdqa 7(%esi,%edx),%xmm1
495 movdqa 23(%esi,%edx),%xmm2
496 movdqa 39(%esi,%edx),%xmm3
497 movdqa 55(%esi,%edx),%xmm4
498
499 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
500 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
501
502 palignr $9,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
503 palignr $9,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
504 palignr $9,%xmm1,%xmm2
505 palignr $9,%xmm5,%xmm1
506
507 movdqa %xmm1,(%edi,%edx)
508 movdqa %xmm2,16(%edi,%edx)
509 movdqa %xmm3,32(%edi,%edx)
510 movdqa %xmm4,48(%edi,%edx)
511
512 addl $64,%edx // edx counts up from -(chunked length) to zero
513 jnz 1b
514
515 jmp Lshort // copy remaining 0..63 bytes and done
516
517
518// Forward loop for medium length operands in which low four bits of %esi == 1010
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $10.
519
520LMod10:
521 movdqa -10(%esi,%edx),%xmm0// prime the loop by loading 1st source dq
5221: // loop over 64-byte chunks
523 movdqa 6(%esi,%edx),%xmm1
524 movdqa 22(%esi,%edx),%xmm2
525 movdqa 38(%esi,%edx),%xmm3
526 movdqa 54(%esi,%edx),%xmm4
527
528 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
529 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
530
531 palignr $10,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
532 palignr $10,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
533 palignr $10,%xmm1,%xmm2
534 palignr $10,%xmm5,%xmm1
535
536 movdqa %xmm1,(%edi,%edx)
537 movdqa %xmm2,16(%edi,%edx)
538 movdqa %xmm3,32(%edi,%edx)
539 movdqa %xmm4,48(%edi,%edx)
540
541 addl $64,%edx // edx counts up from -(chunked length) to zero
542 jnz 1b
543
544 jmp Lshort // copy remaining 0..63 bytes and done
545
546
547// Forward loop for medium length operands in which low four bits of %esi == 1011
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $11.
548
549LMod11:
550 movdqa -11(%esi,%edx),%xmm0// prime the loop by loading 1st source dq
5511: // loop over 64-byte chunks
552 movdqa 5(%esi,%edx),%xmm1
553 movdqa 21(%esi,%edx),%xmm2
554 movdqa 37(%esi,%edx),%xmm3
555 movdqa 53(%esi,%edx),%xmm4
556
557 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
558 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
559
560 palignr $11,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
561 palignr $11,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
562 palignr $11,%xmm1,%xmm2
563 palignr $11,%xmm5,%xmm1
564
565 movdqa %xmm1,(%edi,%edx)
566 movdqa %xmm2,16(%edi,%edx)
567 movdqa %xmm3,32(%edi,%edx)
568 movdqa %xmm4,48(%edi,%edx)
569
570 addl $64,%edx // edx counts up from -(chunked length) to zero
571 jnz 1b
572
573 jmp Lshort // copy remaining 0..63 bytes and done
574
575
576// Forward loop for medium length operands in which low four bits of %esi == 1100
577// We use the float single data type in order to use "movss" to merge vectors.
// Register setup is the one made by LDestAligned. Each aligned source block is loaded
// already rotated by pshufd $0x93, which leaves its last dword in the low position
// and its first three dwords above it. movss then overwrites that low dword with the
// last dword of the preceding block, yielding the 16 source bytes that start 12 bytes
// into the preceding block. xmm0 carries the rotated newest block between passes;
// only its low dword is consumed.
578
579LMod12:
580 movss (%esi,%edx),%xmm0// prime the loop: load 1st four bytes of source into the low dword
581 jmp 1f // skip over the alignment padding
582 .align 4,0x90
5831: // loop over 64-byte chunks
584 pshufd $(0x93),4(%esi,%edx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11)
585 pshufd $(0x93),20(%esi,%edx),%xmm2
586 pshufd $(0x93),36(%esi,%edx),%xmm3
587 pshufd $(0x93),52(%esi,%edx),%xmm4
588
589 movaps %xmm4,%xmm5 // save the newest rotated block before its low dword is overwritten
590 movss %xmm3,%xmm4 // copy low 4 bytes of source into destination
591 movss %xmm2,%xmm3 // highest pair first, so each source low dword is still original when read
592 movss %xmm1,%xmm2
593 movss %xmm0,%xmm1
594
595 movaps %xmm1,(%edi,%edx)
596 movaps %xmm2,16(%edi,%edx)
597 movaps %xmm5,%xmm0 // carry the saved block into the next pass
598 movaps %xmm3,32(%edi,%edx)
599 movaps %xmm4,48(%edi,%edx)
600
601 addl $64,%edx // edx counts up from -(chunked length) to zero
602 jnz 1b
603
604 jmp Lshort // copy remaining 0..63 bytes and done
605
606
607// Forward loop for medium length operands in which low four bits of %esi == 1101
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $13.
608
609LMod13:
610 movdqa -13(%esi,%edx),%xmm0// prime the loop by loading 1st source dq
6111: // loop over 64-byte chunks
612 movdqa 3(%esi,%edx),%xmm1
613 movdqa 19(%esi,%edx),%xmm2
614 movdqa 35(%esi,%edx),%xmm3
615 movdqa 51(%esi,%edx),%xmm4
616
617 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
618 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
619
620 palignr $13,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
621 palignr $13,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
622 palignr $13,%xmm1,%xmm2
623 palignr $13,%xmm5,%xmm1
624
625 movdqa %xmm1,(%edi,%edx)
626 movdqa %xmm2,16(%edi,%edx)
627 movdqa %xmm3,32(%edi,%edx)
628 movdqa %xmm4,48(%edi,%edx)
629
630 addl $64,%edx // edx counts up from -(chunked length) to zero
631 jnz 1b
632
633 jmp Lshort // copy remaining 0..63 bytes and done
634
635
636// Forward loop for medium length operands in which low four bits of %esi == 1110
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $14.
637
638LMod14:
639 movdqa -14(%esi,%edx),%xmm0// prime the loop by loading 1st source dq
6401: // loop over 64-byte chunks
641 movdqa 2(%esi,%edx),%xmm1
642 movdqa 18(%esi,%edx),%xmm2
643 movdqa 34(%esi,%edx),%xmm3
644 movdqa 50(%esi,%edx),%xmm4
645
646 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
647 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
648
649 palignr $14,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
650 palignr $14,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
651 palignr $14,%xmm1,%xmm2
652 palignr $14,%xmm5,%xmm1
653
654 movdqa %xmm1,(%edi,%edx)
655 movdqa %xmm2,16(%edi,%edx)
656 movdqa %xmm3,32(%edi,%edx)
657 movdqa %xmm4,48(%edi,%edx)
658
659 addl $64,%edx // edx counts up from -(chunked length) to zero
660 jnz 1b
661
662 jmp Lshort // copy remaining 0..63 bytes and done
663
664
665// Forward loop for medium length operands in which low four bits of %esi == 1111
// Register setup is the one made by LDestAligned. Because %edx is a multiple of 64,
// every load below is from a 16-byte aligned address; each output vector is built
// from two adjacent aligned source blocks with palignr $15.
666
667LMod15:
668 movdqa -15(%esi,%edx),%xmm0// prime the loop by loading 1st source dq
6691: // loop over 64-byte chunks
670 movdqa 1(%esi,%edx),%xmm1
671 movdqa 17(%esi,%edx),%xmm2
672 movdqa 33(%esi,%edx),%xmm3
673 movdqa 49(%esi,%edx),%xmm4
674
675 movdqa %xmm0,%xmm5 // xmm5 = block carried in from the prime or the previous pass
676 movdqa %xmm4,%xmm0 // keep an unshifted copy of the newest block for the next pass
677
678 palignr $15,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 )
679 palignr $15,%xmm2,%xmm3 // highest pair first, so each source register is still unshifted when read
680 palignr $15,%xmm1,%xmm2
681 palignr $15,%xmm5,%xmm1
682
683 movdqa %xmm1,(%edi,%edx)
684 movdqa %xmm2,16(%edi,%edx)
685 movdqa %xmm3,32(%edi,%edx)
686 movdqa %xmm4,48(%edi,%edx)
687
688 addl $64,%edx // edx counts up from -(chunked length) to zero
689 jnz 1b
690
691 jmp Lshort // copy remaining 0..63 bytes and done
692
693
694// Reverse moves. These are not optimized as aggressively as their forward
695// counterparts, as they are only used with destructive overlap.
696// ecx = length
697// esi = source ptr
698// edi = dest ptr
699
700LReverse:
// Reached through LReverseIsland when (dest - source) < length. Both pointers are
// moved to one byte past the end so the copy can run from high addresses to low.
701 addl %ecx,%esi // point to end of strings
702 addl %ecx,%edi
703 cmpl $(kShort),%ecx // long enough to bother with SSE?
704 ja LReverseNotShort // yes
// otherwise fall through into LReverseShort
705
706// Handle reverse short copies.
707// ecx = length
708// esi = one byte past end of source
709// edi = one byte past end of dest
710
711LReverseShort:
// Copies ecx/4 doublewords and then (length & 3) bytes, pre-decrementing the pointers
// before each access, then restores the saved registers and returns. The reverse SSE
// loops also jump here to copy their 0..63 residual bytes.
712 movl %ecx,%edx // copy length
713 shrl $2,%ecx // #words
714 jz 3f
7151:
716 subl $4,%esi
717 movl (%esi),%eax
718 subl $4,%edi
719 movl %eax,(%edi)
720 dec %ecx
721 jnz 1b
7223:
723 andl $3,%edx // bytes?
724 jz 5f
7254:
726 dec %esi
727 movb (%esi),%al
728 dec %edi
729 movb %al,(%edi)
730 dec %edx
731 jnz 4b
7325:
// Epilogue, same sequence as Lexit. For the bcopy entry 8(%ebp) is the source pointer;
// bcopy is declared void in its prototype comment, so presumably eax is ignored there.
733 movl 8(%ebp),%eax // get return value (dst ptr) for memcpy/memmove
734 popl %ebx // pops mirror the pushes done at the entry points
735 popl %edi
736 popl %esi
737 popl %ebp
738 ret
739
740// Handle a reverse move long enough to justify using SSE.
741// ecx = length
742// esi = one byte past end of source
743// edi = one byte past end of dest
744
745LReverseNotShort:
// edi points one byte past the end of the destination, so (edi & 15) is the number
// of bytes lying above the nearest lower 16-byte boundary. Copying that many bytes
// backwards leaves edi 16-byte aligned for the vector stores that follow.
746 movl %edi,%edx // copy destination
747 andl $15,%edx // get #bytes to align destination
748 je LReverseDestAligned // already aligned
749 subl %edx,%ecx // adjust length
7501: // loop copying 1..15 bytes
751 dec %esi
752 movb (%esi),%al
753 dec %edi
754 movb %al,(%edi)
755 dec %edx
756 jnz 1b
// Falls through into LReverseDestAligned.
757
758// Destination is now aligned. Prepare for reverse loops.
759
760LReverseDestAligned:
// Split the length into a multiple-of-64 part (edx) and a 0..63 residual (ecx), and
// move both pointers down to the low end of the chunked region. The loops then index
// with a positive edx that counts down to zero. Since length > kShort (>= 80) and at
// most 15 bytes were moved for alignment, edx is at least 64 here.
761 movl %ecx,%edx // copy length
762 andl $63,%ecx // get remaining bytes for LReverseShort
763 andl $-64,%edx // get number of bytes we will copy in inner loop
764 subl %edx,%esi // point to endpoint of copy
765 subl %edx,%edi
766 testl $15,%esi // is source aligned too?
767 jnz LReverseUnalignedLoop // no
// otherwise fall through into LReverseAlignedLoop
768
// Source and destination are both 16-byte aligned. Each pass loads all 64 bytes
// before storing any of them, working from the highest addresses downward.
// edx = bytes still to move in 64-byte chunks; esi/edi = low end of the chunked region.
769LReverseAlignedLoop: // loop over 64-byte chunks
770 movdqa -16(%esi,%edx),%xmm0
771 movdqa -32(%esi,%edx),%xmm1
772 movdqa -48(%esi,%edx),%xmm2
773 movdqa -64(%esi,%edx),%xmm3
774
775 movdqa %xmm0,-16(%edi,%edx)
776 movdqa %xmm1,-32(%edi,%edx)
777 movdqa %xmm2,-48(%edi,%edx)
778 movdqa %xmm3,-64(%edi,%edx)
779
780 subl $64,%edx // edx counts down to zero
781 jne LReverseAlignedLoop
782
783 jmp LReverseShort // copy remaining 0..63 bytes and done
784
785
786// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.
787
// Same as LReverseAlignedLoop except that the source is not 16-byte aligned, so the
// loads use movdqu; the destination stores remain aligned (movdqa).
// edx = bytes still to move in 64-byte chunks; esi/edi = low end of the chunked region.
788LReverseUnalignedLoop: // loop over 64-byte chunks
789 movdqu -16(%esi,%edx),%xmm0
790 movdqu -32(%esi,%edx),%xmm1
791 movdqu -48(%esi,%edx),%xmm2
792 movdqu -64(%esi,%edx),%xmm3
793
794 movdqa %xmm0,-16(%edi,%edx)
795 movdqa %xmm1,-32(%edi,%edx)
796 movdqa %xmm2,-48(%edi,%edx)
797 movdqa %xmm3,-64(%edi,%edx)
798
799 subl $64,%edx // edx counts down to zero
800 jne LReverseUnalignedLoop
801
802 jmp LReverseShort // copy remaining 0..63 bytes and done
803
// Descriptors for the three entry points defined in this file.
// NOTE(review): PLATFUNC_DESCRIPTOR is defined in <platfunc.h> (not visible here). The
// third argument looks like the CPU capability bits this variant requires and the
// fourth like bits that disqualify it -- confirm against the macro definition.
804PLATFUNC_DESCRIPTOR(bcopy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
805PLATFUNC_DESCRIPTOR(memcpy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
806PLATFUNC_DESCRIPTOR(memmove,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)