/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with SSE4
 * and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80                  // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)      // cutoff for microcode fastpath for "rep/movsl"

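// Illustrative sketch (C-style, in comments only, not assembled) of how the
// thresholds above steer a forward copy.  The helper names are placeholders for
// the labels defined below (Lshort, LVeryLong, Lfastpath, LMod0..LMod15), not
// symbols in this file.  The rep/movsl case is actually tested only after the
// destination has been 16-byte aligned, and only when the source is then 0 or 8
// bytes out of phase with it.
//
//      if (len <= kShort)                      // tiny: plain 4-byte/1-byte loops
//              copy_short(dst, src, len);
//      else if (len >= kVeryLong)              // huge: non-temporal path at _COMM_PAGE_LONGCOPY
//              copy_verylong(dst, src, len);
//      else if (len >= kFastUCode && src_dst_8_byte_aligned)
//              rep_movsl(dst, src, len);       // let the "rep/movsl" microcode do the work
//      else
//              copy_sse_chunks(dst, src, len); // 64-byte SSE loops, selected by source alignment
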

// void bcopy(const void *src, void *dst, size_t len);

        .text
        .align  5, 0x90
LZero:
Lbcopy_sse4:                            // void bcopy(const void *src, void *dst, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

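// Illustrative C sketch (in comments only, not assembled) of the Lshort path
// below; the function name is a placeholder, the loop structure mirrors the code.
//
//      void copy_short(char *dst, const char *src, size_t len) {
//              size_t words = len >> 2;
//              while (words--) {                       // copy whole 32-bit words
//                      *(uint32_t *)dst = *(const uint32_t *)src;
//                      src += 4;  dst += 4;
//              }
//              for (len &= 3; len != 0; len--)         // then 0..3 leftover bytes
//                      *dst++ = *src++;
//      }
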
Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      Lexit
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
Lexit:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

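// Illustrative C sketch (in comments only, not assembled) of the setup and
// dispatch that LDestAligned performs.  "copy_loops" stands in for LTable below;
// the real table stores commpage-relative addresses (LModN + _COMM_PAGE_BCOPY - LZero)
// because this code runs at its fixed commpage location.
//
//      size_t residual = len & 63;             // 0..63 bytes, finished later by Lshort
//      size_t chunked  = len & ~(size_t)63;    // whole 64-byte chunks
//      src += chunked;                         // point both ptrs past the chunked region...
//      dst += chunked;
//      long offset = -(long)chunked;           // ...and index with a negative offset (%edx)
//      copy_loops[(uintptr_t)src & 15](dst, src, offset, residual);
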
LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        movl    (_COMM_PAGE_BCOPY+LTable-LZero)(,%eax,4),%eax
        jmp     *%eax

        .align  2
LTable:                                 // table of copy loop addresses
        .long   LMod0 + _COMM_PAGE_BCOPY - LZero
        .long   LMod1 + _COMM_PAGE_BCOPY - LZero
        .long   LMod2 + _COMM_PAGE_BCOPY - LZero
        .long   LMod3 + _COMM_PAGE_BCOPY - LZero
        .long   LMod4 + _COMM_PAGE_BCOPY - LZero
        .long   LMod5 + _COMM_PAGE_BCOPY - LZero
        .long   LMod6 + _COMM_PAGE_BCOPY - LZero
        .long   LMod7 + _COMM_PAGE_BCOPY - LZero
        .long   LMod8 + _COMM_PAGE_BCOPY - LZero
        .long   LMod9 + _COMM_PAGE_BCOPY - LZero
        .long   LMod10 + _COMM_PAGE_BCOPY - LZero
        .long   LMod11 + _COMM_PAGE_BCOPY - LZero
        .long   LMod12 + _COMM_PAGE_BCOPY - LZero
        .long   LMod13 + _COMM_PAGE_BCOPY - LZero
        .long   LMod14 + _COMM_PAGE_BCOPY - LZero
        .long   LMod15 + _COMM_PAGE_BCOPY - LZero


// Very long forward moves.  These are at least several pages.  They are special-cased
// and aggressively optimized, not so much because they are common or useful, but
// because they show up in benchmarks.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI.

LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        movl    $(_COMM_PAGE_LONGCOPY),%eax
        call    *%eax                   // do the long copy
        addl    $12,%esp                // pop off our parameters
        jmp     Lexit


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

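// A note on the arithmetic below (illustrative, in comments only): on entry to
// Lfastpath, %ecx holds the residual length (0..63) and %edx holds -(chunked
// length), a multiple of 64.  After "negl", the two values occupy disjoint bit
// ranges, so the "orl" is equivalent to addition:
//
//      total = chunked | residual;             // same as chunked + residual
//      rep_movsl(dst, src, total >> 2);        // then LLeftovers copies total & 3 bytes
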
Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0001

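// Illustrative sketch (SSE intrinsics in comments only, not assembled) of the
// palignr technique shared by LMod1..LMod15: do nothing but 16-byte aligned
// loads bracketing the misaligned source, then splice adjacent pairs.  "mod" is
// the low four bits of the source pointer; because palignr takes its shift count
// as an immediate, the real code is unrolled into a separate loop per value of mod.
//
//      __m128i prev = _mm_load_si128((const __m128i *)(src - mod));    // aligned; ends with src[0..]
//      for (size_t i = 0; i < chunked; i += 16) {
//              __m128i next = _mm_load_si128((const __m128i *)(src - mod + i + 16));
//              __m128i out  = _mm_alignr_epi8(next, prev, mod);        // == the 16 bytes at src+i
//              _mm_store_si128((__m128i *)(dst + i), out);             // dst is 16-byte aligned
//              prev = next;
//      }
//
// The loops below do four such splices per iteration to move 64 bytes at a time.
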
LMod1:
        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

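// Illustrative note for LMod4 and LMod12 (SSE intrinsics in comments only, not
// assembled): when the source is 4 bytes out of phase, each wanted 16-byte group
// can be built from two aligned loads by patching in one 4-byte lane with movss
// and then rotating the lanes with pshufd.  Lane values are shown low-to-high.
//
//      __m128 a = _mm_load_ps((const float *)(src - 4));       // { xx, s0, s1, s2 }
//      __m128 b = _mm_load_ps((const float *)(src + 12));      // { s3, s4, s5, s6 }
//      a = _mm_move_ss(a, b);                                  // { s3, s0, s1, s2 }
//      a = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(a), 0x39));  // { s0, s1, s2, s3 }
//
// LMod12 uses the same movss merge, with the rotation (pshufd $0x93) folded into
// the load side instead.
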
LMod4:
        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

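// Illustrative note for LMod8 (SSE intrinsics in comments only, not assembled):
// when the source is 8 bytes out of phase, shufpd $01 concatenates the high half
// of one aligned load with the low half of the next, i.e. an 8-byte shift.
//
//      __m128d a = _mm_load_pd((const double *)(src - 8));     // { xx, s0 }  (8-byte lanes)
//      __m128d b = _mm_load_pd((const double *)(src + 8));     // { s1, s2 }
//      a = _mm_shuffle_pd(a, b, 1);                            // { s0, s1 } == the 16 bytes at src
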
LMod8:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

LMod12:
        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

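// Illustrative C sketch (in comments only, not assembled) of the reverse path:
// copying from the top down makes a destructively overlapping move (dst > src)
// safe.  The function name is a placeholder.
//
//      void copy_reverse(char *dst, const char *src, size_t len) {
//              src += len;                     // point one past the ends
//              dst += len;
//              while (len--)                   // (the real code below works in
//                      *--dst = *--src;        //  64-, 4- and 1-byte steps)
//      }
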
LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        jz      3f
1:
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse4,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)