/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with SSE4
 * and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort          80              // too short to bother with SSE (must be >=80)
#define kVeryLong       (500*1024)      // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode      ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"
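
// Taken together, these cutoffs partition copies roughly as in the C sketch
// below (illustrative only: the function and strategy names are ad hoc, and
// the alignment condition for the rep/movsl fastpath is actually tested
// inside the 16-byte- and 8-byte-aligned loops, not up front):
//
//      typedef enum { kWordLoop, kSSELoops, kRepMovsl, kLongCopy } strategy_t;
//
//      static strategy_t choose_strategy(size_t len, int is_8byte_aligned)
//      {
//          if (len <= kShort)                          // short: simple 4-byte/1-byte loops
//              return kWordLoop;
//          if (len >= kVeryLong)                       // huge: non-temporal longcopy routine
//              return kLongCopy;
//          if (len >= kFastUCode && is_8byte_aligned)  // big and mutually aligned: rep/movsl
//              return kRepMovsl;
//          return kSSELoops;                           // otherwise: 64-byte SSE loops below
//      }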


// void bcopy(const void *src, void *dst, size_t len);

        .text
        .align  5, 0x90
LZero:
Lbcopy_sse4:                            // void bcopy(const void *src, void *dst, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort
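
// The unsigned compare just above folds the overlap test into a single branch.
// A loose C equivalent (illustrative only; the cast and the helper names are
// not part of this file):
//
//      if ((uintptr_t)dst - (uintptr_t)src < len)
//          copy_backward(dst, src, len);   // dst lands inside [src, src+len): go reverse
//      else
//          copy_forward(dst, src, len);    // disjoint, or dst below src: forward is safe
//
// When dst is below src the subtraction wraps to a huge unsigned value, so the
// test fails and the forward path is taken, which is correct there even if the
// buffers overlap.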

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies. As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

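// A rough C equivalent of the Lshort/LLeftovers path below, treating src and
// dst as byte pointers (illustrative only; unaligned 32-bit accesses are fine
// on this architecture):
//
//      size_t words = len >> 2;
//      while (words--) {                           // copy 4 bytes at a time
//          *(uint32_t *)dst = *(const uint32_t *)src;
//          src += 4;  dst += 4;
//      }
//      for (len &= 3; len; len--)                  // then the 0..3 leftover bytes
//          *dst++ = *src++;
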
Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      Lexit
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
Lexit:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

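// The prologue below brings the destination up to a 16-byte boundary with a
// byte loop. In C (illustrative only):
//
//      size_t pad = (size_t)(-(uintptr_t)dst) & 15;    // 0..15 bytes to the next 16-byte boundary
//      len -= pad;
//      while (pad--)
//          *dst++ = *src++;
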
LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

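// In C, the setup below amounts to (illustrative only; the variable names are
// ad hoc):
//
//      unsigned phase   = (uintptr_t)src & 15;     // eax: source misalignment, 0..15
//      size_t residual  = len & 63;                // ecx: 0..63 bytes left for Lshort afterwards
//      size_t chunked   = len & ~(size_t)63;       // whole 64-byte chunks (at least one)
//      src += chunked;  dst += chunked;            // esi/edi: first byte NOT moved
//      intptr_t off     = -(intptr_t)chunked;      // edx: negative index back from there
//      goto *LTable[phase];                        // pick the loop matching the source phase
//
// Each loop then addresses its operands as src[off]/dst[off] and steps off up
// toward zero in increments of 64.
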
LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        movl    (_COMM_PAGE_BCOPY+LTable-LZero)(,%eax,4),%eax
        jmp     *%eax

        .align  2
LTable:                                 // table of copy loop addresses
        .long   LMod0 + _COMM_PAGE_BCOPY - LZero
        .long   LMod1 + _COMM_PAGE_BCOPY - LZero
        .long   LMod2 + _COMM_PAGE_BCOPY - LZero
        .long   LMod3 + _COMM_PAGE_BCOPY - LZero
        .long   LMod4 + _COMM_PAGE_BCOPY - LZero
        .long   LMod5 + _COMM_PAGE_BCOPY - LZero
        .long   LMod6 + _COMM_PAGE_BCOPY - LZero
        .long   LMod7 + _COMM_PAGE_BCOPY - LZero
        .long   LMod8 + _COMM_PAGE_BCOPY - LZero
        .long   LMod9 + _COMM_PAGE_BCOPY - LZero
        .long   LMod10 + _COMM_PAGE_BCOPY - LZero
        .long   LMod11 + _COMM_PAGE_BCOPY - LZero
        .long   LMod12 + _COMM_PAGE_BCOPY - LZero
        .long   LMod13 + _COMM_PAGE_BCOPY - LZero
        .long   LMod14 + _COMM_PAGE_BCOPY - LZero
        .long   LMod15 + _COMM_PAGE_BCOPY - LZero


// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI.

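// The pushes below form an ordinary cdecl call frame, so this is roughly
// (illustrative only; the prototype is inferred from the argument order, not
// stated in this file):
//
//      void (*longcopy)(void *dst, const void *src, size_t len) =
//              (void *)_COMM_PAGE_LONGCOPY;
//      longcopy(dst, src, len);            // edi, esi, ecx pushed last-arg-first
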
LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        movl    $(_COMM_PAGE_LONGCOPY),%eax
        call    *%eax                   // do the long copy
        addl    $12,%esp                // pop off our parameters
        jmp     Lexit


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

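// Lfastpath undoes the pointer adjustment made at LDestAligned and hands the
// whole remaining copy to "rep/movsl". A C sketch (illustrative only; "off"
// and "residual" are the quantities described at LDestAligned, not names in
// the code):
//
//      src += off;  dst += off;                    // off is negative: back to the 1st uncopied byte
//      size_t total = (size_t)(-off) | residual;   // off is a multiple of 64 and residual < 64,
//                                                  // so the OR is the same as an ADD here
//      // rep/movsl then copies total/4 doublewords, and LLeftovers picks up the last 0..3 bytes
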
Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0001

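// LMod1 through LMod15 (except 4, 8, and 12) all use the same scheme, shown
// here for a misalignment of k == 1: do four aligned 16-byte loads that
// straddle the bytes we actually want, then use PALIGNR to pull each output
// vector out of two neighboring loads so that the stores stay aligned. For
// "palignr $k, lo, hi", where lo is the lower-addressed aligned block and hi
// the next one, the result is (illustrative byte-level sketch only):
//
//      result.byte[i] = (i < 16 - k) ? lo.byte[i + k] : hi.byte[i - (16 - k)];
//
// %xmm0 carries the last aligned load of one chunk into the next iteration,
// so each 64-byte chunk needs only four new loads.
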
LMod1:
        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

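// With a source that is 4 bytes off alignment, PALIGNR is not needed: each
// output vector is an aligned load with the missing low 4 bytes spliced in
// from the next aligned load, then rotated. Per vector, in C-style notation
// (illustrative only):
//
//      t = prev;                       // aligned load that starts 4 bytes early
//      t.u32[0] = next.u32[0];         // movss: splice in the following 4 source bytes
//      out = { t.u32[1], t.u32[2], t.u32[3], t.u32[0] };   // pshufd $0x39: rotate right 4 bytes
//      prev = next;                    // the unmodified later load primes the next vector
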
LMod4:
        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

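// With a source that is 8 bytes off alignment, each output vector is simply
// the high quadword of one aligned load followed by the low quadword of the
// next, which is exactly what "shufpd $01" builds. Per vector (illustrative
// only):
//
//      out.u64[0] = prev.u64[1];       // shufpd $01,next,prev: keep prev's high half...
//      out.u64[1] = next.u64[0];       // ...and append next's low half
//      prev = next;
//
// Since both operands are mutually 8-byte aligned here, sufficiently long
// copies are first diverted to the rep/movsl fastpath (see kFastUCode above).
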
LMod8:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

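// With a source that is 12 bytes off alignment, the loads themselves do the
// realignment: "pshufd $0x93" reads an aligned block with its doublewords
// rotated right by 12 bytes, and movss then patches in the one doubleword
// that belongs to the preceding block. Per vector (illustrative only):
//
//      r = { a.u32[3], a.u32[0], a.u32[1], a.u32[2] };     // pshufd $0x93 of aligned load "a"
//      r.u32[0] = carry;               // movss: the last dword of the previous aligned block
//      out = r;                        // 16 contiguous source bytes, aligned for the store
//      carry = a.u32[3];               // feeds the next vector (held in %xmm5/%xmm0 across chunks)
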
LMod12:
        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

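// In C, the reverse path amounts to (illustrative only): move both pointers
// one past the end and copy from the top down, so an overlapping destination
// never overwrites source bytes before they have been read.
//
//      src += len;  dst += len;
//      while (len--)
//          *--dst = *--src;            // done below in 64-, 4-, and 1-byte steps
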
LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        jz      3f
1:
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for LReverseShort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

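// Note that, unlike the forward loops, there is no per-alignment dispatch
// here: the loads are simply unaligned (movdqu) while the stores stay aligned
// (movdqa), which is good enough for this uncommon destructive-overlap case.
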
LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse4,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)