]>
Commit | Line | Data |
---|---|---|
1f2f436a A |
1 | /* |
2 | * Copyright (c) 2006 Apple Computer, Inc. All rights reserved. | |
3 | * | |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ | |
5 | * | |
6 | * This file contains Original Code and/or Modifications of Original Code | |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
14 | * | |
15 | * Please obtain a copy of the License at | |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
25 | * | |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ | |
27 | */ | |
28 | ||
29 | #include <machine/cpu_capabilities.h> | |
ad3c9f2a | 30 | #include "platfunc.h" |
1f2f436a A |
31 | |
32 | /* | |
33 | * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with | |
34 | * Supplemental SSE3 and 64-byte cache lines. This is the 64-bit version. | |
35 | * | |
36 | * The following #defines are tightly coupled to the u-architecture: | |
37 | */ | |
38 | ||
39 | #define kShort 80 // too short to bother with SSE (must be >=80) | |
// (operands of at least kVeryLong bytes are handed off to _longcopy; see LVeryLong below)
40 | #define kVeryLong (500*1024) // large enough for non-temporal stores (>=8192 and <2GB) | |
41 | #define kFastUCode ((16*1024)-15) // cutoff for microcode fastpath for "rep/movsl" | |
42 | ||
43 | // void bcopy(const void *src, void *dst, size_t len); | |
// bcopy swaps its (src, dst) arguments into the memcpy register convention
// (dst in %rdi, src in %rsi) and then falls into the shared copy paths.
44 | ||
45 | PLATFUNC_FUNCTION_START_GENERIC(bcopy, sse3x, 64, 5) | |
46 | LZero: | |
47 | pushq %rbp // set up a frame for backtraces | |
48 | movq %rsp,%rbp | |
49 | movq %rsi,%rax // copy dest ptr | |
50 | movq %rdi,%rsi // xchange source and dest ptrs | |
51 | movq %rax,%rdi | |
52 | subq %rsi,%rax // (dest - source) | |
53 | cmpq %rdx,%rax // must move in reverse if (dest - source) < length | |
54 | jb LReverseIsland // (unsigned compare: dest < source wraps huge, so such cases copy forward) | |
55 | cmpq $(kShort),%rdx // long enough to bother with SSE? | |
56 | jbe LShort // no | |
57 | jmp LNotShort | |
58 | ||
59 | // | |
60 | // void *memcpy(void *dst, const void *src, size_t len); | |
61 | // void *memmove(void *dst, const void *src, size_t len); | |
62 | // | |
63 | ||
64 | PLATFUNC_FUNCTION_START_GENERIC(memcpy, sse3x, 64, 0) // void *memcpy(void *dst, const void *src, size_t len) | |
65 | PLATFUNC_FUNCTION_START_GENERIC(memmove, sse3x, 64, 0) // void *memmove(void *dst, const void *src, size_t len) | |
66 | pushq %rbp // set up a frame for backtraces | |
67 | movq %rsp,%rbp | |
68 | movq %rdi,%r11 // save return value here | |
69 | movq %rdi,%rax | |
70 | subq %rsi,%rax // (dest - source) | |
71 | cmpq %rdx,%rax // must move in reverse if (dest - source) < length | |
72 | jb LReverseIsland | |
73 | cmpq $(kShort),%rdx // long enough to bother with SSE? | |
74 | ja LNotShort // yes | |
75 | ||
76 | // Handle short forward copies. As the most common case, this is the fall-through path. | |
77 | // rdx = length (<= kShort) | |
78 | // rsi = source ptr | |
79 | // rdi = dest ptr | |
80 | ||
// (LShort/LLeftovers also serve as the shared tail of the SSE loops below,
// re-entered with rdx = residual length 0..63)
81 | LShort: | |
82 | movl %edx,%ecx // copy length using 32-bit operation | |
83 | shrl $2,%ecx // get #doublewords | |
84 | jz LLeftovers | |
85 | 2: // loop copying doublewords | |
86 | movl (%rsi),%eax | |
87 | addq $4,%rsi | |
88 | movl %eax,(%rdi) | |
89 | addq $4,%rdi | |
90 | decl %ecx | |
91 | jnz 2b | |
92 | LLeftovers: // handle leftover bytes (0..3) in last word | |
93 | andl $3,%edx // any leftover bytes? | |
94 | jz 5f | |
95 | 4: // loop copying bytes | |
96 | movb (%rsi),%al | |
97 | incq %rsi | |
98 | movb %al,(%rdi) | |
99 | incq %rdi | |
100 | decl %edx | |
101 | jnz 4b | |
102 | 5: | |
// (reached from bcopy too; there %r11 was never set, but bcopy returns void so %rax is ignored)
103 | movq %r11,%rax // get return value (dst ptr) for memcpy/memmove | |
104 | popq %rbp | |
105 | ret | |
106 | ||
107 | ||
108 | LReverseIsland: // keep the "jb" above a short branch... | |
109 | jmp LReverse // ...because reverse moves are uncommon | |
110 | ||
111 | ||
112 | // Handle forward moves that are long enough to justify use of SSE. | |
113 | // First, 16-byte align the destination. | |
114 | // rdx = length (> kShort) | |
115 | // rsi = source ptr | |
116 | // rdi = dest ptr | |
117 | ||
118 | LNotShort: | |
119 | cmpq $(kVeryLong),%rdx // long enough to justify heavyweight loops? | |
120 | jae LVeryLong // use very-long-operand path | |
121 | movl %edi,%ecx // copy low half of destination ptr | |
122 | negl %ecx // low 4 bits of -dest = #bytes up to the next 16-byte boundary | |
123 | andl $15,%ecx // get #bytes to align destination | |
124 | jz LDestAligned // already aligned | |
125 | subl %ecx,%edx // decrement length | |
126 | rep // align destination | |
127 | movsb | |
128 | ||
129 | ||
130 | // Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks, | |
131 | // based on the alignment of the source. All vector loads and stores are aligned. | |
132 | // Even though this means we have to shift and repack vectors, doing so is much faster | |
133 | // than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already, | |
134 | // there is at least one chunk. When we enter the copy loops, the following registers | |
135 | // are set up: | |
136 | // rdx = residual length (0..63) | |
137 | // rcx = -(length to move), a multiple of 64 less than 2GB | |
138 | // rsi = ptr to 1st source byte not to move (unaligned) | |
139 | // rdi = ptr to 1st dest byte not to move (aligned) | |
140 | ||
141 | LDestAligned: | |
142 | movq %rdx,%rcx // copy length | |
143 | movl %esi,%eax // copy low half of source address | |
144 | andl $63,%edx // get remaining bytes for LShort | |
145 | andl $15,%eax // mask to low 4 bits of source address | |
146 | andq $-64,%rcx // get number of bytes we will copy in inner loop | |
147 | leaq LTable(%rip), %r8 | |
148 | addq %rcx,%rsi // point to 1st byte not copied | |
149 | addq %rcx,%rdi | |
150 | movl (%r8,%rax,4),%eax // get offset of routine | |
151 | negq %rcx // now generate offset to 1st byte to be copied | |
152 | addq %r8,%rax // generate address of copy loop | |
153 | jmp *%rax // enter copy loop, selected by source alignment | |
154 | ||
155 | .align 2 // NOTE(review): Darwin as takes a power-of-two here, i.e. 4-byte alignment for the .long table -- confirm | |
156 | LTable: // table of copy loop addresses | |
157 | // force generation of assembly-time constants. Otherwise assembler | |
158 | // creates subtractor relocations relative to first external symbol, | |
159 | // and this file has none | |
160 | .set LMod0Offset, LMod0 - LTable | |
161 | .set LMod1Offset, LMod1 - LTable | |
162 | .set LMod2Offset, LMod2 - LTable | |
163 | .set LMod3Offset, LMod3 - LTable | |
164 | .set LMod4Offset, LMod4 - LTable | |
165 | .set LMod5Offset, LMod5 - LTable | |
166 | .set LMod6Offset, LMod6 - LTable | |
167 | .set LMod7Offset, LMod7 - LTable | |
168 | .set LMod8Offset, LMod8 - LTable | |
169 | .set LMod9Offset, LMod9 - LTable | |
170 | .set LMod10Offset, LMod10 - LTable | |
171 | .set LMod11Offset, LMod11 - LTable | |
172 | .set LMod12Offset, LMod12 - LTable | |
173 | .set LMod13Offset, LMod13 - LTable | |
174 | .set LMod14Offset, LMod14 - LTable | |
175 | .set LMod15Offset, LMod15 - LTable | |
176 | .long LMod0Offset | |
177 | .long LMod1Offset | |
178 | .long LMod2Offset | |
179 | .long LMod3Offset | |
180 | .long LMod4Offset | |
181 | .long LMod5Offset | |
182 | .long LMod6Offset | |
183 | .long LMod7Offset | |
184 | .long LMod8Offset | |
185 | .long LMod9Offset | |
186 | .long LMod10Offset | |
187 | .long LMod11Offset | |
188 | .long LMod12Offset | |
189 | .long LMod13Offset | |
190 | .long LMod14Offset | |
191 | .long LMod15Offset | |
192 | ||
193 | ||
194 | // Very long forward moves. These are at least several pages. They are special cased | |
195 | // and aggressively optimized, not so much because they are common or useful, but | |
196 | // because they are subject to benchmark. There isn't enough room for them in the | |
197 | // area reserved on the platfunc for bcopy, so we put them elsewhere. We call | |
198 | // the longcopy routine using the normal ABI: | |
199 | // rdi = dest | |
200 | // rsi = source | |
201 | // rdx = length (>= kVeryLong bytes) | |
202 | ||
203 | LVeryLong: | |
204 | pushq %r11 // save return value | |
205 | call _longcopy // call very long operand routine | |
// NOTE(review): %rsp is 8 mod 16 at this call (ret addr + rbp + r11 pushed);
// _longcopy is local assembly, so presumably it does not rely on SysV stack
// alignment -- confirm against its implementation.
206 | popq %rax // pop return value | |
207 | popq %rbp | |
208 | ret | |
209 | ||
210 | ||
211 | // On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte | |
212 | // aligned operands from about 32KB up to kVeryLong for the hot cache case, and from | |
213 | // about 256 bytes up to kVeryLong for cold caches. This is because the microcode | |
214 | // avoids having to read destination cache lines that will be completely overwritten. | |
215 | // The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since | |
216 | // we do not know if the destination is in cache or not. | |
217 | ||
218 | Lfastpath: | |
219 | addq %rcx,%rsi // restore ptrs to 1st byte of source and dest | |
220 | addq %rcx,%rdi | |
221 | negl %ecx // make length positive (known to be < 2GB) | |
222 | orl %edx,%ecx // restore total #bytes remaining to move (ecx is a multiple of 64 and edx < 64, so OR == ADD) | |
223 | cld // we'll move forward | |
224 | shrl $2,%ecx // compute #words to move | |
225 | rep // the u-code will optimize this | |
226 | movsl | |
227 | jmp LLeftovers // handle 0..3 leftover bytes | |
228 | ||
229 | ||
230 | // Forward loop for medium length operands in which low four bits of %rsi == 0000 | |
// Source and dest are both 16-byte aligned here, so no vector shifting is needed.
231 | ||
232 | LMod0: | |
233 | cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong) | |
234 | jle Lfastpath // long enough for fastpath in microcode | |
235 | jmp 1f // hop over the alignment padding below | |
236 | .align 4,0x90 // 16-byte align inner loops | |
237 | 1: // loop over 64-byte chunks | |
238 | movdqa (%rsi,%rcx),%xmm0 | |
239 | movdqa 16(%rsi,%rcx),%xmm1 | |
240 | movdqa 32(%rsi,%rcx),%xmm2 | |
241 | movdqa 48(%rsi,%rcx),%xmm3 | |
242 | ||
243 | movdqa %xmm0,(%rdi,%rcx) | |
244 | movdqa %xmm1,16(%rdi,%rcx) | |
245 | movdqa %xmm2,32(%rdi,%rcx) | |
246 | movdqa %xmm3,48(%rdi,%rcx) | |
247 | ||
248 | addq $64,%rcx | |
249 | jnz 1b | |
250 | ||
251 | jmp LShort // copy remaining 0..63 bytes and done | |
252 | ||
253 | ||
// The fifteen loops below all follow one pattern: do aligned 16-byte loads
// that straddle the misaligned source data, then shift and merge adjacent
// vectors into place -- palignr for most offsets, movss/pshufd for offsets
// 4 and 12, and shufpd for offset 8. %xmm0 carries the previous iteration's
// last loaded vector across the loop, so each chunk needs only four new loads.
254 | // Forward loop for medium length operands in which low four bits of %rsi == 0001 | |
255 | ||
256 | LMod1: | |
257 | movdqa -1(%rsi,%rcx),%xmm0 // prime the loop by loading 1st quadword | |
258 | 1: // loop over 64-byte chunks | |
259 | movdqa 15(%rsi,%rcx),%xmm1 | |
260 | movdqa 31(%rsi,%rcx),%xmm2 | |
261 | movdqa 47(%rsi,%rcx),%xmm3 | |
262 | movdqa 63(%rsi,%rcx),%xmm4 | |
263 | ||
264 | movdqa %xmm0,%xmm5 | |
265 | movdqa %xmm4,%xmm0 | |
266 | ||
267 | palignr $1,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
268 | palignr $1,%xmm2,%xmm3 | |
269 | palignr $1,%xmm1,%xmm2 | |
270 | palignr $1,%xmm5,%xmm1 | |
271 | ||
272 | movdqa %xmm1,(%rdi,%rcx) | |
273 | movdqa %xmm2,16(%rdi,%rcx) | |
274 | movdqa %xmm3,32(%rdi,%rcx) | |
275 | movdqa %xmm4,48(%rdi,%rcx) | |
276 | ||
277 | addq $64,%rcx | |
278 | jnz 1b | |
279 | ||
280 | jmp LShort // copy remaining 0..63 bytes and done | |
281 | ||
282 | ||
283 | // Forward loop for medium length operands in which low four bits of %rsi == 0010 | |
284 | ||
285 | LMod2: | |
286 | movdqa -2(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq | |
287 | 1: // loop over 64-byte chunks | |
288 | movdqa 14(%rsi,%rcx),%xmm1 | |
289 | movdqa 30(%rsi,%rcx),%xmm2 | |
290 | movdqa 46(%rsi,%rcx),%xmm3 | |
291 | movdqa 62(%rsi,%rcx),%xmm4 | |
292 | ||
293 | movdqa %xmm0,%xmm5 | |
294 | movdqa %xmm4,%xmm0 | |
295 | ||
296 | palignr $2,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
297 | palignr $2,%xmm2,%xmm3 | |
298 | palignr $2,%xmm1,%xmm2 | |
299 | palignr $2,%xmm5,%xmm1 | |
300 | ||
301 | movdqa %xmm1,(%rdi,%rcx) | |
302 | movdqa %xmm2,16(%rdi,%rcx) | |
303 | movdqa %xmm3,32(%rdi,%rcx) | |
304 | movdqa %xmm4,48(%rdi,%rcx) | |
305 | ||
306 | addq $64,%rcx | |
307 | jnz 1b | |
308 | ||
309 | jmp LShort // copy remaining 0..63 bytes and done | |
310 | ||
311 | ||
312 | // Forward loop for medium length operands in which low four bits of %rsi == 0011 | |
313 | ||
314 | LMod3: | |
315 | movdqa -3(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq | |
316 | 1: // loop over 64-byte chunks | |
317 | movdqa 13(%rsi,%rcx),%xmm1 | |
318 | movdqa 29(%rsi,%rcx),%xmm2 | |
319 | movdqa 45(%rsi,%rcx),%xmm3 | |
320 | movdqa 61(%rsi,%rcx),%xmm4 | |
321 | ||
322 | movdqa %xmm0,%xmm5 | |
323 | movdqa %xmm4,%xmm0 | |
324 | ||
325 | palignr $3,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
326 | palignr $3,%xmm2,%xmm3 | |
327 | palignr $3,%xmm1,%xmm2 | |
328 | palignr $3,%xmm5,%xmm1 | |
329 | ||
330 | movdqa %xmm1,(%rdi,%rcx) | |
331 | movdqa %xmm2,16(%rdi,%rcx) | |
332 | movdqa %xmm3,32(%rdi,%rcx) | |
333 | movdqa %xmm4,48(%rdi,%rcx) | |
334 | ||
335 | addq $64,%rcx | |
336 | jnz 1b | |
337 | ||
338 | jmp LShort // copy remaining 0..63 bytes and done | |
339 | ||
340 | ||
341 | // Forward loop for medium length operands in which low four bits of %rsi == 0100 | |
342 | // We use the float single data type in order to use "movss" to merge vectors. | |
343 | ||
344 | LMod4: | |
345 | movaps -4(%rsi,%rcx),%xmm0 // 4-byte aligned: prime the loop | |
346 | jmp 1f | |
347 | .align 4,0x90 | |
348 | 1: // loop over 64-byte chunks | |
349 | movaps 12(%rsi,%rcx),%xmm1 | |
350 | movaps 28(%rsi,%rcx),%xmm2 | |
351 | movss %xmm1,%xmm0 // copy low 4 bytes of source into destination | |
352 | pshufd $(0x39),%xmm0,%xmm0 // rotate right 4 bytes (mask -- 00 11 10 01) | |
353 | movaps 44(%rsi,%rcx),%xmm3 | |
354 | movss %xmm2,%xmm1 | |
355 | pshufd $(0x39),%xmm1,%xmm1 | |
356 | movaps 60(%rsi,%rcx),%xmm4 | |
357 | movss %xmm3,%xmm2 | |
358 | pshufd $(0x39),%xmm2,%xmm2 | |
359 | ||
360 | movaps %xmm0,(%rdi,%rcx) | |
361 | movss %xmm4,%xmm3 | |
362 | pshufd $(0x39),%xmm3,%xmm3 | |
363 | movaps %xmm1,16(%rdi,%rcx) | |
364 | movaps %xmm2,32(%rdi,%rcx) | |
365 | movaps %xmm4,%xmm0 | |
366 | movaps %xmm3,48(%rdi,%rcx) | |
367 | ||
368 | addq $64,%rcx | |
369 | jnz 1b | |
370 | ||
371 | jmp LShort // copy remaining 0..63 bytes and done | |
372 | ||
373 | ||
374 | // Forward loop for medium length operands in which low four bits of %rsi == 0101 | |
375 | ||
376 | LMod5: | |
377 | movdqa -5(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq | |
378 | 1: // loop over 64-byte chunks | |
379 | movdqa 11(%rsi,%rcx),%xmm1 | |
380 | movdqa 27(%rsi,%rcx),%xmm2 | |
381 | movdqa 43(%rsi,%rcx),%xmm3 | |
382 | movdqa 59(%rsi,%rcx),%xmm4 | |
383 | ||
384 | movdqa %xmm0,%xmm5 | |
385 | movdqa %xmm4,%xmm0 | |
386 | ||
387 | palignr $5,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
388 | palignr $5,%xmm2,%xmm3 | |
389 | palignr $5,%xmm1,%xmm2 | |
390 | palignr $5,%xmm5,%xmm1 | |
391 | ||
392 | movdqa %xmm1,(%rdi,%rcx) | |
393 | movdqa %xmm2,16(%rdi,%rcx) | |
394 | movdqa %xmm3,32(%rdi,%rcx) | |
395 | movdqa %xmm4,48(%rdi,%rcx) | |
396 | ||
397 | addq $64,%rcx | |
398 | jnz 1b | |
399 | ||
400 | jmp LShort // copy remaining 0..63 bytes and done | |
401 | ||
402 | ||
403 | // Forward loop for medium length operands in which low four bits of %rsi == 0110 | |
404 | ||
405 | LMod6: | |
406 | movdqa -6(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq | |
407 | 1: // loop over 64-byte chunks | |
408 | movdqa 10(%rsi,%rcx),%xmm1 | |
409 | movdqa 26(%rsi,%rcx),%xmm2 | |
410 | movdqa 42(%rsi,%rcx),%xmm3 | |
411 | movdqa 58(%rsi,%rcx),%xmm4 | |
412 | ||
413 | movdqa %xmm0,%xmm5 | |
414 | movdqa %xmm4,%xmm0 | |
415 | ||
416 | palignr $6,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
417 | palignr $6,%xmm2,%xmm3 | |
418 | palignr $6,%xmm1,%xmm2 | |
419 | palignr $6,%xmm5,%xmm1 | |
420 | ||
421 | movdqa %xmm1,(%rdi,%rcx) | |
422 | movdqa %xmm2,16(%rdi,%rcx) | |
423 | movdqa %xmm3,32(%rdi,%rcx) | |
424 | movdqa %xmm4,48(%rdi,%rcx) | |
425 | ||
426 | addq $64,%rcx | |
427 | jnz 1b | |
428 | ||
429 | jmp LShort // copy remaining 0..63 bytes and done | |
430 | ||
431 | ||
432 | // Forward loop for medium length operands in which low four bits of %rsi == 0111 | |
433 | ||
434 | LMod7: | |
435 | movdqa -7(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq | |
436 | 1: // loop over 64-byte chunks | |
437 | movdqa 9(%rsi,%rcx),%xmm1 | |
438 | movdqa 25(%rsi,%rcx),%xmm2 | |
439 | movdqa 41(%rsi,%rcx),%xmm3 | |
440 | movdqa 57(%rsi,%rcx),%xmm4 | |
441 | ||
442 | movdqa %xmm0,%xmm5 | |
443 | movdqa %xmm4,%xmm0 | |
444 | ||
445 | palignr $7,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
446 | palignr $7,%xmm2,%xmm3 | |
447 | palignr $7,%xmm1,%xmm2 | |
448 | palignr $7,%xmm5,%xmm1 | |
449 | ||
450 | movdqa %xmm1,(%rdi,%rcx) | |
451 | movdqa %xmm2,16(%rdi,%rcx) | |
452 | movdqa %xmm3,32(%rdi,%rcx) | |
453 | movdqa %xmm4,48(%rdi,%rcx) | |
454 | ||
455 | addq $64,%rcx | |
456 | jnz 1b | |
457 | ||
458 | jmp LShort // copy remaining 0..63 bytes and done | |
459 | ||
460 | ||
461 | // Forward loop for medium length operands in which low four bits of %rsi == 1000 | |
462 | // We use the float double data type in order to use "shufpd" to shift by 8 bytes. | |
463 | ||
464 | LMod8: | |
465 | cmpl $(-kFastUCode),%ecx // %rcx == -length, where (length < kVeryLong) | |
466 | jle Lfastpath // long enough for fastpath in microcode | |
467 | movapd -8(%rsi,%rcx),%xmm0 // 8-byte aligned: prime the loop | |
468 | jmp 1f | |
469 | .align 4,0x90 | |
470 | 1: // loop over 64-byte chunks | |
471 | movapd 8(%rsi,%rcx),%xmm1 | |
472 | movapd 24(%rsi,%rcx),%xmm2 | |
473 | shufpd $01,%xmm1,%xmm0 // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes) | |
474 | movapd 40(%rsi,%rcx),%xmm3 | |
475 | shufpd $01,%xmm2,%xmm1 | |
476 | movapd 56(%rsi,%rcx),%xmm4 | |
477 | shufpd $01,%xmm3,%xmm2 | |
478 | ||
479 | movapd %xmm0,(%rdi,%rcx) | |
480 | shufpd $01,%xmm4,%xmm3 | |
481 | movapd %xmm1,16(%rdi,%rcx) | |
482 | movapd %xmm2,32(%rdi,%rcx) | |
483 | movapd %xmm4,%xmm0 | |
484 | movapd %xmm3,48(%rdi,%rcx) | |
485 | ||
486 | addq $64,%rcx | |
487 | jnz 1b | |
488 | ||
489 | jmp LShort // copy remaining 0..63 bytes and done | |
490 | ||
491 | ||
492 | // Forward loop for medium length operands in which low four bits of %rsi == 1001 | |
493 | ||
494 | LMod9: | |
495 | movdqa -9(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq | |
496 | 1: // loop over 64-byte chunks | |
497 | movdqa 7(%rsi,%rcx),%xmm1 | |
498 | movdqa 23(%rsi,%rcx),%xmm2 | |
499 | movdqa 39(%rsi,%rcx),%xmm3 | |
500 | movdqa 55(%rsi,%rcx),%xmm4 | |
501 | ||
502 | movdqa %xmm0,%xmm5 | |
503 | movdqa %xmm4,%xmm0 | |
504 | ||
505 | palignr $9,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
506 | palignr $9,%xmm2,%xmm3 | |
507 | palignr $9,%xmm1,%xmm2 | |
508 | palignr $9,%xmm5,%xmm1 | |
509 | ||
510 | movdqa %xmm1,(%rdi,%rcx) | |
511 | movdqa %xmm2,16(%rdi,%rcx) | |
512 | movdqa %xmm3,32(%rdi,%rcx) | |
513 | movdqa %xmm4,48(%rdi,%rcx) | |
514 | ||
515 | addq $64,%rcx | |
516 | jnz 1b | |
517 | ||
518 | jmp LShort // copy remaining 0..63 bytes and done | |
519 | ||
520 | ||
521 | // Forward loop for medium length operands in which low four bits of %rsi == 1010 | |
522 | ||
523 | LMod10: | |
524 | movdqa -10(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq | |
525 | 1: // loop over 64-byte chunks | |
526 | movdqa 6(%rsi,%rcx),%xmm1 | |
527 | movdqa 22(%rsi,%rcx),%xmm2 | |
528 | movdqa 38(%rsi,%rcx),%xmm3 | |
529 | movdqa 54(%rsi,%rcx),%xmm4 | |
530 | ||
531 | movdqa %xmm0,%xmm5 | |
532 | movdqa %xmm4,%xmm0 | |
533 | ||
534 | palignr $10,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
535 | palignr $10,%xmm2,%xmm3 | |
536 | palignr $10,%xmm1,%xmm2 | |
537 | palignr $10,%xmm5,%xmm1 | |
538 | ||
539 | movdqa %xmm1,(%rdi,%rcx) | |
540 | movdqa %xmm2,16(%rdi,%rcx) | |
541 | movdqa %xmm3,32(%rdi,%rcx) | |
542 | movdqa %xmm4,48(%rdi,%rcx) | |
543 | ||
544 | addq $64,%rcx | |
545 | jnz 1b | |
546 | ||
547 | jmp LShort // copy remaining 0..63 bytes and done | |
548 | ||
549 | ||
550 | // Forward loop for medium length operands in which low four bits of %rsi == 1011 | |
551 | ||
552 | LMod11: | |
553 | movdqa -11(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq | |
554 | 1: // loop over 64-byte chunks | |
555 | movdqa 5(%rsi,%rcx),%xmm1 | |
556 | movdqa 21(%rsi,%rcx),%xmm2 | |
557 | movdqa 37(%rsi,%rcx),%xmm3 | |
558 | movdqa 53(%rsi,%rcx),%xmm4 | |
559 | ||
560 | movdqa %xmm0,%xmm5 | |
561 | movdqa %xmm4,%xmm0 | |
562 | ||
563 | palignr $11,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
564 | palignr $11,%xmm2,%xmm3 | |
565 | palignr $11,%xmm1,%xmm2 | |
566 | palignr $11,%xmm5,%xmm1 | |
567 | ||
568 | movdqa %xmm1,(%rdi,%rcx) | |
569 | movdqa %xmm2,16(%rdi,%rcx) | |
570 | movdqa %xmm3,32(%rdi,%rcx) | |
571 | movdqa %xmm4,48(%rdi,%rcx) | |
572 | ||
573 | addq $64,%rcx | |
574 | jnz 1b | |
575 | ||
576 | jmp LShort // copy remaining 0..63 bytes and done | |
577 | ||
578 | ||
579 | // Forward loop for medium length operands in which low four bits of %rsi == 1100 | |
580 | // We use the float single data type in order to use "movss" to merge vectors. | |
581 | ||
582 | LMod12: | |
583 | movss (%rsi,%rcx),%xmm0 // prefetch 1st four bytes of source, right justified | |
584 | jmp 1f | |
585 | .align 4,0x90 | |
586 | 1: // loop over 64-byte chunks | |
587 | pshufd $(0x93),4(%rsi,%rcx),%xmm1 // load and rotate right 12 bytes (mask -- 10 01 00 11) | |
588 | pshufd $(0x93),20(%rsi,%rcx),%xmm2 | |
589 | pshufd $(0x93),36(%rsi,%rcx),%xmm3 | |
590 | pshufd $(0x93),52(%rsi,%rcx),%xmm4 | |
591 | ||
592 | movaps %xmm4,%xmm5 | |
593 | movss %xmm3,%xmm4 // copy low 4 bytes of source into destination | |
594 | movss %xmm2,%xmm3 | |
595 | movss %xmm1,%xmm2 | |
596 | movss %xmm0,%xmm1 | |
597 | ||
598 | movaps %xmm1,(%rdi,%rcx) | |
599 | movaps %xmm2,16(%rdi,%rcx) | |
600 | movaps %xmm5,%xmm0 | |
601 | movaps %xmm3,32(%rdi,%rcx) | |
602 | movaps %xmm4,48(%rdi,%rcx) | |
603 | ||
604 | addq $64,%rcx | |
605 | jnz 1b | |
606 | ||
607 | jmp LShort // copy remaining 0..63 bytes and done | |
608 | ||
609 | ||
610 | // Forward loop for medium length operands in which low four bits of %rsi == 1101 | |
611 | ||
612 | LMod13: | |
613 | movdqa -13(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq | |
614 | 1: // loop over 64-byte chunks | |
615 | movdqa 3(%rsi,%rcx),%xmm1 | |
616 | movdqa 19(%rsi,%rcx),%xmm2 | |
617 | movdqa 35(%rsi,%rcx),%xmm3 | |
618 | movdqa 51(%rsi,%rcx),%xmm4 | |
619 | ||
620 | movdqa %xmm0,%xmm5 | |
621 | movdqa %xmm4,%xmm0 | |
622 | ||
623 | palignr $13,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
624 | palignr $13,%xmm2,%xmm3 | |
625 | palignr $13,%xmm1,%xmm2 | |
626 | palignr $13,%xmm5,%xmm1 | |
627 | ||
628 | movdqa %xmm1,(%rdi,%rcx) | |
629 | movdqa %xmm2,16(%rdi,%rcx) | |
630 | movdqa %xmm3,32(%rdi,%rcx) | |
631 | movdqa %xmm4,48(%rdi,%rcx) | |
632 | ||
633 | addq $64,%rcx | |
634 | jnz 1b | |
635 | ||
636 | jmp LShort // copy remaining 0..63 bytes and done | |
637 | ||
638 | ||
639 | // Forward loop for medium length operands in which low four bits of %rsi == 1110 | |
640 | ||
641 | LMod14: | |
642 | movdqa -14(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq | |
643 | 1: // loop over 64-byte chunks | |
644 | movdqa 2(%rsi,%rcx),%xmm1 | |
645 | movdqa 18(%rsi,%rcx),%xmm2 | |
646 | movdqa 34(%rsi,%rcx),%xmm3 | |
647 | movdqa 50(%rsi,%rcx),%xmm4 | |
648 | ||
649 | movdqa %xmm0,%xmm5 | |
650 | movdqa %xmm4,%xmm0 | |
651 | ||
652 | palignr $14,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
653 | palignr $14,%xmm2,%xmm3 | |
654 | palignr $14,%xmm1,%xmm2 | |
655 | palignr $14,%xmm5,%xmm1 | |
656 | ||
657 | movdqa %xmm1,(%rdi,%rcx) | |
658 | movdqa %xmm2,16(%rdi,%rcx) | |
659 | movdqa %xmm3,32(%rdi,%rcx) | |
660 | movdqa %xmm4,48(%rdi,%rcx) | |
661 | ||
662 | addq $64,%rcx | |
663 | jnz 1b | |
664 | ||
665 | jmp LShort // copy remaining 0..63 bytes and done | |
666 | ||
667 | ||
668 | // Forward loop for medium length operands in which low four bits of %rsi == 1111 | |
669 | ||
670 | LMod15: | |
671 | movdqa -15(%rsi,%rcx),%xmm0 // prime the loop by loading 1st source dq | |
672 | 1: // loop over 64-byte chunks | |
673 | movdqa 1(%rsi,%rcx),%xmm1 | |
674 | movdqa 17(%rsi,%rcx),%xmm2 | |
675 | movdqa 33(%rsi,%rcx),%xmm3 | |
676 | movdqa 49(%rsi,%rcx),%xmm4 | |
677 | ||
678 | movdqa %xmm0,%xmm5 | |
679 | movdqa %xmm4,%xmm0 | |
680 | ||
681 | palignr $15,%xmm3,%xmm4 // dest <- shr( dest || source, imm*8 ) | |
682 | palignr $15,%xmm2,%xmm3 | |
683 | palignr $15,%xmm1,%xmm2 | |
684 | palignr $15,%xmm5,%xmm1 | |
685 | ||
686 | movdqa %xmm1,(%rdi,%rcx) | |
687 | movdqa %xmm2,16(%rdi,%rcx) | |
688 | movdqa %xmm3,32(%rdi,%rcx) | |
689 | movdqa %xmm4,48(%rdi,%rcx) | |
690 | ||
691 | addq $64,%rcx | |
692 | jnz 1b | |
693 | ||
694 | jmp LShort // copy remaining 0..63 bytes and done | |
695 | ||
696 | ||
697 | // Reverse moves. These are not optimized as aggressively as their forward | |
698 | // counterparts, as they are only used with destructive overlap. | |
699 | // rdx = length | |
700 | // rsi = source ptr | |
701 | // rdi = dest ptr | |
// Copying proceeds from the high addresses downward, which is what makes a
// destructively-overlapping (dst > src) range safe.
702 | ||
703 | LReverse: | |
704 | addq %rdx,%rsi // point to end of strings | |
705 | addq %rdx,%rdi | |
706 | cmpq $(kShort),%rdx // long enough to bother with SSE? | |
707 | ja LReverseNotShort // yes | |
708 | ||
709 | // Handle reverse short copies. | |
710 | // edx = length (<= kShort) | |
711 | // rsi = one byte past end of source | |
712 | // rdi = one byte past end of dest | |
713 | ||
714 | LReverseShort: | |
715 | movl %edx,%ecx // copy length | |
716 | shrl $3,%ecx // #quadwords | |
717 | jz 3f | |
718 | 1: | |
719 | subq $8,%rsi | |
720 | movq (%rsi),%rax | |
721 | subq $8,%rdi | |
722 | movq %rax,(%rdi) | |
723 | decl %ecx | |
724 | jnz 1b | |
725 | 3: | |
726 | andl $7,%edx // bytes? | |
727 | jz 5f | |
728 | 4: | |
729 | decq %rsi | |
730 | movb (%rsi),%al | |
731 | decq %rdi | |
732 | movb %al,(%rdi) | |
733 | decl %edx | |
734 | jnz 4b | |
735 | 5: | |
736 | movq %r11,%rax // get return value (dst ptr) for memcpy/memmove | |
737 | popq %rbp | |
738 | ret | |
739 | ||
740 | // Handle a reverse move long enough to justify using SSE. | |
741 | // rdx = length (> kShort) | |
742 | // rsi = one byte past end of source | |
743 | // rdi = one byte past end of dest | |
744 | ||
745 | LReverseNotShort: | |
746 | movl %edi,%ecx // copy destination | |
747 | andl $15,%ecx // get #bytes to align destination (low bits of the end pointer, since we copy downward) | |
748 | je LReverseDestAligned // already aligned | |
749 | subq %rcx,%rdx // adjust length | |
750 | 1: // loop copying 1..15 bytes | |
751 | decq %rsi | |
752 | movb (%rsi),%al | |
753 | decq %rdi | |
754 | movb %al,(%rdi) | |
755 | decl %ecx | |
756 | jnz 1b | |
757 | ||
758 | // Destination is now aligned. Prepare for reverse loops. | |
759 | ||
760 | LReverseDestAligned: | |
761 | movq %rdx,%rcx // copy length | |
762 | andl $63,%edx // get remaining bytes for LReverseShort | |
763 | andq $-64,%rcx // get number of bytes we will copy in inner loop | |
764 | subq %rcx,%rsi // point to endpoint of copy | |
765 | subq %rcx,%rdi | |
766 | testl $15,%esi // is source aligned too? | |
767 | jnz LReverseUnalignedLoop // no | |
768 | ||
769 | LReverseAlignedLoop: // loop over 64-byte chunks | |
// %rcx counts down to zero; the negative displacements address the 64-byte
// chunk just below rsi/rdi + rcx.
770 | movdqa -16(%rsi,%rcx),%xmm0 | |
771 | movdqa -32(%rsi,%rcx),%xmm1 | |
772 | movdqa -48(%rsi,%rcx),%xmm2 | |
773 | movdqa -64(%rsi,%rcx),%xmm3 | |
774 | ||
775 | movdqa %xmm0,-16(%rdi,%rcx) | |
776 | movdqa %xmm1,-32(%rdi,%rcx) | |
777 | movdqa %xmm2,-48(%rdi,%rcx) | |
778 | movdqa %xmm3,-64(%rdi,%rcx) | |
779 | ||
780 | subq $64,%rcx | |
781 | jne LReverseAlignedLoop | |
782 | ||
783 | jmp LReverseShort // copy remaining 0..63 bytes and done | |
784 | ||
785 | ||
786 | // Reverse, unaligned loop. LDDQU==MOVDQU on these machines. | |
// Loads are unaligned (movdqu); the stores remain movdqa because the
// destination was 16-byte aligned above.
787 | ||
788 | LReverseUnalignedLoop: // loop over 64-byte chunks | |
789 | movdqu -16(%rsi,%rcx),%xmm0 | |
790 | movdqu -32(%rsi,%rcx),%xmm1 | |
791 | movdqu -48(%rsi,%rcx),%xmm2 | |
792 | movdqu -64(%rsi,%rcx),%xmm3 | |
793 | ||
794 | movdqa %xmm0,-16(%rdi,%rcx) | |
795 | movdqa %xmm1,-32(%rdi,%rcx) | |
796 | movdqa %xmm2,-48(%rdi,%rcx) | |
797 | movdqa %xmm3,-64(%rdi,%rcx) | |
798 | ||
799 | subq $64,%rcx | |
800 | jne LReverseUnalignedLoop | |
801 | ||
802 | jmp LReverseShort // copy remaining 0..63 bytes and done | |
803 | ||
// Register these variants with the platfunc dispatcher. NOTE(review): the
// third argument appears to be the required CPU capabilities (SSE2 + SSSE3 +
// 64-byte cache lines) and the fourth the disqualifying ones (SSE4.2 --
// presumably a separate sse42 variant takes precedence there); confirm
// against the PLATFUNC_DESCRIPTOR definition in platfunc.h.
804 | PLATFUNC_DESCRIPTOR(bcopy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2) | |
805 | PLATFUNC_DESCRIPTOR(memcpy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2) | |
806 | PLATFUNC_DESCRIPTOR(memmove,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2) |