/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.  This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort          80              // too short to bother with SSE (must be >=80)
#define kVeryLong       (500*1024)      // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode      ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"

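// Rough decision tree used below (see the individual paths for exact details):
//      length <= kShort                scalar doubleword/byte loops at LShort
//      kShort < length < kVeryLong     64-byte SSE loops with a 16-byte aligned destination,
//                                      dispatched on the low four bits of the source address
//                                      (LMod0..LMod15); LMod0 and LMod8 divert to a
//                                      "rep/movsl" microcode fastpath once the length
//                                      reaches about kFastUCode
//      length >= kVeryLong             call the commpage longcopy routine (LVeryLong)
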

// void bcopy(const void *src, void *dst, size_t len);

        .text
        .code64
        .align  5, 0x90
LZero:
Lbcopy_sse3x_64:                        // void bcopy(const void *src, void *dst, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // exchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

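// (Presumably the fixed 32-byte spacing matters because callers reach memcpy/memmove at a
// fixed offset from the bcopy commpage entry point; the .align 5 below provides it as long
// as the bcopy code above stays under 32 bytes.)
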
        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

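// Rough C equivalent of the short path below (illustrative sketch only):
//
//      const char *s = src;
//      char *d = dst;
//      for (size_t n = len >> 2; n; n--, s += 4, d += 4)       // doubleword loop (movl pair)
//              *(uint32_t *)d = *(const uint32_t *)s;
//      for (size_t n = len & 3; n; n--)                        // 0..3 leftover bytes
//              *d++ = *s++;
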
LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        cmpq    $(kVeryLong),%rdx       // long enough to justify heavyweight loops?
        jae     LVeryLong               // use very-long-operand path
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
        rep                             // align destination
        movsb


// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)

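// The dispatch below amounts to the following (illustrative C-like sketch only; LTable
// stores each loop's offset relative to LTable itself, so the table is position-independent):
//
//      int32_t *table = LTable;                  // located via the commpage base, see below
//      void    *loop  = (char *)table + table[source_address & 15];
//      goto    *loop;                            // the "jmp *%rax"
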
LDestAligned:
        movl    %edx,%ecx               // copy length
        movl    %esi,%eax               // copy low half of source address
        andl    $63,%edx                // get remaining bytes for LShort
        andl    $15,%eax                // mask to low 4 bits of source address
        andl    $-64,%ecx               // get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
//      lea     LTable(%rip),%r8        // point to dispatch table
        movq    $(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8    // work around 4586528
        addq    $(LTable-LZero),%r8                             // work around 4586528
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        movl    (%r8,%rax,4),%eax       // get offset of routine
        negq    %rcx                    // now generate offset to 1st byte to be copied
        addq    %r8,%rax                // generate address of copy loop
        jmp     *%rax                   // enter copy loop, selected by source alignment

        .align  2
LTable:                                 // table of copy loop addresses
        .long   (LMod0 - LTable)
        .long   (LMod1 - LTable)
        .long   (LMod2 - LTable)
        .long   (LMod3 - LTable)
        .long   (LMod4 - LTable)
        .long   (LMod5 - LTable)
        .long   (LMod6 - LTable)
        .long   (LMod7 - LTable)
        .long   (LMod8 - LTable)
        .long   (LMod9 - LTable)
        .long   (LMod10 - LTable)
        .long   (LMod11 - LTable)
        .long   (LMod12 - LTable)
        .long   (LMod13 - LTable)
        .long   (LMod14 - LTable)
        .long   (LMod15 - LTable)


// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark.  There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere.  We call
// the longcopy routine using the normal ABI:
//      rdi = dest
//      rsi = source
//      rdx = length (>= kVeryLong bytes)

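// Note that %r11 still holds the memcpy/memmove return value here (bcopy does not need one);
// since %r11 is a scratch register in the normal ABI, it is saved across the call by pushing
// it, and the pop lands it directly in %rax as the return value.
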
LVeryLong:
        pushq   %r11                    // save return value
        movq    $_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
        call    *%rax                   // call very long operand routine
        popq    %rax                    // pop return value
        popq    %rbp
        ret


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

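// On entry from LMod0/LMod8 below, %rcx = -(chunked length), a negative multiple of 64,
// and %edx = residual length (0..63).  After the negl, %ecx is a positive multiple of 64,
// so its low six bits are zero and the "orl %edx,%ecx" is equivalent to an add: it simply
// reassembles the total remaining byte count.
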
Lfastpath:
        addq    %rcx,%rsi               // restore ptrs to 1st byte of source and dest
        addq    %rcx,%rdi
        negl    %ecx                    // make length positive (known to be < 2GB)
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0001

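// The LMod1..LMod15 loops all use the same trick: every load and store is 16-byte aligned,
// and PALIGNR repacks the bytes before each store.  As noted in the comments below,
// "palignr $N,src,dst" concatenates dst:src into a 32-byte value (dst in the high half),
// shifts it right by N bytes, and leaves the low 16 bytes in dst.  Each iteration carries
// one source vector forward in %xmm0 so its bytes can be combined with the next aligned load.
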
LMod1:
        movdqa  -1(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  15(%rsi,%rcx),%xmm1
        movdqa  31(%rsi,%rcx),%xmm2
        movdqa  47(%rsi,%rcx),%xmm3
        movdqa  63(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
        movdqa  -2(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%rsi,%rcx),%xmm1
        movdqa  30(%rsi,%rcx),%xmm2
        movdqa  46(%rsi,%rcx),%xmm3
        movdqa  62(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
        movdqa  -3(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%rsi,%rcx),%xmm1
        movdqa  29(%rsi,%rcx),%xmm2
        movdqa  45(%rsi,%rcx),%xmm3
        movdqa  61(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

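// Technique for the mod-4 case: the low doubleword of the next aligned load is merged into
// the previous one with "movss" (which writes only the low 4 bytes of its destination), and
// "pshufd $0x39" then rotates the doublewords so the four source words land in memory order.
// The net effect is a 4-byte shift-merge without any unaligned access.
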
LMod4:
        movaps  -4(%rsi,%rcx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%rsi,%rcx),%xmm1
        movaps  28(%rsi,%rcx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%rsi,%rcx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%rsi,%rcx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%rdi,%rcx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%rdi,%rcx)
        movaps  %xmm2,32(%rdi,%rcx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
        movdqa  -5(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%rsi,%rcx),%xmm1
        movdqa  27(%rsi,%rcx),%xmm2
        movdqa  43(%rsi,%rcx),%xmm3
        movdqa  59(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
        movdqa  -6(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%rsi,%rcx),%xmm1
        movdqa  26(%rsi,%rcx),%xmm2
        movdqa  42(%rsi,%rcx),%xmm3
        movdqa  58(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
        movdqa  -7(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%rsi,%rcx),%xmm1
        movdqa  25(%rsi,%rcx),%xmm2
        movdqa  41(%rsi,%rcx),%xmm3
        movdqa  57(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

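// Here "shufpd $01,%xmm1,%xmm0" builds the next 16 output bytes directly: it puts the high
// quadword of %xmm0 in the low half of the result and the low quadword of %xmm1 in the high
// half, which is exactly an 8-byte shift-merge of the two aligned loads.
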
LMod8:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%rsi,%rcx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%rsi,%rcx),%xmm1
        movapd  24(%rsi,%rcx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%rsi,%rcx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%rsi,%rcx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%rdi,%rcx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%rdi,%rcx)
        movapd  %xmm2,32(%rdi,%rcx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
        movdqa  -9(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%rsi,%rcx),%xmm1
        movdqa  23(%rsi,%rcx),%xmm2
        movdqa  39(%rsi,%rcx),%xmm3
        movdqa  55(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
        movdqa  -10(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%rsi,%rcx),%xmm1
        movdqa  22(%rsi,%rcx),%xmm2
        movdqa  38(%rsi,%rcx),%xmm3
        movdqa  54(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
        movdqa  -11(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%rsi,%rcx),%xmm1
        movdqa  21(%rsi,%rcx),%xmm2
        movdqa  37(%rsi,%rcx),%xmm3
        movdqa  53(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

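// Unlike LMod4, the merges here are done on the load side: each aligned 16-byte block is
// loaded and rotated right by 12 bytes in a single "pshufd" (mask 0x93), and "movss" then
// patches in the low doubleword carried over from the previous block.
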
LMod12:
        movss   (%rsi,%rcx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%rsi,%rcx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%rsi,%rcx),%xmm2
        pshufd  $(0x93),36(%rsi,%rcx),%xmm3
        pshufd  $(0x93),52(%rsi,%rcx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%rdi,%rcx)
        movaps  %xmm2,16(%rdi,%rcx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%rdi,%rcx)
        movaps  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
        movdqa  -13(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%rsi,%rcx),%xmm1
        movdqa  19(%rsi,%rcx),%xmm2
        movdqa  35(%rsi,%rcx),%xmm3
        movdqa  51(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
        movdqa  -14(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%rsi,%rcx),%xmm1
        movdqa  18(%rsi,%rcx),%xmm2
        movdqa  34(%rsi,%rcx),%xmm3
        movdqa  50(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
        movdqa  -15(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%rsi,%rcx),%xmm1
        movdqa  17(%rsi,%rcx),%xmm2
        movdqa  33(%rsi,%rcx),%xmm3
        movdqa  49(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr

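// Rough C equivalent of the reverse path (illustrative sketch only; the real code moves
// quadwords and 16-byte chunks where it can, falling back to bytes):
//
//      const char *s = src + len;      // point one byte past the end of each buffer
//      char *d = dst + len;
//      while (len--)
//              *--d = *--s;
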
LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU==MOVDQU on these machines.

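// Note that only the loads are unaligned here: the destination was 16-byte aligned above,
// so the stores can stay "movdqa" and only the source side pays the misalignment cost.
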
LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse3x_64,_COMM_PAGE_BCOPY,kHasSSE2+kHasSupplementalSSE3+kCache64,0)