/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for 64-bit Pentium-M class processors with
 * SSE4 and 64-byte cache lines. This is the 64-bit version.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80              // too short to bother with SSE (must be >=80)
#define kVeryLong   (500*1024)      // large enough for non-temporal stores (>=8192 and <2GB)
#define kFastUCode  ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"

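// A rough C model of the length dispatch below (helper names are hypothetical;
// the real control flow is at LShort, LNotShort, Lfastpath, LMod0..LMod15 and
// LVeryLong):
//
//      void copy(void *dst, const void *src, size_t len) {
//          if (len <= kShort)    { short_gpr_copy(dst, src, len);    return; }  // LShort
//          if (len >= kVeryLong) { commpage_longcopy(dst, src, len); return; }  // LVeryLong
//          size_t head  = (-(uintptr_t)dst) & 15;       // bytes to 16-byte align dst
//          size_t inner = (len - head) & ~(size_t)63;   // whole 64-byte chunks
//          if ((((uintptr_t)src + head) & 7) == 0 && inner >= kFastUCode)
//              rep_movsl_copy(dst, src, len);           // Lfastpath (via LMod0/LMod8)
//          else
//              sse_chunk_copy(dst, src, len);           // LMod0..LMod15, tail via LShort
//      }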

// void bcopy(const void *src, void *dst, size_t len);

        .text
        .code64
        .align  5, 0x90
LZero:
Lbcopy_sse4_64:                         // void bcopy(const void *src, void *dst, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rsi,%rax               // copy dest ptr
        movq    %rdi,%rsi               // xchange source and dest ptrs
        movq    %rax,%rdi
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        jbe     LShort                  // no
        jmp     LNotShort

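// Note (added): the single unsigned compare above covers both the non-overlapping
// and the source-above-dest cases, because (dest - source) wraps to a huge unsigned
// value when dest < source:
//
//      forward copy is safe  <=>  (uintptr_t)(dst - src) >= len
//
// so only a destructive forward overlap (src < dst < src + len) takes the reverse path.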
//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushq   %rbp                    // set up a frame for backtraces
        movq    %rsp,%rbp
        movq    %rdi,%r11               // save return value here
        movq    %rdi,%rax
        subq    %rsi,%rax               // (dest - source)
        cmpq    %rdx,%rax               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies. As the most common case, this is the fall-through path.
//      rdx = length (<= kShort)
//      rsi = source ptr
//      rdi = dest ptr

LShort:
        movl    %edx,%ecx               // copy length using 32-bit operation
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%rsi),%eax
        addq    $4,%rsi
        movl    %eax,(%rdi)
        addq    $4,%rdi
        decl    %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      5f
4:                                      // loop copying bytes
        movb    (%rsi),%al
        incq    %rsi
        movb    %al,(%rdi)
        incq    %rdi
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE.
// First, 16-byte align the destination.
//      rdx = length (> kShort)
//      rsi = source ptr
//      rdi = dest ptr

LNotShort:
        cmpq    $(kVeryLong),%rdx       // long enough to justify heavyweight loops?
        jae     LVeryLong               // use very-long-operand path
        movl    %edi,%ecx               // copy low half of destination ptr
        negl    %ecx
        andl    $15,%ecx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %ecx,%edx               // decrement length
        rep                             // align destination
        movsb


// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort>=80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
//      rdx = residual length (0..63)
//      rcx = -(length to move), a multiple of 64 less than 2GB
//      rsi = ptr to 1st source byte not to move (unaligned)
//      rdi = ptr to 1st dest byte not to move (aligned)
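//
// Added sketch (hypothetical C names): the dispatch below is equivalent to
//
//      int32_t *table  = (int32_t *)(bcopy_commpage_base + (LTable - LZero));
//      void    *target = (char *)table + table[src & 15];
//      goto *target;                   // one of LMod0..LMod15
//
// where LTable holds 32-bit offsets relative to itself, and the commpage base
// address stands in for a rip-relative "lea" (see the 4586528 workaround below).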

LDestAligned:
        movl    %edx,%ecx               // copy length
        movl    %esi,%eax               // copy low half of source address
        andl    $63,%edx                // get remaining bytes for LShort
        andl    $15,%eax                // mask to low 4 bits of source address
        andl    $-64,%ecx               // get number of bytes we will copy in inner loop
// We'd like to use lea with rip-relative addressing, but cannot in a .code64 block.
//      lea     LTable(%rip),%r8        // point to dispatch table
        movq    $(_COMM_PAGE_32_TO_64(_COMM_PAGE_BCOPY)),%r8    // work around 4586528
        addq    $(LTable-LZero),%r8                             // work around 4586528
        addq    %rcx,%rsi               // point to 1st byte not copied
        addq    %rcx,%rdi
        movl    (%r8,%rax,4),%eax       // get offset of routine
        negq    %rcx                    // now generate offset to 1st byte to be copied
        addq    %r8,%rax                // generate address of copy loop
        jmp     *%rax                   // enter copy loop, selected by source alignment

        .align  2
LTable:                                 // table of copy loop addresses
        .long   (LMod0 - LTable)
        .long   (LMod1 - LTable)
        .long   (LMod2 - LTable)
        .long   (LMod3 - LTable)
        .long   (LMod4 - LTable)
        .long   (LMod5 - LTable)
        .long   (LMod6 - LTable)
        .long   (LMod7 - LTable)
        .long   (LMod8 - LTable)
        .long   (LMod9 - LTable)
        .long   (LMod10 - LTable)
        .long   (LMod11 - LTable)
        .long   (LMod12 - LTable)
        .long   (LMod13 - LTable)
        .long   (LMod14 - LTable)
        .long   (LMod15 - LTable)


// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmark. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI:
//      rdi = dest
//      rsi = source
//      rdx = length (>= kVeryLong bytes)

LVeryLong:
        pushq   %r11                    // save return value
        movq    $_COMM_PAGE_32_TO_64(_COMM_PAGE_LONGCOPY),%rax
        call    *%rax                   // call very long operand routine
        popq    %rax                    // pop return value
        popq    %rbp
        ret


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 16-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.
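//
// Note (added): Lfastpath is entered from LMod0/LMod8 with %rcx holding the negated
// inner-loop byte count, so the guard
//
//      cmpl    $(-kFastUCode),%ecx
//      jle     Lfastpath
//
// simply means "if (inner_loop_bytes >= kFastUCode) use rep/movsl", expressed on
// the negated count.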

Lfastpath:
        addq    %rcx,%rsi               // restore ptrs to 1st byte of source and dest
        addq    %rcx,%rdi
        negl    %ecx                    // make length positive (known to be < 2GB)
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %rsi == 0000

LMod0:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%rsi,%rcx),%xmm0
        movdqa  16(%rsi,%rcx),%xmm1
        movdqa  32(%rsi,%rcx),%xmm2
        movdqa  48(%rsi,%rcx),%xmm3

        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm1,16(%rdi,%rcx)
        movdqa  %xmm2,32(%rdi,%rcx)
        movdqa  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0001

LMod1:
        movdqa  -1(%rsi,%rcx),%xmm0     // prime the loop by loading 1st quadword
1:                                      // loop over 64-byte chunks
        movdqa  15(%rsi,%rcx),%xmm1
        movdqa  31(%rsi,%rcx),%xmm2
        movdqa  47(%rsi,%rcx),%xmm3
        movdqa  63(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done

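// Note (added): each 64-byte iteration of an LModN palignr loop (N = 1..15, other
// than the movss/shufpd variants) stores the unaligned bytes src[k..k+15] for
// k = 0, 16, 32, 48 within the chunk, but builds each store from two aligned loads:
//
//      dest_vec = low 16 bytes of ((next_aligned : prev_aligned) >> (N*8))
//
// carrying the previous iteration's last aligned load forward in %xmm0.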


// Forward loop for medium length operands in which low four bits of %rsi == 0010

LMod2:
        movdqa  -2(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%rsi,%rcx),%xmm1
        movdqa  30(%rsi,%rcx),%xmm2
        movdqa  46(%rsi,%rcx),%xmm3
        movdqa  62(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0011

LMod3:
        movdqa  -3(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%rsi,%rcx),%xmm1
        movdqa  29(%rsi,%rcx),%xmm2
        movdqa  45(%rsi,%rcx),%xmm3
        movdqa  61(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

LMod4:
        movaps  -4(%rsi,%rcx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%rsi,%rcx),%xmm1
        movaps  28(%rsi,%rcx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%rsi,%rcx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%rsi,%rcx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%rdi,%rcx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%rdi,%rcx)
        movaps  %xmm2,32(%rdi,%rcx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done

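// Note (added): the movss/pshufd pair above builds each aligned 16-byte store from
// two aligned loads. With lo = src[k-4 .. k+11] and hi = src[k+12 .. k+27]:
//
//      movss   hi,lo                   // replace lo's low dword with hi's low dword
//      pshufd  $0x39,lo,lo             // rotate the four dwords right by one
//
// leaves lo holding the unaligned bytes src[k .. k+15], ready for an aligned movaps.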

// Forward loop for medium length operands in which low four bits of %rsi == 0101

LMod5:
        movdqa  -5(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%rsi,%rcx),%xmm1
        movdqa  27(%rsi,%rcx),%xmm2
        movdqa  43(%rsi,%rcx),%xmm3
        movdqa  59(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0110

LMod6:
        movdqa  -6(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%rsi,%rcx),%xmm1
        movdqa  26(%rsi,%rcx),%xmm2
        movdqa  42(%rsi,%rcx),%xmm3
        movdqa  58(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 0111

LMod7:
        movdqa  -7(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%rsi,%rcx),%xmm1
        movdqa  25(%rsi,%rcx),%xmm2
        movdqa  41(%rsi,%rcx),%xmm3
        movdqa  57(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

LMod8:
        cmpl    $(-kFastUCode),%ecx     // %rcx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%rsi,%rcx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%rsi,%rcx),%xmm1
        movapd  24(%rsi,%rcx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%rsi,%rcx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%rsi,%rcx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%rdi,%rcx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%rdi,%rcx)
        movapd  %xmm2,32(%rdi,%rcx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done

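// Note (added): with 8-byte relative alignment, shufpd (SSE2) plays the same role
// here as palignr $8 would:
//
//      shufpd  $1,hi,lo                // lo = { high 8 bytes of lo, low 8 bytes of hi }
//
// so lo ends up holding the unaligned 16 bytes straddling the two aligned loads.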

// Forward loop for medium length operands in which low four bits of %rsi == 1001

LMod9:
        movdqa  -9(%rsi,%rcx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%rsi,%rcx),%xmm1
        movdqa  23(%rsi,%rcx),%xmm2
        movdqa  39(%rsi,%rcx),%xmm3
        movdqa  55(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1010

LMod10:
        movdqa  -10(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%rsi,%rcx),%xmm1
        movdqa  22(%rsi,%rcx),%xmm2
        movdqa  38(%rsi,%rcx),%xmm3
        movdqa  54(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1011

LMod11:
        movdqa  -11(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%rsi,%rcx),%xmm1
        movdqa  21(%rsi,%rcx),%xmm2
        movdqa  37(%rsi,%rcx),%xmm3
        movdqa  53(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

LMod12:
        movss   (%rsi,%rcx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%rsi,%rcx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%rsi,%rcx),%xmm2
        pshufd  $(0x93),36(%rsi,%rcx),%xmm3
        pshufd  $(0x93),52(%rsi,%rcx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%rdi,%rcx)
        movaps  %xmm2,16(%rdi,%rcx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%rdi,%rcx)
        movaps  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1101

LMod13:
        movdqa  -13(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%rsi,%rcx),%xmm1
        movdqa  19(%rsi,%rcx),%xmm2
        movdqa  35(%rsi,%rcx),%xmm3
        movdqa  51(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1110

LMod14:
        movdqa  -14(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%rsi,%rcx),%xmm1
        movdqa  18(%rsi,%rcx),%xmm2
        movdqa  34(%rsi,%rcx),%xmm3
        movdqa  50(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %rsi == 1111

LMod15:
        movdqa  -15(%rsi,%rcx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%rsi,%rcx),%xmm1
        movdqa  17(%rsi,%rcx),%xmm2
        movdqa  33(%rsi,%rcx),%xmm3
        movdqa  49(%rsi,%rcx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%rdi,%rcx)
        movdqa  %xmm2,16(%rdi,%rcx)
        movdqa  %xmm3,32(%rdi,%rcx)
        movdqa  %xmm4,48(%rdi,%rcx)

        addq    $64,%rcx
        jnz     1b

        jmp     LShort                  // copy remaining 0..63 bytes and done


// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      rdx = length
//      rsi = source ptr
//      rdi = dest ptr
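//
// Note (added): the reverse path is the usual backwards memmove, equivalent to
//
//      while (len--) dst[len] = src[len];
//
// so a destructive forward overlap (src < dst < src+len) never reads a byte after
// it has been overwritten.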

LReverse:
        addq    %rdx,%rsi               // point to end of strings
        addq    %rdx,%rdi
        cmpq    $(kShort),%rdx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      edx = length (<= kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseShort:
        movl    %edx,%ecx               // copy length
        shrl    $3,%ecx                 // #quadwords
        jz      3f
1:
        subq    $8,%rsi
        movq    (%rsi),%rax
        subq    $8,%rdi
        movq    %rax,(%rdi)
        decl    %ecx
        jnz     1b
3:
        andl    $7,%edx                 // bytes?
        jz      5f
4:
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %edx
        jnz     4b
5:
        movq    %r11,%rax               // get return value (dst ptr) for memcpy/memmove
        popq    %rbp
        ret

// Handle a reverse move long enough to justify using SSE.
//      rdx = length (> kShort)
//      rsi = one byte past end of source
//      rdi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%ecx               // copy destination
        andl    $15,%ecx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subq    %rcx,%rdx               // adjust length
1:                                      // loop copying 1..15 bytes
        decq    %rsi
        movb    (%rsi),%al
        decq    %rdi
        movb    %al,(%rdi)
        decl    %ecx
        jnz     1b

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
        movq    %rdx,%rcx               // copy length
        andl    $63,%edx                // get remaining bytes for LReverseShort
        andq    $-64,%rcx               // get number of bytes we will copy in inner loop
        subq    %rcx,%rsi               // point to endpoint of copy
        subq    %rcx,%rdi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%rsi,%rcx),%xmm0
        movdqa  -32(%rsi,%rcx),%xmm1
        movdqa  -48(%rsi,%rcx),%xmm2
        movdqa  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop. LDDQU==MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%rsi,%rcx),%xmm0
        movdqu  -32(%rsi,%rcx),%xmm1
        movdqu  -48(%rsi,%rcx),%xmm2
        movdqu  -64(%rsi,%rcx),%xmm3

        movdqa  %xmm0,-16(%rdi,%rcx)
        movdqa  %xmm1,-32(%rdi,%rcx)
        movdqa  %xmm2,-48(%rdi,%rcx)
        movdqa  %xmm3,-64(%rdi,%rcx)

        subq    $64,%rcx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse4_64,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)