/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with SSE4
 * and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80              // too short to bother with SSE (must be >= 80)
#define kVeryLong   (500*1024)      // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)  // cutoff for microcode fastpath for "rep/movsl"

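/*
 * Illustrative sketch (not part of the original source): the overall dispatch
 * implemented below, expressed in C. The helper names are hypothetical; only
 * the thresholds come from the #defines above.
 *
 *      static void bcopy_dispatch(void *dst, const void *src, size_t len)
 *      {
 *              if ((size_t)((char *)dst - (char *)src) < len)
 *                      copy_reverse(dst, src, len);        // destructive overlap
 *              else if (len <= kShort)
 *                      copy_short(dst, src, len);          // doubleword + byte loops
 *              else if (len >= kVeryLong)
 *                      commpage_longcopy(dst, src, len);   // non-temporal store path
 *              else
 *                      copy_forward_sse(dst, src, len);    // 16 alignment-specific loops
 *              // Within the SSE path, mutually 8-byte aligned operands of at least
 *              // kFastUCode bytes divert to the "rep/movsl" microcode fastpath.
 *      }
 */
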
// void bcopy(const void *src, void *dst, size_t len);

        .text
        .align  5, 0x90
LZero:
Lbcopy_sse4:                            // void bcopy(const void *src, void *dst, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//
// NB: These need to be 32 bytes from bcopy():
//

        .align  5, 0x90
Lmemcpy:                                // void *memcpy(void *dst, const void *src, size_t len)
Lmemmove:                               // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies. As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr
Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      Lexit
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
Lexit:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret


LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned. Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source. All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads. Since kShort >= 80 and we've moved at most 15 bytes already,
// there is at least one chunk. When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

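/*
 * Illustrative C sketch of the setup below (not from the original source): split
 * the remaining length into 64-byte chunks plus a 0..63 byte residual, advance
 * both pointers past the chunked region, and walk it with a negative index that
 * counts up toward zero. Variable names are hypothetical.
 *
 *      size_t residual = len & 63;             // finished later by Lshort
 *      size_t chunked  = len & ~(size_t)63;    // bytes moved by the inner loop
 *      s += chunked;                           // point to 1st byte not moved
 *      d += chunked;
 *      for (intptr_t ix = -(intptr_t)chunked; ix != 0; ix += 64)
 *              copy_64_bytes(d + ix, s + ix);  // one of 16 loops, chosen by ((uintptr_t)s & 15)
 */
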
LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        movl    (_COMM_PAGE_BCOPY+LTable-LZero)(,%eax,4),%eax
        jmp     *%eax

        .align  2
LTable:                                 // table of copy loop addresses
        .long   LMod0  + _COMM_PAGE_BCOPY - LZero
        .long   LMod1  + _COMM_PAGE_BCOPY - LZero
        .long   LMod2  + _COMM_PAGE_BCOPY - LZero
        .long   LMod3  + _COMM_PAGE_BCOPY - LZero
        .long   LMod4  + _COMM_PAGE_BCOPY - LZero
        .long   LMod5  + _COMM_PAGE_BCOPY - LZero
        .long   LMod6  + _COMM_PAGE_BCOPY - LZero
        .long   LMod7  + _COMM_PAGE_BCOPY - LZero
        .long   LMod8  + _COMM_PAGE_BCOPY - LZero
        .long   LMod9  + _COMM_PAGE_BCOPY - LZero
        .long   LMod10 + _COMM_PAGE_BCOPY - LZero
        .long   LMod11 + _COMM_PAGE_BCOPY - LZero
        .long   LMod12 + _COMM_PAGE_BCOPY - LZero
        .long   LMod13 + _COMM_PAGE_BCOPY - LZero
        .long   LMod14 + _COMM_PAGE_BCOPY - LZero
        .long   LMod15 + _COMM_PAGE_BCOPY - LZero

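/*
 * Note on the table above: this code runs at its fixed commpage address,
 * _COMM_PAGE_BCOPY, rather than at its link address, so both the table base and
 * every entry are rebased by (_COMM_PAGE_BCOPY - LZero). Conceptually
 * (illustrative C; the assembly does a tail jmp, not a call):
 *
 *      uintptr_t  base   = _COMM_PAGE_BCOPY;                    // runtime address of LZero
 *      uintptr_t *table  = (uintptr_t *)(base + table_offset);  // table_offset == LTable - LZero
 *      uintptr_t  target = table[(uintptr_t)s & 15];            // entries are already rebased
 *      ((void (*)(void))target)();                              // dispatch to LMod0..LMod15
 */
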
// Very long forward moves. These are at least several pages. They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmarking. There isn't enough room for them in the
// area reserved on the commpage for bcopy, so we put them elsewhere. We call
// the longcopy routine using the normal ABI.

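/*
 * The call below follows the normal 32-bit C calling convention: arguments are
 * pushed right to left, so the longcopy routine receives (dest, source, length).
 * Roughly (illustrative C; the exact prototype is an assumption, not stated in
 * this file):
 *
 *      typedef void (*longcopy_fn)(void *dst, const void *src, size_t len);
 *      longcopy_fn longcopy = (longcopy_fn)_COMM_PAGE_LONGCOPY;
 *      longcopy(dst, src, len);        // caller pops the 12 bytes of arguments
 */
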
LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        movl    $(_COMM_PAGE_LONGCOPY),%eax
        call    *%eax                   // do the long copy
        addl    $12,%esp                // pop off our parameters
        jmp     Lexit


// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot-cache case, and from
// about 256 bytes up to kVeryLong for cold caches. This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (i.e., kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

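/*
 * Sketch of the resulting heuristic (illustrative C; thresholds come from the
 * #defines above). The fastpath is only reached from LMod0 and LMod8, i.e. when
 * source and destination share the same alignment modulo 8 (so that once the
 * destination is 16-byte aligned, the source is at least 8-byte aligned), and
 * only when the 64-byte-chunked portion of the destination-aligned length is at
 * least kFastUCode. Totals of kVeryLong or more took the longcopy path already.
 *
 *      int use_rep_movsl = ((((uintptr_t)src - (uintptr_t)dst) & 7) == 0) &&
 *                          ((len & ~(size_t)63) >= kFastUCode) &&
 *                          (len < kVeryLong);
 */
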
Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes


// Forward loop for medium length operands in which low four bits of %esi == 0000

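/*
 * The aligned inner loop below, roughly in C with SSE2 intrinsics (illustrative;
 * the real loop first checks kFastUCode and indexes with a negative offset):
 *
 *      #include <emmintrin.h>
 *      for (intptr_t ix = -(intptr_t)chunked; ix != 0; ix += 64) {
 *              __m128i v0 = _mm_load_si128((const __m128i *)(s + ix));
 *              __m128i v1 = _mm_load_si128((const __m128i *)(s + ix + 16));
 *              __m128i v2 = _mm_load_si128((const __m128i *)(s + ix + 32));
 *              __m128i v3 = _mm_load_si128((const __m128i *)(s + ix + 48));
 *              _mm_store_si128((__m128i *)(d + ix),      v0);
 *              _mm_store_si128((__m128i *)(d + ix + 16), v1);
 *              _mm_store_si128((__m128i *)(d + ix + 32), v2);
 *              _mm_store_si128((__m128i *)(d + ix + 48), v3);
 *      }
 */
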
LMod0:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0001

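/*
 * All of the misaligned-source loops below rely on PALIGNR (Supplemental SSE3),
 * which concatenates two 16-byte vectors and shifts the 32-byte result right by
 * an immediate byte count: "palignr $n,%xmm_lo,%xmm_hi" leaves
 * shr(%xmm_hi || %xmm_lo, n*8) in %xmm_hi. Illustrative C with the intrinsic,
 * for the n == 1 case handled by LMod1 (variable names are hypothetical):
 *
 *      #include <tmmintrin.h>
 *      __m128i lo  = _mm_load_si128((const __m128i *)(s + ix - 1));   // aligned, trails by 1
 *      __m128i hi  = _mm_load_si128((const __m128i *)(s + ix + 15));  // next aligned dq
 *      __m128i out = _mm_alignr_epi8(hi, lo, 1);   // the 16 source bytes starting at s+ix
 *      _mm_store_si128((__m128i *)(d + ix), out);  // aligned store
 */
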
LMod1:
        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

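/*
 * Technique used below (illustrative C): "movss" replaces just the low 4 bytes
 * of the trailing vector with the low 4 bytes of the next aligned load, and
 * "pshufd $0x39" then rotates the merged vector right by one dword, yielding
 * the 16 source bytes that straddle the two aligned loads.
 *
 *      #include <emmintrin.h>
 *      __m128  prev   = _mm_load_ps((const float *)(s + ix - 4));   // aligned, trails by 4
 *      __m128  next   = _mm_load_ps((const float *)(s + ix + 12));  // next aligned vector
 *      __m128  merged = _mm_move_ss(prev, next);                    // low dword from 'next'
 *      __m128i out    = _mm_shuffle_epi32(_mm_castps_si128(merged), 0x39);  // rotate right 4 bytes
 *      _mm_store_si128((__m128i *)(d + ix), out);
 */
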
LMod4:
        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

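/*
 * Technique used below (illustrative C): with the source 8 bytes away from
 * 16-byte alignment, "shufpd $1" glues the high quadword of the previous
 * aligned load to the low quadword of the next one, i.e. an 8-byte shift
 * across the pair.
 *
 *      #include <emmintrin.h>
 *      __m128d prev = _mm_load_pd((const double *)(s + ix - 8));   // aligned, trails by 8
 *      __m128d next = _mm_load_pd((const double *)(s + ix + 8));   // next aligned vector
 *      __m128d out  = _mm_shuffle_pd(prev, next, 1);               // { prev[1], next[0] }
 *      _mm_store_pd((double *)(d + ix), out);
 */
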
LMod8:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

LMod12:
        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1   // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves. These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

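/*
 * Reverse copies are only reached when (uintptr_t)(dst - src) < len, i.e. the
 * destination overlaps the tail of the source, so a forward copy would clobber
 * source bytes before reading them. A rough C equivalent (illustrative; the
 * real code below moves doublewords and 64-byte SSE chunks, not single bytes):
 *
 *      s += len;                       // point one past the ends
 *      d += len;
 *      while (len--)
 *              *--d = *--s;            // copy backward
 */
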
LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        jz      3f
1:
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned. Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for LReverseShort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop. LDDQU == MOVDQU on these machines.

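/*
 * Illustrative C for the unaligned-source loop below: unaligned 16-byte loads
 * paired with aligned stores (the destination was aligned above). On these
 * processors LDDQU behaves like MOVDQU, so the plain unaligned-load intrinsic
 * models either instruction.
 *
 *      #include <emmintrin.h>
 *      __m128i v = _mm_loadu_si128((const __m128i *)(s + ix - 16));  // unaligned load
 *      _mm_store_si128((__m128i *)(d + ix - 16), v);                 // aligned store
 */
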
LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


        COMMPAGE_DESCRIPTOR(bcopy_sse4,_COMM_PAGE_BCOPY,kHasSSE3+kHasSupplementalSSE3+kCache64,0)