/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = vrsave ("rv")
 *   r11 = unused
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
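
/* Illustration of the port (added comment; mnemonic spellings are for
 * exposition only, the kernel performs the rewrite on the binary):
 *
 *      32-bit commpage version         64-bit commpage version
 *      cmpwi   cr1,r0,0        --->    cmpdi   cr1,r0,0
 *      srwi    r0,rc,7         --->    srdi    r0,rc,7
 *
 * "andi." and "rlwinm." with masks confined to the low 32 bits are safe in
 * either mode because the bits they produce and test are identical in both.
 */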

#define kShort          64
#define kVeryLong       (128*1024)


// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonical spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

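/* The reverse-move test relies on unsigned wraparound (added sketch,
 * illustrative only).  w1 = rd-rs was computed at entry; LShort below
 * compares it, unsigned, against the length.  In C:
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      // Forward copying is unsafe only when dst lies in [src, src+len).
 *      // An unsigned (dst - src) makes dst < src wrap to a huge value,
 *      // so a single compare handles both directions.
 *      static int must_move_reverse(const char *src, const char *dst, size_t len) {
 *          return (uintptr_t)(dst - src) < (uintptr_t)len;
 *      }
 */
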
LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.

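/* How the bit tests below work (added note): the two mtcrf's above copied the
 * low 6 bits of rc into CR fields 6 and 7, so CR bits 26..31 hold the 32s,
 * 16s, 8s, 4s, 2s, and 1s digits of the length, and each "bf n,label" skips
 * one power-of-two-sized move.  Roughly, in C (helper names hypothetical):
 *
 *      if (len & 32) move32();
 *      if (len & 16) move16();
 *      if (len & 8)  move8();
 *      if (len & 4)  move4();
 *      if (len & 2)  move2();
 *      if (len & 1)  move1();
 */
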
        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

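/* Alignment arithmetic sketch (added, illustrative only).  LLong1 computes
 * the byte count needed to 16-byte align the destination with
 * "neg w3,rd; andi. w4,w3,0xF", and tests relative alignment with
 * "rlwinm w2,w1,0,0xF".  In C:
 *
 *      unsigned align = (unsigned)(-(uintptr_t)rd) & 0xF;    // 0..15 bytes
 *      int      relal = (((uintptr_t)rd - (uintptr_t)rs) & 0xF) == 0;
 */
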
LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8


// Forward, destination is 16-byte aligned.  There are five cases (a C sketch
// of this dispatch follows the register summary below):
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//          rs = alignment unknown
//          rd = 16-byte aligned
//          rc = bytes remaining
//          w2 = low 4 bits of (rd-rs), used to check alignment
//         cr5 = beq if source is also 16-byte aligned

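/* Dispatch sketch in C (added for clarity, illustrative only; helper names
 * are hypothetical):
 *
 *      if (len >= kVeryLong)                                       // case 1
 *          bigcopy();
 *      else if (len >= 128)                                        // cases 2,3
 *          src_16B_aligned ? lvx_stvx_loop() : lvx_vperm_stvx_loop();
 *      else                                                        // cases 4,5
 *          src_8B_aligned  ? ld_std_loop()   : lvx_vperm_stvx_small();
 */
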
LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

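/* AltiVec C equivalent of the loop below (added sketch, illustrative only;
 * assumes <altivec.h>, n a multiple of 32, and the same one-quadword
 * look-ahead past the source that lvx performs):
 *
 *      vector unsigned char vp = vec_lvsl(0, src);    // left-shift permute
 *      vector unsigned char v1 = vec_ld(0, src), v2, v3;
 *      while (n >= 32) {
 *          v2 = vec_ld(16, src);
 *          v3 = vec_ld(32, src);
 *          src += 32;  n -= 32;
 *          vec_st(vec_perm(v1, v2, vp),  0, dst);
 *          vec_st(vec_perm(v2, v3, vp), 16, dst);
 *          v1 = v3;
 *          dst += 32;
 *      }
 */
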
1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32


// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//      cr5 = beq if source is 16-byte aligned
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//      ctr = number of 128-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded

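/* Field-extraction sketch (added, illustrative only): the rlwinm's below are
 * rotate-and-mask operations.  In C:
 *
 *      w3 = rc & 0xF;          // rlwinm  w3,rc,0,28,31  -- last 0-15 bytes
 *      r0 = (rc >> 4) & 7;     // rlwinm. r0,rc,28,29,31 -- leftover QWs
 *
 * Both masks lie in the low 32 bits, so the results are mode-invariant per
 * the porting rules at the top of this file.
 */
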
LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//      cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//      cr5 = beq if source also 16-byte aligned
// We set up many registers:
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      cm1 = -1
//      rv = original value of vrsave

2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f


// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//      cm1 = -1

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr



// Very Big Copy Path.  Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r4 = source ptr
//      r5 = length (at least several pages)
//      r12 = dest ptr

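/* Mode-detection note (added): LBigCopy loads r2 = 0x40000000 and doubles it
 * with "add. r2,r2,r2".  In 32-bit mode cr0 is set from the low 32 bits of
 * the result, and 0x80000000 is negative there, so cr0_lt is set and the
 * "blta" is taken; in 64-bit mode the result 0x0000000080000000 is positive
 * and control falls through to the 64-bit save and branch.
 */
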
LBigCopy:
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code


        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)