/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4.  The inner loops use DCBA to avoid
 * reading destination cache lines.  Only the 7450 actually benefits from
 * this, and then only in the cold-cache case.  On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage.  Note we use R2, so this code will not run in a PEF/CFM
 * environment.  Note also the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *
 *   r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2  = "w8" or vrsave ("rv")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", or "cm1"
 *   r10 = "w5", "c96", or "cm97"
 *   r11 = "w6", "c128", or "cm129"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v4 = qw's loaded from source
 *   v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r2

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9
#define w5      r10
#define w6      r11
#define w7      r0
#define w8      r2

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9
#define c96     r10
#define cm97    r10
#define c128    r11
#define cm129   r11

#define vp      v0
#define vw      v5
#define vx      v6
#define vy      v7

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text

#define kMedium 32                      // too long for inline loopless code
#define kLong   96                      // long enough to justify use of Altivec

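/*
 * A minimal C sketch of the dispatch implied by kMedium and kLong, using
 * hypothetical names (copy_path, choose_path) that are not part of this file:
 *
 *      enum copy_path { PATH_SHORT, PATH_MEDIUM, PATH_LONG_VECTOR };
 *
 *      // Which of the three code paths a given length takes.
 *      static enum copy_path choose_path(unsigned long len)
 *      {
 *          if (len < 32)  return PATH_SHORT;       // 0-31 bytes: inline, loop-free
 *          if (len < 96)  return PATH_MEDIUM;      // 32-95 bytes: scalar 16-byte loops
 *          return PATH_LONG_VECTOR;                // >= 96 bytes: Altivec 64-byte loops
 *      }
 */
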
// Main entry points.

        .align  5
bcopy_g4:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r4                   // start to move registers to canonic spot
        mr      rs,r3
        blt+    LShort                  // handle short operands
        dcbt    0,r3                    // touch in source
        b       LMedium                 // join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_g4:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                    // touch in the first line of source
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium                 // handle medium or long operands

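/*
 * A minimal C sketch of the direction test above, with a hypothetical name
 * (must_copy_reverse) that is not part of this file.  A single unsigned
 * compare of (rd-rs) against the length decides the copy direction, because
 * the difference wraps to a huge value when the destination is below the
 * source:
 *
 *      #include <stdint.h>
 *
 *      static int must_copy_reverse(uintptr_t dst, uintptr_t src, unsigned long len)
 *      {
 *          return (dst - src) < len;   // dst lands inside [src, src+len): go backwards
 *      }
 */
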
// Handle short operands.

LShort:
        andi.   r0,rc,0x10              // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.

        beq     LShort16                // quadword to move?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr

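/*
 * A minimal C model of the loop-free forward path above, assuming the
 * operands do not require a reverse move; the name copy_short_forward is
 * hypothetical.  Each low bit of the length moves its own power-of-two
 * sized piece, so 0-31 bytes need no loop:
 *
 *      #include <string.h>
 *
 *      static void copy_short_forward(unsigned char *d, const unsigned char *s,
 *                                     unsigned long len)
 *      {
 *          if (len & 16) { memcpy(d, s, 16); d += 16; s += 16; }   // bit 27
 *          if (len & 8)  { memcpy(d, s, 8);  d += 8;  s += 8;  }   // bit 28
 *          if (len & 4)  { memcpy(d, s, 4);  d += 4;  s += 4;  }   // bit 29
 *          if (len & 2)  { memcpy(d, s, 2);  d += 2;  s += 2;  }   // bit 30
 *          if (len & 1)  { *d = *s; }                              // bit 31
 *      }
 */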

// Handle short reverse operands.
//      cr0 = bne if bit 27 of length is set
//      cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        beq     LShortReverse16         // quadword to move?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr

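/*
 * The same idea in reverse, as a C sketch (copy_short_reverse is a
 * hypothetical name): both pointers are first bumped past the end, and each
 * piece is loaded in full before it is stored, so a destination that
 * overlaps the source from above is handled safely:
 *
 *      #include <string.h>
 *
 *      static void copy_short_reverse(unsigned char *d, const unsigned char *s,
 *                                     unsigned long len)
 *      {
 *          unsigned char t[16];
 *          d += len; s += len;                             // point past the end
 *          if (len & 16) { d -= 16; s -= 16; memcpy(t, s, 16); memcpy(d, t, 16); }
 *          if (len & 8)  { d -= 8;  s -= 8;  memcpy(t, s, 8);  memcpy(d, t, 8);  }
 *          if (len & 4)  { d -= 4;  s -= 4;  memcpy(t, s, 4);  memcpy(d, t, 4);  }
 *          if (len & 2)  { d -= 2;  s -= 2;  memcpy(t, s, 2);  memcpy(d, t, 2);  }
 *          if (len & 1)  { *--d = *--s; }
 *      }
 */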

// Medium and long operands.  Use Altivec if long enough, else scalar loops.
//      w1  = (rd-rs), used to check for alignment
//      cr1 = blt iff we must move reverse

        .align  4
LMedium:
        dcbtst  0,rd                    // touch in destination
        cmplwi  cr7,rc,kLong            // long enough for vectors?
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7             // check relative 8-byte alignment
        andi.   w6,w3,7                 // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse      // handle reverse moves
        rlwinm  w4,w3,0,0x1F            // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0                // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong            // long enough for vectors

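/*
 * A minimal C sketch of the alignment arithmetic in LMedium, using
 * hypothetical names: negating the destination address and masking gives the
 * byte count to the next 8- or 32-byte boundary, and masking (rd-rs) tells
 * whether both operands can be aligned together:
 *
 *      #include <stdint.h>
 *
 *      static void alignment_info(uintptr_t dst, uintptr_t src,
 *                                 unsigned *to_align8, unsigned *to_align32,
 *                                 int *relatively_aligned8)
 *      {
 *          *to_align8  = (unsigned)(-dst & 7);     // neg w3,rd ; andi. w6,w3,7
 *          *to_align32 = (unsigned)(-dst & 31);    // rlwinm w4,w3,0,0x1F
 *          *relatively_aligned8 = ((dst - src) & 7) == 0;  // rlwinm r0,w1,0,0x7
 *      }
 */
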
// Medium length: use scalar loops.
//      w6/cr0 = #bytes to 8-byte align destination
//      cr6    = beq if relatively doubleword aligned

        sub     rc,rc,w6                // decrement length remaining
        beq     1f                      // skip if dest already doubleword aligned
        mtxer   w6                      // set up count for move
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
        add     rs,rs,w6                // bump ptrs past
        add     rd,rd,w6
1:
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // save remaining byte count here for LShort16
        mtctr   r0                      // set up 16-byte loop
        bne     cr6,3f                  // source not relatively doubleword aligned
        b       2f

        .align  4
2:                                      // loop over 16-byte aligned chunks
        lfd     f0,0(rs)
        lfd     f1,8(rs)
        addi    rs,rs,16
        stfd    f0,0(rd)
        stfd    f1,8(rd)
        addi    rd,rd,16
        bdnz    2b

        b       LShort16

        .align  4
3:                                      // loop over 16-byte unaligned chunks
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16
        bdnz    3b

        b       LShort16


// Vector loops.  First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

LFwdLong:
        cmpwi   w4,0                    // dest already aligned?
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned             // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,4f                   // doubleword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        addi    rs,rs,8
        stw     w1,0(rd)
        stw     w2,4(rd)
        addi    rd,rd,8
4:
        bf      27,LFwdAligned          // quadword?
        lwz     w1,0(rs)
        lwz     w2,4(rs)
        lwz     w3,8(rs)
        lwz     w4,12(rs)
        addi    rs,rs,16
        stw     w1,0(rd)
        stw     w2,4(rd)
        stw     w3,8(rd)
        stw     w4,12(rd)
        addi    rd,rd,16


// Destination is 32-byte aligned.
//      r0  = count of 64-byte chunks to move (not 0)
//      rd  = 32-byte aligned
//      rc  = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr    = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7    = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6    = beq if leftover byte count is 0
//      rv     = original value of vrsave
//      c16 etc = loaded

LFwdAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      c16,16                  // get constants used in lvx/stvx
        li      c32,32
        li      c48,48
        li      c96,96
        li      c128,128
        bne     cr5,LForwardVecUnal     // handle unaligned operands
        b       1f

        .align  4
1:                                      // loop over 64-byte chunks
        dcbt    c96,rs
        dcbt    c128,rs
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        dcba    0,rd                    // patched to NOP on some machines
        stvx    v1,0,rd
        stvx    v2,c16,rd
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, forward, unaligned vector loop.

LForwardVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       1f

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,c16,rs
        dcbt    c96,rs
        lvx     v3,c32,rs
        dcbt    c128,rs
        lvx     v4,c48,rs
        addi    rs,rs,64
        vperm   vw,v1,v2,vp
        lvx     v1,0,rs
        vperm   vx,v2,v3,vp
        dcba    0,rd                    // patched to NOP on some machines
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vw,v4,v1,vp
        dcba    c32,rd                  // patched to NOP on some machines
        stvx    vy,c32,rd
        stvx    vw,c48,rd
        addi    rd,rd,64
        bdnz    1b

        beq-    4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr

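/*
 * A minimal C model of the lvsl/vperm technique above, with hypothetical
 * names; it assumes a genuinely misaligned source (shift != 0) and, like the
 * assembly, keeps one aligned quadword of look-ahead.  The source is fetched
 * in aligned 16-byte quadwords, and each output quadword is built from the
 * tail of the previous load plus the head of the next one, offset by the
 * source misalignment:
 *
 *      #include <stdint.h>
 *      #include <string.h>
 *
 *      static void copy_shift_merge(unsigned char *dst16,         // 16-byte aligned
 *                                   const unsigned char *src,     // misaligned
 *                                   unsigned long quadwords)      // 16-byte outputs
 *      {
 *          unsigned long shift = (uintptr_t)src & 15;             // what lvsl encodes
 *          const unsigned char *aligned = src - shift;            // aligned load address
 *          unsigned char prev[16], next[16];
 *
 *          memcpy(prev, aligned, 16);                             // look-ahead load (v1)
 *          for (unsigned long i = 0; i < quadwords; i++) {
 *              memcpy(next, aligned + 16 * (i + 1), 16);
 *              for (unsigned b = 0; b < 16; b++)                  // vperm byte select
 *                  dst16[16 * i + b] = (shift + b < 16) ? prev[shift + b]
 *                                                       : next[shift + b - 16];
 *              memcpy(prev, next, 16);
 *          }
 *      }
 */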

// Medium and long, reverse moves.  We use altivec if the operands are long enough,
// else a lwz/stw loop.
//      w1  = (rd-rs), used to check for reverse and alignment
//      cr7 = bge if long

LMediumReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   w4,rd,0x1F              // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3             // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse        // long enough for vectors

// Scalar loop.
//      w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6                // decrement length remaining
        mtxer   w6                      // set up count for move
        sub     rs,rs,w6                // back up ptrs
        sub     rd,rd,w6
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs                 // move w6 bytes to align destination
        stswx   w1,0,rd
        mtctr   r0                      // set up 16-byte loop
        b       1f

        .align  4
1:                                      // loop over 16-byte aligned chunks
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)
        bdnz    1b

        b       LShortReverse16


// Reverse vector loops.  First, we must 32-byte align the destination.
//      w1     = (rd-rs), used to check for reverse and alignment
//      w4/cr0 = #bytes to 32-byte align destination
//      rc     = long enough for at least one vector loop

LLongReverse:
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned         // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
1:
        bf      30,2f                   // halfword?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      28,4f                   // doubleword?
        lwz     w1,-4(rs)
        lwzu    w2,-8(rs)
        stw     w1,-4(rd)
        stwu    w2,-8(rd)
4:
        bf      27,LReverseAligned      // quadword?
        lwz     w1,-4(rs)
        lwz     w2,-8(rs)
        lwz     w3,-12(rs)
        lwzu    w4,-16(rs)
        stw     w1,-4(rd)
        stw     w2,-8(rd)
        stw     w3,-12(rd)
        stwu    w4,-16(rd)

// Destination is 32-byte aligned.
//      r0  = count of 64-byte chunks to move (not 0)
//      rd  = 32-byte aligned
//      rc  = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr    = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7    = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6    = beq if leftover byte count is 0
//      rv     = original value of vrsave
//      cm1 etc = loaded

LReverseAligned:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      cm1,-1                  // get constants used in lvx/stvx
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        li      cm97,-97
        li      cm129,-129
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        b       1f

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        dcbt    cm97,rs
        dcbt    cm129,rs
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

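/*
 * A note on the negative constants above, as a C sketch with a hypothetical
 * name: lvx and stvx ignore the low four bits of the effective address, so
 * an offset of -1 selects the 16-byte quadword that ends just below the
 * current pointer, -17 the one before that, and so on:
 *
 *      #include <stdint.h>
 *
 *      static uintptr_t quadword_below(uintptr_t ptr, long offset)  // -1, -17, -33, -49
 *      {
 *          return (ptr + (uintptr_t)offset) & ~(uintptr_t)15;  // EA with low 4 bits cleared
 *      }
 */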

// Long, reverse, unaligned vector loop.

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        b       1f

        .align  4                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        dcbt    cm97,rs
        lvx     v3,cm33,rs
        dcbt    cm129,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vw,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vx,v3,v2,vp
        stvx    vw,cm1,rd
        vperm   vy,v4,v3,vp
        stvx    vx,cm17,rd
        vperm   vw,v1,v4,vp
        stvx    vy,cm33,rd
        stvx    vw,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     3f                      // no leftover quadwords
        mtctr   r0
2:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    2b
3:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr

        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)