/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2  = "w8" or vrsave ("rv")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r10 = "w5", "c96", or "cm97"
 *   r11 = "w6", "c128", or "cm129"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v4 = qw's loaded from source
 *   v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
#define kMedium         32          // too long for inline loopless code
#define kLong           96          // long enough to justify use of Altivec
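/*
 * These two thresholds split every copy into one of three strategies. A rough
 * C sketch of the dispatch; the helper names are purely illustrative and do
 * not correspond to labels in this file:
 *
 *      #include <stddef.h>
 *
 *      void copy_short(void *dst, const void *src, size_t len);    // loopless path
 *      void copy_medium(void *dst, const void *src, size_t len);   // scalar GPR loops
 *      void copy_vector(void *dst, const void *src, size_t len);   // Altivec loops
 *
 *      void copy_dispatch(void *dst, const void *src, size_t len) {
 *          if (len < 32)               // kMedium
 *              copy_short(dst, src, len);
 *          else if (len < 96)          // kLong
 *              copy_medium(dst, src, len);
 *          else                        // long enough to amortize vector setup
 *              copy_vector(dst, src, len);
 *      }
 */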
// Main entry points.

bcopy_g4:                           // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium          // short or long?
        sub     w1,r4,r3            // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        mr      rd,r4               // start to move registers to canonic spot
        blt+    LShort              // handle short operands
        dcbt    0,r3                // touch in 1st line of source
        b       LMedium             // join medium/long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_g4:                         // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                        // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium          // short or long?
        sub     w1,r3,r4            // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                // touch in the first line of source
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        mr      rd,r3               // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium             // handle medium or long operands
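/*
 * The (rd-rs)<rc comparison is the standard unsigned overlap test: computed
 * modulo 2^32, (dst - src) is less than len exactly when the destination
 * starts inside the source buffer, in which case a forward copy would clobber
 * source bytes before they are read. Illustrative C, not part of this file:
 *
 *      #include <stdint.h>
 *      #include <stddef.h>
 *
 *      static int must_copy_backward(const void *dst, const void *src, size_t len) {
 *          // the subtraction wraps, so dst < src also selects the forward path
 *          return (uintptr_t)dst - (uintptr_t)src < (uintptr_t)len;
 *      }
 */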
// Handle short operands.

        andi.   r0,rc,0x10          // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc             // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

        beq     LShort16            // quadword to move?

LShort16:                           // join here to xfer 0-15 bytes
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // skip if no odd byte
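/*
 * The short code never loops: mtcrf copies the low length bits into cr7, and
 * each bf tests one bit to decide whether a fixed-size move runs. The same
 * decomposition in C (forward direction, illustrative only; the asm uses
 * lwz/lhz/lbz and friends where this sketch uses fixed-size memcpy):
 *
 *      #include <stdint.h>
 *      #include <string.h>
 *      #include <stddef.h>
 *
 *      static void copy_tail_forward(uint8_t *d, const uint8_t *s, size_t len) {
 *          if (len & 16) { memcpy(d, s, 16); d += 16; s += 16; }   // quadword (bit 27)
 *          if (len & 8)  { memcpy(d, s, 8);  d += 8;  s += 8;  }   // doubleword (bit 28)
 *          if (len & 4)  { memcpy(d, s, 4);  d += 4;  s += 4;  }   // word (bit 29)
 *          if (len & 2)  { memcpy(d, s, 2);  d += 2;  s += 2;  }   // halfword (bit 30)
 *          if (len & 1)  { *d = *s; }                              // odd byte (bit 31)
 *      }
 */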
// Handle short reverse operands.
//      cr0 = bne if bit 27 of length is set
//      cr7 = bits 28-31 of length

        add     rs,rs,rc            // adjust ptrs for reverse move
        beq     LShortReverse16     // quadword to move?

LShortReverse16:                    // join here to xfer 0-15 bytes and return
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // done if no odd byte
        lbz     w1,-1(rs)           // no update
// Medium and long operands. Use Altivec if long enough, else scalar loops.
//      w1 = (rd-rs), used to check for alignment
//      cr1 = blt iff we must move reverse

        dcbtst  0,rd                // touch in destination
        cmplwi  cr7,rc,kLong        // long enough for vectors?
        neg     w3,rd               // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7         // check relative 8-byte alignment
        andi.   w6,w3,7             // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse  // handle reverse moves
        rlwinm  w4,w3,0,0x1F        // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0            // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong        // long enough for vectors
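/*
 * The neg/andi/rlwinm group is pure address arithmetic: negating the
 * destination pointer and masking gives the byte count to the next 8- or
 * 32-byte boundary, and the low bits of (rd-rs) say whether source and
 * destination can reach the same alignment together. Equivalent C sketch
 * (names are illustrative):
 *
 *      #include <stdint.h>
 *
 *      static unsigned bytes_to_align8(const void *dst) {
 *          return (0 - (uintptr_t)dst) & 7;            // 0 if already aligned
 *      }
 *      static unsigned bytes_to_align32(const void *dst) {
 *          return (0 - (uintptr_t)dst) & 31;
 *      }
 *      static int relatively_aligned8(const void *dst, const void *src) {
 *          return (((uintptr_t)dst - (uintptr_t)src) & 7) == 0;
 *      }
 */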
// Medium length: use scalar loops.
//      w6/cr0 = #bytes to 8-byte align destination
//      cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6            // decrement length remaining
        beq     1f                  // skip if dest already doubleword aligned
        mtxer   w6                  // set up count for move
        lswx    w1,0,rs             // move w6 bytes to align destination
        add     rs,rs,w6            // bump ptrs past

        srwi    r0,rc,4             // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc             // save remaining byte count here for LShort16
        mtctr   r0                  // set up 16-byte loop
        bne     cr6,3f              // source not relatively doubleword aligned

2:                                  // loop over 16-byte aligned chunks

3:                                  // loop over 16-byte unaligned chunks
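/*
 * Shape of the medium path in C: lswx/stswx move the 0-7 alignment bytes, the
 * counter then walks 16-byte chunks, and the low four bits of the length are
 * left for the LShort16 tail code. Illustrative sketch only (assumes
 * len >= kMedium, so the alignment prefix always fits):
 *
 *      #include <stdint.h>
 *      #include <string.h>
 *      #include <stddef.h>
 *
 *      static void copy_medium_forward(uint8_t *d, const uint8_t *s, size_t len) {
 *          size_t align = (0 - (uintptr_t)d) & 7;      // bytes to 8-byte align dest
 *          memcpy(d, s, align);                        // what lswx/stswx do here
 *          d += align;  s += align;  len -= align;
 *          for (size_t n = len >> 4; n != 0; --n) {    // ctr = # of 16-byte chunks
 *              memcpy(d, s, 16);                       // four lwz/stw pairs in the asm
 *              d += 16;  s += 16;
 *          }
 *          // the remaining 0-15 bytes fall through to the tail code (LShort16)
 *      }
 */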
// Vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

        cmpwi   w4,0                // dest already aligned?
        sub     rc,rc,w4            // adjust length
        mtcrf   0x01,w4             // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF         // relatively 16-byte aligned?
        mtcrf   0x02,w4             // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6             // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned         // dest is already aligned

// 32-byte align destination.

        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,4f               // doubleword?
        bf      27,LFwdAligned      // quadword?
// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave           // get bitmap of live vector registers
        mtcrf   0x01,rc             // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w1,rv,0xFF00        // we use v0-v7
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1           // update mask
        li      c16,16              // get constants used in lvx/stvx
        bne     cr5,LForwardVecUnal // handle unaligned operands

1:                                  // loop over 64-byte chunks
        dcba    0,rd                // patched to NOP on some machines
        dcba    c32,rd              // patched to NOP on some machines

        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords (1-3)

        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
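/*
 * When source and destination are mutually 16-byte aligned, each 64-byte
 * chunk is simply four lvx/stvx pairs, with dcba pre-allocating the
 * destination lines. A C sketch of that inner loop using the Altivec
 * intrinsics (PPC only, compile with -maltivec; both pointers must genuinely
 * be 16-byte aligned because vec_ld/vec_st truncate the low address bits,
 * and the compiler maintains vrsave for you, which the asm does by hand):
 *
 *      #include <altivec.h>
 *      #include <stddef.h>
 *
 *      static void copy64_aligned(unsigned char *d, const unsigned char *s, size_t chunks) {
 *          while (chunks--) {
 *              vector unsigned char v1 = vec_ld(0,  s);
 *              vector unsigned char v2 = vec_ld(16, s);
 *              vector unsigned char v3 = vec_ld(32, s);
 *              vector unsigned char v4 = vec_ld(48, s);
 *              vec_st(v1, 0,  d);
 *              vec_st(v2, 16, d);
 *              vec_st(v3, 32, d);
 *              vec_st(v4, 48, d);
 *              s += 64;  d += 64;
 *          }
 *      }
 */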
// Long, forward, unaligned vector loop.

        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,0,rs             // prefetch 1st source quadword

        .align  4                   // align inner loops
1:                                  // loop over 64-byte chunks
        dcba    0,rd                // patched to NOP on some machines
        dcba    c32,rd              // patched to NOP on some machines

        beq-    4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords
        vor     v1,v2,v2            // v1 <- v2

        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
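/*
 * When the source is not 16-byte aligned relative to the destination, lvsl
 * builds a permute control from the low bits of rs, and every stored
 * quadword is a vperm of two adjacent aligned loads, keeping one quadword of
 * lookahead (v1 above). A hedged C sketch of the technique with the Altivec
 * intrinsics (PPC only; assumes 'd' 16-byte aligned, 's' unaligned, and
 * qwords > 0, so the lookahead never reads past the aligned block holding
 * the last source byte):
 *
 *      #include <altivec.h>
 *      #include <stddef.h>
 *
 *      static void copy_qw_unaligned(unsigned char *d, const unsigned char *s, size_t qwords) {
 *          vector unsigned char vp   = vec_lvsl(0, s);     // shift-left permute control
 *          vector unsigned char prev = vec_ld(0, s);       // lookahead quadword
 *          while (qwords--) {
 *              vector unsigned char next = vec_ld(16, s);  // next aligned quadword
 *              vec_st(vec_perm(prev, next, vp), 0, d);     // merge across the boundary
 *              prev = next;
 *              s += 16;  d += 16;
 *          }
 *      }
 */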
// Medium and long, reverse moves. We use Altivec if the operands are long enough,
// else a lwz/stw loop.
//      w1 = (rd-rs), used to check for reverse and alignment

        add     rd,rd,rc            // point to end of operands
        andi.   w4,rd,0x1F          // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3         // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse    // long enough for vectors

//      w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6            // decrement length remaining
        mtxer   w6                  // set up count for move
        sub     rs,rs,w6            // back up ptrs
        srwi    r0,rc,4             // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc             // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs             // move w6 bytes to align destination
        mtctr   r0                  // set up 16-byte loop

1:                                  // loop over 16-byte aligned chunks
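/*
 * A reverse move walks from the high addresses down: both pointers are first
 * advanced past the ends of their buffers, the destination is aligned from
 * the top, and chunks are then moved at negative offsets. Minimal C sketch
 * of the direction handling (illustrative; alignment and vector details
 * omitted). Reading each whole chunk before storing it is what keeps this
 * safe when the buffers overlap with dst above src:
 *
 *      #include <stdint.h>
 *      #include <string.h>
 *      #include <stddef.h>
 *
 *      static void copy_backward(uint8_t *d, const uint8_t *s, size_t len) {
 *          d += len;  s += len;                // point just past the ends
 *          while (len >= 16) {                 // 16 bytes per pass, high to low
 *              uint8_t tmp[16];
 *              d -= 16;  s -= 16;  len -= 16;
 *              memcpy(tmp, s, 16);             // all loads before any store,
 *              memcpy(d, tmp, 16);             // like the lwz/stw groups do
 *          }
 *          while (len--) { *--d = *--s; }      // remaining 0-15 bytes
 *      }
 */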
// Reverse vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4/cr0 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

        sub     rc,rc,w4            // adjust length
        mtcrf   0x01,w4             // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF         // relatively 16-byte aligned?
        mtcrf   0x02,w4             // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6             // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned     // dest is already aligned

// 32-byte align destination.

        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,4f               // doubleword?
        bf      27,LReverseAligned  // quadword?
// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave           // get bitmap of live vector registers
        mtcrf   0x01,rc             // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w1,rv,0xFF00        // we use v0-v7
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1           // update mask
        li      cm1,-1              // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal // handle unaligned operands

        .align  4                   // align inner loops
1:                                  // loop over 64-byte chunks

        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords (1-3)

        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.

        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,cm1,rs           // v1 always looks ahead

        .align  4                   // align the inner loops
1:                                  // loop over 64-byte chunks

        beq     3f                  // no leftover quadwords
2:                                  // loop over 1-3 quadwords
        vor     v1,v2,v2            // v1 <- v2

        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any

        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)