/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2  = "w8" or vrsave ("rv")
 *   r3  = not used, as memcpy and memmove return the 1st parameter as their value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r10 = "w5", "c96", or "cm97"
 *   r11 = "w6", "c128", or "cm129"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v4 = qw's loaded from source
 *   v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

#define kMedium     32              // too long for inline loopless code
#define kLong       96              // long enough to justify use of Altivec
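
// In rough C terms, the three strategies selected by the entry points below are (a sketch
// only; short_path/scalar_path/vector_path are illustrative names, not labels in this file):
//
//      if (len < kMedium)          // 0-31 bytes: inline, loopless moves (LShort)
//          short_path(dst, src, len);
//      else if (len < kLong)       // 32-95 bytes: scalar 16-byte loops (LMedium)
//          scalar_path(dst, src, len);
//      else                        // 96 bytes or more: Altivec 64-byte loops
//          vector_path(dst, src, len);
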
// Main entry points.
bcopy_g4:                           // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium          // short or long?
        sub     w1,r4,r3            // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        mr      rd,r4               // start to move registers to canonical spot
        blt+    LShort              // handle short operands
        dcbt    0,r3                // touch in first line of source
        b       LMedium             // join medium/long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
Lmemcpy_g4:                         // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                        // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium          // short or long?
        sub     w1,r3,r4            // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                // touch in the first line of source
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        mr      rd,r3               // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium             // handle medium or long operands
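
// Both entries above decide the copy direction with a single unsigned compare: if (rd-rs)
// is less than rc as an unsigned number, the destination starts inside the source operand
// and a forward copy would overwrite source bytes not yet read. A C-level sketch
// (illustrative only; copy_reverse/copy_forward are not labels in this file):
//
//      if ((uintptr_t)dst - (uintptr_t)src < len)
//          copy_reverse(dst, src, len);        // dst overlaps the tail of src
//      else
//          copy_forward(dst, src, len);        // includes dst < src, even if overlapping
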
// Handle short operands.

        andi.   r0,rc,0x10          // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc             // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

        beq     LShort16            // quadword to move?

LShort16:                           // join here to xfer 0-15 bytes
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // skip if no odd byte
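
// The 0-15 residual bytes are moved without a loop: mtcrf 0x01,rc placed bits 28-31 of the
// length (the 8, 4, 2, and 1 bits) into cr7, and each "bf" above skips the corresponding
// move when its bit is clear. In C terms (a sketch only):
//
//      if (len & 8) { /* move a doubleword */ }
//      if (len & 4) { /* move a word       */ }
//      if (len & 2) { /* move a halfword   */ }
//      if (len & 1) { /* move the odd byte */ }
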
// Handle short reverse operands.
//      cr0 = bne if bit 27 of length is set
//      cr7 = bits 28-31 of length

        add     rs,rs,rc            // adjust ptrs for reverse move
        beq     LShortReverse16     // quadword to move?

LShortReverse16:                    // join here to xfer 0-15 bytes and return
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // done if no odd byte
        lbz     w1,-1(rs)           // no update
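
// The reverse path points rs and rd one byte past the end of each operand and then moves
// data at negative offsets, working back toward the start. A C-level sketch (illustrative
// only; the real code moves 16/8/4/2/1 bytes at a time):
//
//      const unsigned char *s = src + len;     // one past the end
//      unsigned char       *d = dst + len;
//      while (len--)
//          *--d = *--s;
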
// Medium and long operands. Use Altivec if long enough, else scalar loops.
//      w1 = (rd-rs), used to check for alignment
//      cr1 = blt iff we must move reverse

        dcbtst  0,rd                // touch in destination
        cmplwi  cr7,rc,kLong        // long enough for vectors?
        neg     w3,rd               // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7         // check relative 8-byte alignment
        andi.   w6,w3,7             // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse  // handle reverse moves
        rlwinm  w4,w3,0,0x1F        // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0            // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong        // long enough for vectors
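
// The alignment counts above use the two's-complement identity that the distance from rd
// up to the next N-byte boundary is (-rd) mod N. In C terms (a sketch only; the names
// mirror the register nicknames, they are not objects in this file):
//
//      w3   = -(uintptr_t)dst;                         // neg    w3,rd
//      w6   = w3 & 7;                                  // bytes to 8-byte  align the destination
//      w4   = w3 & 31;                                 // bytes to 32-byte align the destination
//      rel8 = (((uintptr_t)dst - (uintptr_t)src) & 7) == 0;    // cr6: relatively aligned?
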
// Medium length: use scalar loops.
//      w6/cr0 = #bytes to 8-byte align destination
//      cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6            // decrement length remaining
        beq     1f                  // skip if dest already doubleword aligned
        mtxer   w6                  // set up count for move
        lswx    w1,0,rs             // move w6 bytes to align destination
        add     rs,rs,w6            // bump ptrs past
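
// lswx and its store counterpart stswx take their byte count from XER[25-31], so the
// mtxer/lswx sequence above is a one-shot variable-length move -- roughly
// memcpy(rd, rs, w6) for a w6 of 0-7 bytes -- that reaches the next doubleword boundary
// without a byte loop.
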
        srwi    r0,rc,4             // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc             // save remaining byte count here for LShort16
        mtctr   r0                  // set up 16-byte loop
        bne     cr6,3f              // source not doubleword aligned

2:                                  // loop over 16-byte aligned chunks

3:                                  // loop over 16-byte unaligned chunks
// Vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

        cmpwi   w4,0                // dest already aligned?
        sub     rc,rc,w4            // adjust length
        mtcrf   0x01,w4             // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF         // relatively 16-byte aligned?
        mtcrf   0x02,w4             // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6             // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned         // dest is already aligned
// 32-byte align destination.

        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,4f               // doubleword?
        bf      27,LFwdAligned      // quadword?
// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
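
// In C terms, the setup below decomposes the remaining length and marks v0-v7 as live in
// the vrsave bitmap (a sketch only; vrsave numbers its bits from the MSB, so 0xFF000000
// covers v0-v7):
//
//      chunks      = len >> 6;                 // 64-byte chunks       -> ctr
//      leftover_qw = (len >> 4) & 3;           // 0-3 whole quadwords  -> r0/cr0
//      leftover_b  = len & 15;                 // 0-15 trailing bytes  -> cr7/cr6
//      vrsave     |= 0xFF000000;               // oris w1,rv,0xFF00
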
        mfspr   rv,vrsave           // get bitmap of live vector registers
        mtcrf   0x01,rc             // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w1,rv,0xFF00        // we use v0-v7
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1           // update mask
        li      c16,16              // get constants used in lvx/stvx
        bne     cr5,LForwardVecUnal // handle unaligned operands
1:                                  // loop over 64-byte chunks
        dcba    0,rd                // patched to NOP on some machines
        dcba    c32,rd              // patched to NOP on some machines

        beq     4f                  // no leftover quadwords

3:                                  // loop over remaining quadwords (1-3)

        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Long, forward, unaligned vector loop.

        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,0,rs             // prefetch 1st source quadword
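
// Sketch of the misaligned-source technique used below, written with Altivec C intrinsics
// purely as an illustration (the real loop handles four quadwords per iteration):
//
//      vector unsigned char vp   = vec_lvsl(0, src);     // permute control from src & 0xF
//      vector unsigned char prev = vec_ld(0, src);       // quadword containing src[0]
//      while (n--) {
//          vector unsigned char next = vec_ld(16, src);  // next source quadword
//          vec_st(vec_perm(prev, next, vp), 0, dst);     // shift-merge into an aligned store
//          prev = next;  src += 16;  dst += 16;
//      }
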
        .align  4                   // align inner loops
1:                                  // loop over 64-byte chunks
        dcba    0,rd                // patched to NOP on some machines
        dcba    c32,rd              // patched to NOP on some machines

        beq-    4f                  // no leftover quadwords

3:                                  // loop over remaining quadwords
        vor     v1,v2,v2            // v1 <- v2

        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Medium and long, reverse moves. We use Altivec if the operands are long enough,
// else a lwz/stw loop.
//      w1 = (rd-rs), used to check for reverse and alignment

        add     rd,rd,rc            // point to end of operands
        andi.   w4,rd,0x1F          // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3         // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse    // long enough for vectors
//      w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6            // decrement length remaining
        mtxer   w6                  // set up count for move
        sub     rs,rs,w6            // back up ptrs
        srwi    r0,rc,4             // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc             // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs             // move w6 bytes to align destination
        mtctr   r0                  // set up 16-byte loop

1:                                  // loop over 16-byte aligned chunks
// Reverse vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4/cr0 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

        sub     rc,rc,w4            // adjust length
        mtcrf   0x01,w4             // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF         // relatively 16-byte aligned?
        mtcrf   0x02,w4             // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6             // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned     // dest is already aligned
// 32-byte align destination.

        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,4f               // doubleword?
        bf      27,LReverseAligned  // quadword?
// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
        mfspr   rv,vrsave           // get bitmap of live vector registers
        mtcrf   0x01,rc             // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w1,rv,0xFF00        // we use v0-v7
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1           // update mask
        li      cm1,-1              // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal // handle unaligned operands
        .align  4                   // align inner loops
1:                                  // loop over 64-byte chunks

        beq     4f                  // no leftover quadwords

3:                                  // loop over remaining quadwords (1-3)

        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.

        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,cm1,rs           // v1 always looks ahead

        .align  4                   // align the inner loops
1:                                  // loop over 64-byte chunks

        beq     3f                  // no leftover quadwords

2:                                  // loop over 1-3 quadwords
        vor     v1,v2,v2            // v1 <- v2

        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)
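
// The COMMPAGE_DESCRIPTOR above ties this routine to the commpage populate code. Judging
// by the flag names, it is installed at _COMM_PAGE_BCOPY only on CPUs that have Altivec
// but are not 64-bit (kHasAltivec / k64Bit), kCommPageDCBA is what allows the kernel to
// patch the DCBA instructions to NOPs on the machines mentioned in the header comment,
// and kCommPage32 restricts it to the 32-bit comm page.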