/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2  = "w8" or vrsave ("rv")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r10 = "w5", "c96", or "cm97"
 *   r11 = "w6", "c128", or "cm129"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v4 = qw's loaded from source
 *   v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

#define kMedium     32                  // too long for inline loopless code
#define kLong       96                  // long enough to justify use of Altivec
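
/*
 * These two thresholds split every call into three classes. A hedged C
 * sketch of the classification only (the real code folds it into the
 * compares at the entry points and at LMedium); the enum and function
 * names are illustrative, not symbols in this file.
 *
 *      enum copy_class { COPY_SHORT, COPY_MEDIUM, COPY_LONG };
 *
 *      static enum copy_class classify(unsigned long len) {
 *          if (len < 32)       // kMedium: 0..31 bytes, loopless inline code
 *              return COPY_SHORT;
 *          if (len < 96)       // kLong: 32..95 bytes, scalar 16-byte loops
 *              return COPY_MEDIUM;
 *          return COPY_LONG;   // 96+ bytes, Altivec 64-byte loops
 *      }
 */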
bcopy_g4:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r4                   // start to move registers to canonic spot
        blt+    LShort                  // handle short operands
        dcbt    0,r3                    // touch in destination
        b       LMedium                 // join medium/long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_g4:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                    // touch in the first line of source
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium                 // handle medium or long operands
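
/*
 * The "sub w1,r3,r4 / cmplw cr1,w1,rc" pair above implements the usual
 * overlap test with a single unsigned compare: a descending (reverse) copy
 * is required exactly when the destination starts inside the source range.
 * A hedged C sketch of that test; the function name is illustrative and is
 * not a symbol in this file.
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      static int must_copy_reverse(const void *dst, const void *src, size_t len) {
 *          // dst-src wraps to a huge unsigned value when dst < src, so the
 *          // compare is true only for src <= dst < src+len, i.e. when a
 *          // forward copy would overwrite source bytes before reading them
 *          return (uintptr_t)((const char *)dst - (const char *)src) < (uintptr_t)len;
 *      }
 */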
// Handle short operands.

        andi.   r0,rc,0x10              // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

        beq     LShort16                // quadword to move?

LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // skip if no odd byte

// Handle short reverse operands.
//      cr0 = bne if bit 27 of length is set
//      cr7 = bits 28-31 of length

        add     rs,rs,rc                // adjust ptrs for reverse move
        beq     LShortReverse16         // quadword to move?

LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update

// Medium and long operands. Use Altivec if long enough, else scalar loops.
//      w1 = (rd-rs), used to check for alignment
//      cr1 = blt iff we must move reverse

        dcbtst  0,rd                    // touch in destination
        cmplwi  cr7,rc,kLong            // long enough for vectors?
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7             // check relative 8-byte alignment
        andi.   w6,w3,7                 // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse      // handle reverse moves
        rlwinm  w4,w3,0,0x1F            // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0                // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong            // long enough for vectors

// Medium length: use scalar loops.
//      w6/cr0 = #bytes to 8-byte align destination
//      cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6                // decrement length remaining
        beq     1f                      // skip if dest already doubleword aligned
        mtxer   w6                      // set up count for move
        lswx    w1,0,rs                 // move w6 bytes to align destination
        add     rs,rs,w6                // bump ptrs past
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // save remaining byte count here for LShort16
        mtctr   r0                      // set up 16-byte loop
        bne     cr6,3f                  // source not relatively doubleword aligned

2:                                      // loop over 16-byte aligned chunks

3:                                      // loop over 16-byte unaligned chunks
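
/*
 * Shape of the medium (32..95 byte) forward path, as a hedged C sketch:
 * copy 0-7 bytes so the destination becomes doubleword aligned (the
 * mtxer/lswx sequence above; the matching store-string instruction is not
 * shown in this excerpt), then move 16 bytes per iteration of the 2:/3:
 * loops, then let the LShort16 code finish the last 0-15 bytes. The
 * function name is illustrative only.
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *      #include <string.h>
 *
 *      static void copy_medium_fwd(unsigned char *d, const unsigned char *s, size_t len) {
 *          size_t head = (size_t)(-(uintptr_t)d & 7);      // bytes to 8-byte align dst (w6)
 *          memcpy(d, s, head);                             // lswx/stswx with XER count = head
 *          d += head; s += head; len -= head;
 *          for (size_t chunks = len >> 4; chunks--; d += 16, s += 16)
 *              memcpy(d, s, 16);                           // the 2:/3: chunk loops above
 *          for (size_t i = 0; i < (len & 15); i++)         // LShort16 tail
 *              d[i] = s[i];
 *      }
 */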
// Vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

        cmpwi   w4,0                    // dest already aligned?
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned             // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        bf      30,2f                   // halfword?
        bf      28,4f                   // doubleword?
        bf      27,LFwdAligned          // quadword?
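
/*
 * The "bf 31/30/.../27" cascade above peels off just enough bytes to bring
 * the destination to a 32-byte boundary, using the individual bits of the
 * byte count to select 1-, 2-, 4-, 8-, and 16-byte moves (the 4-byte step
 * is not shown in this excerpt). A hedged C sketch; the function name is
 * illustrative only.
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *      #include <string.h>
 *
 *      static void align_dst_32(unsigned char **dp, const unsigned char **sp) {
 *          size_t n = (size_t)(-(uintptr_t)*dp & 31);  // bytes to 32-byte align dst (w4)
 *          // n has already been subtracted from the length ("sub rc,rc,w4")
 *          for (size_t step = 1; step <= 16; step <<= 1) {
 *              if (n & step) {                         // bit set -> move that many bytes
 *                  memcpy(*dp, *sp, step);
 *                  *dp += step;
 *                  *sp += step;
 *              }
 *          }
 *      }
 */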
// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      c16,16                  // get constants used in lvx/stvx
        bne     cr5,LForwardVecUnal     // handle unaligned operands

1:                                      // loop over 64-byte chunks
        dcba    0,rd                    // patched to NOP on some machines
        dcba    c32,rd                  // patched to NOP on some machines
        beq     4f                      // no leftover quadwords

3:                                      // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
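
/*
 * Shape of the aligned forward vector path, as a hedged C sketch: each pass
 * of the "1:" loop moves 64 bytes with lvx/stvx (dcba touching the two
 * destination cache lines), then 1-3 leftover quadwords are moved at "3:",
 * and LShort16 finishes the 0-15 byte tail counted in cr7. Assumes the
 * destination is 32-byte aligned and source and destination are relatively
 * 16-byte aligned; the name is illustrative only.
 *
 *      #include <stddef.h>
 *      #include <string.h>
 *
 *      static void copy_long_aligned_fwd(unsigned char *d, const unsigned char *s, size_t len) {
 *          for (size_t chunks = len >> 6; chunks--; d += 64, s += 64)  // ctr
 *              memcpy(d, s, 64);
 *          for (size_t qws = (len >> 4) & 3; qws--; d += 16, s += 16)  // r0 leftovers (0-3)
 *              memcpy(d, s, 16);
 *          for (size_t i = 0; i < (len & 15); i++)                     // cr7 tail (LShort16)
 *              d[i] = s[i];
 *      }
 */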
// Long, forward, unaligned vector loop.

        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        dcba    0,rd                    // patched to NOP on some machines
        dcba    c32,rd                  // patched to NOP on some machines
        beq-    4f                      // no leftover quadwords

3:                                      // loop over remaining quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
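
/*
 * When source and destination are not relatively 16-byte aligned, the loop
 * above keeps every lvx and stvx aligned and realigns the data in registers:
 * lvsl builds a permute control from the low 4 bits of the source address,
 * and vperm assembles each output quadword from two adjacent aligned source
 * quadwords. A hedged plain-C model of that permute step; the function name
 * is illustrative only and a 16-byte aligned destination is assumed.
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      static void copy_realign_fwd(unsigned char *d, const unsigned char *s, size_t qwords) {
 *          size_t shift = (uintptr_t)s & 15;           // low 4 bits feed lvsl
 *          const unsigned char *a = s - shift;         // 16-byte aligned base
 *          for (size_t q = 0; q < qwords; q++) {
 *              const unsigned char *lo = a + q * 16;   // lvx: first aligned quadword
 *              const unsigned char *hi = lo + 16;      // lvx: the one after it
 *              for (size_t i = 0; i < 16; i++)         // vperm: 16 bytes starting at
 *                  d[q * 16 + i] = (shift + i < 16)    //   offset "shift" within the
 *                                ? lo[shift + i]       //   lo:hi concatenation
 *                                : hi[shift + i - 16];
 *          }
 *      }
 */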
// Medium and long, reverse moves. We use Altivec if the operands are long enough,
// else a lwz/stw loop.
//      w1 = (rd-rs), used to check for reverse and alignment

        add     rd,rd,rc                // point to end of operands
        andi.   w4,rd,0x1F              // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3             // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse        // long enough for vectors

//      w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6                // decrement length remaining
        mtxer   w6                      // set up count for move
        sub     rs,rs,w6                // back up ptrs
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs                 // move w6 bytes to align destination
        mtctr   r0                      // set up 16-byte loop

1:                                      // loop over 16-byte aligned chunks
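
/*
 * All of the reverse paths share the same shape: both pointers are first
 * advanced to the end of their operands ("add rd,rd,rc" above, with the
 * matching adjustment of rs), and the data is then moved with descending
 * addresses so that, when the buffers overlap with the destination above
 * the source, no source byte is overwritten before it has been read. A
 * minimal hedged C sketch (byte-at-a-time; the code above works in 16-byte
 * chunks):
 *
 *      #include <stddef.h>
 *
 *      static void copy_reverse(unsigned char *d, const unsigned char *s, size_t len) {
 *          d += len;                   // point one past the end, like rd/rs
 *          s += len;
 *          while (len--)
 *              *--d = *--s;            // highest addresses first
 *      }
 */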
// Reverse vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4/cr0 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned         // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        bf      30,2f                   // halfword?
        bf      28,4f                   // doubleword?
        bf      27,LReverseAligned      // quadword?

// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      cm1,-1                  // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal     // handle unaligned operands

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords

3:                                      // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any

// Long, reverse, unaligned vector loop.

        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead

        .align  4                       // align the inner loops
1:                                      // loop over 64-byte chunks
        beq     3f                      // no leftover quadwords

2:                                      // loop over 1-3 quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any

        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)