/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *  r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *  r2  = "w8" or vrsave ("rv")
 *  r3  = not used, as memcpy and memmove return 1st parameter as a value
 *  r4  = source ptr ("rs")
 *  r5  = count of bytes to move ("rc")
 *  r6  = "w1", "c16", or "cm17"
 *  r7  = "w2", "c32", or "cm33"
 *  r8  = "w3", "c48", or "cm49"
 *  r9  = "w4", "c64", or "cm65"
 *  r10 = "w5", "c96", or "cm97"
 *  r11 = "w6", "c128", or "cm129"
 *  r12 = destination ptr ("rd")
 *  v0  = permute vector ("vp")
 *  v1-v4 = qw's loaded from source
 *  v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

#define kMedium     32              // too long for inline loopless code
#define kLong       96              // long enough to justify use of Altivec
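// The two thresholds above pick one of three strategies by length. As a
// hedged C-level sketch of the dispatch (the helper names are illustrative,
// not labels in this file):
//
//      if (len < kMedium)                  // < 32 bytes: inline, loopless moves
//          copy_short(dst, src, len);
//      else if (len < kLong)               // 32..95 bytes: scalar 16-byte loops
//          copy_medium(dst, src, len);
//      else                                // >= 96 bytes: Altivec 64-byte loops
//          copy_long_vectors(dst, src, len);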
bcopy_g4:                           // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium          // short or long?
        sub     w1,r4,r3            // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        mr      rd,r4               // start to move registers to canonical spot
        blt+    LShort              // handle short operands
        dcbt    0,r3                // touch in the first line of source
        b       LMedium             // join medium/long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_g4:                         // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                        // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium          // short or long?
        sub     w1,r3,r4            // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                // touch in the first line of source
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        mr      rd,r3               // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium             // handle medium or long operands
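// The (rd-rs) subtraction above picks the copy direction with a single
// unsigned compare. A minimal C sketch of the same test (illustrative only;
// the cast and helper name are assumptions, not part of this file):
//
//      int must_copy_reverse(char *dst, const char *src, unsigned long len) {
//          // true exactly when dst lies inside [src, src+len), i.e. a forward
//          // copy would overwrite source bytes before they are read
//          return (unsigned long)(dst - src) < len;
//      }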
// Handle short operands.

LShort:
        andi.   r0,rc,0x10          // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc             // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

        beq     LShort16            // quadword to move?

LShort16:                           // join here to xfer 0-15 bytes
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // done if no odd byte
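// The short path is loopless: bits 27-31 of the length each gate one
// fixed-size move. A rough C equivalent (hedged sketch, not the generated
// code):
//
//      if (len & 16) { /* move a quadword   */ }
//      if (len & 8)  { /* move a doubleword */ }
//      if (len & 4)  { /* move a word       */ }
//      if (len & 2)  { /* move a halfword   */ }
//      if (len & 1)  { /* move the odd byte */ }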
// Handle short reverse operands.
//  cr0 = bne if bit 27 of length is set
//  cr7 = bits 28-31 of length

LShortReverse:
        add     rs,rs,rc            // adjust ptrs for reverse move
        beq     LShortReverse16     // quadword to move?

LShortReverse16:                    // join here to xfer 0-15 bytes and return
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // done if no odd byte
        lbz     w1,-1(rs)           // no update
// Medium and long operands. Use Altivec if long enough, else scalar loops.
//  w1 = (rd-rs), used to check for alignment
//  cr1 = blt iff we must move reverse

LMedium:
        dcbtst  0,rd                // touch in destination
        cmplwi  cr7,rc,kLong        // long enough for vectors?
        neg     w3,rd               // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7         // check relative 8-byte alignment
        andi.   w6,w3,7             // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse  // handle reverse moves
        rlwinm  w4,w3,0,0x1F        // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0            // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong        // long enough for vectors
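// The alignment math above: negating the destination pointer and masking
// yields the byte count needed to reach the next boundary. A hedged C
// sketch (variable names are illustrative):
//
//      unsigned long to_align8  = (-(unsigned long)dst) & 7;    // w6
//      unsigned long to_align32 = (-(unsigned long)dst) & 31;   // w4
//
// For example, a destination ending in 0x2C needs 4 bytes to become 8-byte
// aligned and 20 bytes to become 32-byte aligned.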
// Medium length: use scalar loops.
//  w6/cr0 = #bytes to 8-byte align destination
//  cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6            // decrement length remaining
        beq     1f                  // skip if dest already doubleword aligned
        mtxer   w6                  // set up count for move
        lswx    w1,0,rs             // move w6 bytes to align destination
        add     rs,rs,w6            // bump ptrs past
1:
        srwi    r0,rc,4             // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc             // save remaining byte count here for LShort16
        mtctr   r0                  // set up 16-byte loop
        bne     cr6,3f              // source not 4-byte aligned
2:                                  // loop over 16-byte aligned chunks
3:                                  // loop over 16-byte unaligned chunks
// Vector loops. First, we must 32-byte align the destination.
//  w1 = (rd-rs), used to check for reverse and alignment
//  w4 = #bytes to 32-byte align destination
//  rc = long enough for at least one vector loop

LFwdLong:
        cmpwi   w4,0                // dest already aligned?
        sub     rc,rc,w4            // adjust length
        mtcrf   0x01,w4             // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF         // relatively 16-byte aligned?
        mtcrf   0x02,w4             // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6             // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned         // dest is already aligned
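// "Relatively 16-byte aligned" means (rd-rs) is a multiple of 16: once the
// destination has been 32-byte aligned, the source is then 16-byte aligned
// too, so plain lvx/stvx suffice. A hedged C sketch of the test:
//
//      int relatively_aligned =
//          (((unsigned long)dst - (unsigned long)src) & 15) == 0;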
// 32-byte align destination.

        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,4f               // doubleword?
        bf      27,LFwdAligned      // quadword?
// Destination is 32-byte aligned.
//  r0 = count of 64-byte chunks to move (not 0)
//  rd = 32-byte aligned
//  rc = bytes remaining
//  cr5 = beq if source is 16-byte aligned
// We set up many registers:
//  ctr = number of 64-byte chunks to move
//  r0/cr0 = leftover QWs to move
//  cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//  cr6 = beq if leftover byte count is 0
//  rv = original value of vrsave

LFwdAligned:
        mfspr   rv,vrsave           // get bitmap of live vector registers
        mtcrf   0x01,rc             // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w1,rv,0xFF00        // we use v0-v7
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1           // update mask
        li      c16,16              // get constants used in lvx/stvx
        bne     cr5,LForwardVecUnal // handle unaligned operands
1:                                  // loop over 64-byte chunks
        dcba    0,rd                // patched to NOP on some machines
        dcba    c32,rd              // patched to NOP on some machines
        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Long, forward, unaligned vector loop.

LForwardVecUnal:
        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,0,rs             // prefetch 1st source quadword
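// The unaligned path loads aligned quadwords and merges adjacent pairs with
// a permute. A hedged sketch using AltiVec C intrinsics (illustrative; this
// file uses the raw lvsl/lvx/vperm instructions, not intrinsics), given
// const unsigned char *src:
//
//      vector unsigned char perm = vec_lvsl(0, src);        // shift-left permute control
//      vector unsigned char hi   = vec_ld(0, src);          // aligned qw containing src[0]
//      vector unsigned char lo   = vec_ld(16, src);         // next aligned qw
//      vector unsigned char qw   = vec_perm(hi, lo, perm);  // the 16 source bytes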
        .align  4                   // align inner loops
1:                                  // loop over 64-byte chunks
        dcba    0,rd                // patched to NOP on some machines
        dcba    c32,rd              // patched to NOP on some machines
        beq-    4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Medium and long, reverse moves. We use Altivec if the operands are long enough,
// else a lwz/stw loop.
//  w1 = (rd-rs), used to check for reverse and alignment

LMediumReverse:
        add     rd,rd,rc            // point to end of operands
        andi.   w4,rd,0x1F          // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3         // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse    // long enough for vectors

//  w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6            // decrement length remaining
        mtxer   w6                  // set up count for move
        sub     rs,rs,w6            // back up ptrs
        srwi    r0,rc,4             // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc             // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs             // move w6 bytes to align destination
        mtctr   r0                  // set up 16-byte loop
1:                                  // loop over 16-byte aligned chunks
// Reverse vector loops. First, we must 32-byte align the destination.
//  w1 = (rd-rs), used to check for reverse and alignment
//  w4/cr0 = #bytes to 32-byte align destination
//  rc = long enough for at least one vector loop

LLongReverse:
        sub     rc,rc,w4            // adjust length
        mtcrf   0x01,w4             // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF         // relatively 16-byte aligned?
        mtcrf   0x02,w4             // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6             // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned     // dest is already aligned
// 32-byte align destination.

        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,4f               // doubleword?
        bf      27,LReverseAligned  // quadword?
// Destination is 32-byte aligned.
//  r0 = count of 64-byte chunks to move (not 0)
//  rd = 32-byte aligned
//  rc = bytes remaining
//  cr5 = beq if source is 16-byte aligned
// We set up many registers:
//  ctr = number of 64-byte chunks to move
//  r0/cr0 = leftover QWs to move
//  cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//  cr6 = beq if leftover byte count is 0
//  rv = original value of vrsave

LReverseAligned:
        mfspr   rv,vrsave           // get bitmap of live vector registers
        mtcrf   0x01,rc             // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w1,rv,0xFF00        // we use v0-v7
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1           // update mask
        li      cm1,-1              // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal // handle unaligned operands
        .align  4                   // align inner loops
1:                                  // loop over 64-byte chunks
        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.

LReverseVecUnal:
        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,cm1,rs           // v1 always looks ahead
        .align  4                   // align the inner loops
1:                                  // loop over 64-byte chunks
        beq     3f                  // no leftover quadwords
2:                                  // loop over 1-3 quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any

        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA)