/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2  = "w8" or vrsave ("rv")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4", "c64", or "cm65"
 *   r10 = "w5", "c96", or "cm97"
 *   r11 = "w6", "c128", or "cm129"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v4 = qw's loaded from source
 *   v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

#define kMedium         32              // too long for inline loopless code
#define kLong           96              // long enough to justify use of Altivec
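
/*
 * Illustrative note (not in the original source): the two thresholds above
 * amount to a three-way dispatch on length. A minimal C sketch, assuming
 * only the cutoff values defined here:
 *
 *      #include <stddef.h>
 *
 *      // 0 = short loopless path, 1 = medium scalar loops, 2 = Altivec loops
 *      static int copy_path(size_t len) {
 *          if (len < 32)   return 0;       // below kMedium
 *          if (len < 96)   return 1;       // below kLong
 *          return 2;                       // long enough for vectors
 *      }
 */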
bcopy_g4:                               // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r4                   // start to move registers to canonic spot
        blt+    LShort                  // handle short operands
        dcbt    0,r3                    // touch in destination
        b       LMedium                 // join medium/long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_g4:                             // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                            // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium              // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                    // touch in the first line of source
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium                 // handle medium or long operands
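
/*
 * Illustrative note (not in the original source): the single unsigned compare
 * above ("set cr1 blt iff we must move reverse") folds the overlap test into
 * one subtraction: the move must be done in reverse exactly when (rd-rs),
 * treated as unsigned, is less than the length. A minimal C sketch of the
 * same test:
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      // True when dst lies within [src, src+len), so a forward copy would
 *      // clobber source bytes before they are read.
 *      static int must_move_reverse(const void *dst, const void *src, size_t len) {
 *          uintptr_t diff = (uintptr_t)dst - (uintptr_t)src;   // wraps when dst < src
 *          return diff < len;
 *      }
 *
 * When the test is true, the code points both operands at their ends and
 * moves bytes in descending address order.
 */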
// Handle short operands.

        andi.   r0,rc,0x10              // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc                 // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

        beq     LShort16                // quadword to move?

LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // skip if no odd byte
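
/*
 * Illustrative note (not in the original source): the short path does not
 * loop; mtcrf copies the low bits of the length into cr7, and the bf tests
 * above then move 16, 8, 4, 2, and 1 bytes according to bits 27-31. A
 * minimal C sketch of the same decomposition for a forward move:
 *
 *      #include <stddef.h>
 *      #include <string.h>
 *
 *      static void copy_short_fwd(unsigned char *d, const unsigned char *s, size_t len) {
 *          if (len & 16) { memcpy(d, s, 16); d += 16; s += 16; }  // bit 27
 *          if (len & 8)  { memcpy(d, s, 8);  d += 8;  s += 8;  }  // bit 28
 *          if (len & 4)  { memcpy(d, s, 4);  d += 4;  s += 4;  }  // bit 29
 *          if (len & 2)  { memcpy(d, s, 2);  d += 2;  s += 2;  }  // bit 30
 *          if (len & 1)  { *d = *s; }                             // bit 31
 *      }
 */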
// Handle short reverse operands.
//      cr0 = bne if bit 27 of length is set
//      cr7 = bits 28-31 of length

        add     rs,rs,rc                // adjust ptrs for reverse move
        beq     LShortReverse16         // quadword to move?

LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
// Medium and long operands. Use Altivec if long enough, else scalar loops.
//      w1 = (rd-rs), used to check for alignment
//      cr1 = blt iff we must move reverse

        dcbtst  0,rd                    // touch in destination
        cmplwi  cr7,rc,kLong            // long enough for vectors?
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7             // check relative 8-byte alignment
        andi.   w6,w3,7                 // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse      // handle reverse moves
        rlwinm  w4,w3,0,0x1F            // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0                // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong            // long enough for vectors
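
/*
 * Illustrative note (not in the original source): negating the destination
 * address and masking it yields the number of bytes needed to reach the next
 * alignment boundary (8 bytes for the scalar loops, 32 for the vector loops).
 * A minimal C sketch of the same computation:
 *
 *      #include <stdint.h>
 *
 *      // boundary must be a power of two; returns 0 when p is already aligned
 *      static unsigned bytes_to_align(const void *p, unsigned boundary) {
 *          return (unsigned)(-(uintptr_t)p) & (boundary - 1);
 *      }
 */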
// Medium length: use scalar loops.
//      w6/cr0 = #bytes to 8-byte align destination
//      cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6                // decrement length remaining
        beq     1f                      // skip if dest already doubleword aligned
        mtxer   w6                      // set up count for move
        lswx    w1,0,rs                 // move w6 bytes to align destination
        add     rs,rs,w6                // bump ptrs past
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // save remaining byte count here for LShort16
        mtctr   r0                      // set up 16-byte loop
        bne     cr6,3f                  // source not 4-byte aligned

2:                                      // loop over 16-byte aligned chunks

3:                                      // loop over 16-byte unaligned chunks
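
/*
 * Illustrative note (not in the original source): the bodies of the two loops
 * above move one 16-byte chunk per iteration, with the chunk count taken from
 * rc >> 4. A minimal C sketch of the relatively aligned case, assuming both
 * pointers are word aligned:
 *
 *      #include <stddef.h>
 *      #include <stdint.h>
 *
 *      static void copy_16byte_chunks(uint32_t *d, const uint32_t *s, size_t chunks) {
 *          while (chunks--) {
 *              uint32_t w0 = s[0], w1 = s[1], w2 = s[2], w3 = s[3];
 *              d[0] = w0; d[1] = w1; d[2] = w2; d[3] = w3;
 *              s += 4;
 *              d += 4;
 *          }
 *      }
 */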
// Vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

        cmpwi   w4,0                    // dest already aligned?
        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned             // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        bf      30,2f                   // halfword?
        bf      28,4f                   // doubleword?
        bf      27,LFwdAligned          // quadword?
// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      c16,16                  // get constants used in lvx/stvx
        bne     cr5,LForwardVecUnal     // handle unaligned operands
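
/*
 * Illustrative note (not in the original source): the counts set up above
 * split the remaining length three ways. A minimal C sketch of the same
 * decomposition:
 *
 *      #include <stddef.h>
 *
 *      static void split_counts(size_t rc, size_t *chunks, size_t *qws, size_t *bytes) {
 *          *chunks = rc >> 6;          // 64-byte chunks, loaded into ctr
 *          *qws    = (rc >> 4) & 3;    // leftover quadwords (0-3), from rlwinm.
 *          *bytes  = rc & 15;          // leftover bytes (0-15), handled at LShort16
 *      }
 */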
1:                                      // loop over 64-byte chunks
        dcba    0,rd                    // patched to NOP on some machines
        dcba    c32,rd                  // patched to NOP on some machines

        beq     4f                      // no leftover quadwords

3:                                      // loop over remaining quadwords (1-3)

        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
// Long, forward, unaligned vector loop.

        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword

        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks
        dcba    0,rd                    // patched to NOP on some machines
        dcba    c32,rd                  // patched to NOP on some machines

        beq-    4f                      // no leftover quadwords

3:                                      // loop over remaining quadwords
        vor     v1,v2,v2                // v1 <- v2

        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
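
/*
 * Illustrative note (not in the original source): when source and destination
 * are not relatively 16-byte aligned, lvsl builds a permute mask from the
 * source misalignment and vperm merges two adjacent source quadwords into
 * each aligned store. A rough C sketch using AltiVec intrinsics, assuming a
 * 16-byte aligned destination, a length that is a multiple of 16, and that
 * reading one quadword past the end of the source is safe (as the look-ahead
 * load above does):
 *
 *      #include <altivec.h>
 *      #include <stddef.h>
 *
 *      static void copy_unaligned_src(unsigned char *dst, const unsigned char *src, size_t len) {
 *          vector unsigned char perm = vec_lvsl(0, src);   // mask encoding the misalignment
 *          vector unsigned char prev = vec_ld(0, src);     // first (address-truncated) quadword
 *          for (size_t i = 0; i < len; i += 16) {
 *              vector unsigned char next = vec_ld(i + 16, src);    // look one quadword ahead
 *              vec_st(vec_perm(prev, next, perm), i, dst);         // shift into place and store
 *              prev = next;
 *          }
 *      }
 */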
// Medium and long, reverse moves. We use Altivec if the operands are long enough,
// else a lwz/stw loop.
//      w1 = (rd-rs), used to check for reverse and alignment
        add     rd,rd,rc                // point to end of operands
        andi.   w4,rd,0x1F              // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3             // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse        // long enough for vectors

//      w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6                // decrement length remaining
        mtxer   w6                      // set up count for move
        sub     rs,rs,w6                // back up ptrs
        srwi    r0,rc,4                 // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc                 // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs                 // move w6 bytes to align destination
        mtctr   r0                      // set up 16-byte loop

1:                                      // loop over 16-byte aligned chunks
// Reverse vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4/cr0 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

        sub     rc,rc,w4                // adjust length
        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF             // relatively 16-byte aligned?
        mtcrf   0x02,w4                 // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6                 // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned         // dest is already aligned

// 32-byte align destination.

        bf      31,1f                   // byte to move?
        bf      30,2f                   // halfword?
        bf      28,4f                   // doubleword?
        bf      27,LReverseAligned      // quadword?
// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave               // get bitmap of live vector registers
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w1,rv,0xFF00            // we use v0-v7
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1               // update mask
        li      cm1,-1                  // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        .align  4                       // align inner loops
1:                                      // loop over 64-byte chunks

        beq     4f                      // no leftover quadwords

3:                                      // loop over remaining quadwords (1-3)

        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.

        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead

        .align  4                       // align the inner loops
1:                                      // loop over 64-byte chunks

        beq     3f                      // no leftover quadwords

2:                                      // loop over 1-3 quadwords
        vor     v1,v2,v2                // v1 <- v2

        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any

        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)