/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 2/20/2003, tuned for G4. The inner loops use DCBA to avoid
 * reading destination cache lines. Only the 7450 actually benefits from
 * this, and then only in the cold-cache case. On 7400s and 7455s, we
 * patch the DCBAs into NOPs.
 *
 * Register usage. Note we use R2, so this code will not run in a PEF/CFM
 * environment. Note also the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *
 *   r0  = "w7" or temp (NB: cannot use r0 for any constant such as "c16")
 *   r2  = "w8" or vrsave ("rv")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r10 = "w5", "c96", or "cm97"
 *   r11 = "w6", "c128", or "cm129"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v4 = qw's loaded from source
 *   v5-v7 = permuted qw's ("vw", "vx", "vy")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

#define kMedium     32              // too long for inline loopless code
#define kLong       96              // long enough to justify use of Altivec
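
// Roughly, the length-based dispatch below is equivalent to the following C
// sketch (illustrative only; short_copy/medium_copy/vector_copy are stand-in
// names for the LShort, scalar, and Altivec paths, and the overlap test is
// explained below at the entry points):
//
//      if (len < kMedium)              // 0..31 bytes: straight-line code
//          short_copy(dst, src, len);
//      else if (len < kLong)           // 32..95 bytes: scalar 16-byte loop
//          medium_copy(dst, src, len);
//      else                            // >= 96 bytes: Altivec 64-byte loop
//          vector_copy(dst, src, len);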
bcopy_g4:                           // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kMedium          // short or long?
        sub     w1,r4,r3            // must move in reverse if (rd-rs)<rc
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        mr      rd,r4               // start to move registers to canonic spot
        blt+    LShort              // handle short operands
        dcbt    0,r3                // touch in 1st line of source
        b       LMedium             // join medium/long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_g4:                         // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_g4:                        // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kMedium          // short or long?
        sub     w1,r3,r4            // must move in reverse if (rd-rs)<rc
        dcbt    0,r4                // touch in the first line of source
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        mr      rd,r3               // must leave r3 alone, it is return value for memcpy etc
        bge-    LMedium             // handle medium or long operands
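
// The single unsigned compare above is the usual overlap test: if the
// destination lies inside [src, src+len), then (unsigned)(rd-rs) < rc, and the
// copy must run backwards so source bytes are read before they are overwritten.
// A C sketch of the same test (illustrative only, not part of the build):
//
//      if ((uintptr_t)dst - (uintptr_t)src < len)
//          copy_backwards(dst, src, len);      // destination overlaps ahead of source
//      else
//          copy_forwards(dst, src, len);       // disjoint, or destination below source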
// Handle short operands.

        andi.   r0,rc,0x10          // test bit 27 separately (faster on G4)
        mtcrf   0x01,rc             // put length bits 28-31 in cr7
        blt-    cr1,LShortReverse
// Forward short operands. This is the most frequent case, so it is inline.

        beq     LShort16            // quadword to move?

LShort16:                           // join here to xfer 0-15 bytes
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // skip if no odd byte
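
// The 0-15 byte epilogues above and below decode the residual length directly
// from cr7: mtcrf 0x01,rc copies length bits 28-31 into cr7, so bit 28 means
// "move 8 bytes", bit 29 "4 bytes", bit 30 "2 bytes", and bit 31 "1 byte".
// For example, a residual of 13 (0b1101) moves a doubleword, a word, and a
// single byte, with no loop or counter needed.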
// Handle short reverse operands.
//      cr0 = bne if bit 27 of length is set
//      cr7 = bits 28-31 of length

        add     rs,rs,rc            // adjust ptrs for reverse move
        beq     LShortReverse16     // quadword to move?

LShortReverse16:                    // join here to xfer 0-15 bytes and return
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // done if no odd byte
        lbz     w1,-1(rs)           // no update
// Medium and long operands. Use Altivec if long enough, else scalar loops.
//      w1 = (rd-rs), used to check for alignment
//      cr1 = blt iff we must move reverse

        dcbtst  0,rd                // touch in destination
        cmplwi  cr7,rc,kLong        // long enough for vectors?
        neg     w3,rd               // start to compute #bytes to align destination
        rlwinm  r0,w1,0,0x7         // check relative 8-byte alignment
        andi.   w6,w3,7             // w6 <- #bytes to 8-byte align destination
        blt     cr1,LMediumReverse  // handle reverse moves
        rlwinm  w4,w3,0,0x1F        // w4 <- #bytes to 32-byte align destination
        cmpwi   cr6,r0,0            // set cr6 beq if relatively aligned
        bge     cr7,LFwdLong        // long enough for vectors
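
// The alignment arithmetic above relies on two's complement: for an alignment
// of N (a power of two), the number of bytes needed to bring rd up to an N-byte
// boundary is (-rd) & (N-1), hence "neg w3,rd" followed by masking with 7 or
// 0x1F. Masking (rd-rs) instead tests relative alignment: wider accesses can be
// used on both operands only when source and destination are congruent modulo 8
// (or 16), even if neither is itself aligned. Worked example: rd ending in 0x2D
// gives -rd ending in 0xD3, so (-rd) & 7 = 3 bytes to the next 8-byte boundary
// and (-rd) & 0x1F = 19 bytes to the next 32-byte boundary.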
// Medium length: use scalar loops.
//      w6/cr0 = #bytes to 8-byte align destination
//      cr6 = beq if relatively doubleword aligned

        sub     rc,rc,w6            // decrement length remaining
        beq     1f                  // skip if dest already doubleword aligned
        mtxer   w6                  // set up count for move
        lswx    w1,0,rs             // move w6 bytes to align destination
        add     rs,rs,w6            // bump ptrs past
        srwi    r0,rc,4             // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc             // save remaining byte count here for LShort16
        mtctr   r0                  // set up 16-byte loop
        bne     cr6,3f              // source not 4-byte aligned
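
// A note on the alignment move above: lswx is "load string word indexed"; it
// transfers a byte count taken from XER (set by the mtxer just before it) from
// memory into successive GPRs starting at w1, so the 1-7 byte fragment needed to
// align the destination is moved without a byte-by-byte loop. The two 16-byte
// loops that follow are selected by cr6: the aligned flavor is used only when
// source and destination are relatively doubleword aligned (see the header
// above), the other handles the unaligned case.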
2:                                  // loop over 16-byte aligned chunks

3:                                  // loop over 16-byte unaligned chunks
// Vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

        cmpwi   w4,0                // dest already aligned?
        sub     rc,rc,w4            // adjust length
        mtcrf   0x01,w4             // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF         // relatively 16-byte aligned?
        mtcrf   0x02,w4             // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6             // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        beq     LFwdAligned         // dest is already aligned
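
// cr5 selects between the two vector loops: if (rd-rs) is a multiple of 16, an
// aligned destination quadword always corresponds to an aligned source quadword
// and plain lvx/stvx pairs suffice; otherwise the unaligned loop must build each
// store operand by permuting two adjacent aligned source quadwords (see
// LForwardVecUnal below). The destination, not the source, is what gets aligned
// to 32 bytes, since the stores and the dcba hints work on whole destination
// cache lines, while source misalignment can be absorbed on the load side.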
// 32-byte align destination.

        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,4f               // doubleword?
        bf      27,LFwdAligned      // quadword?
// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave           // get bitmap of live vector registers
        mtcrf   0x01,rc             // move leftover count to cr7 for LShort16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w1,rv,0xFF00        // we use v0-v7
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1           // update mask
        li      c16,16              // get constants used in lvx/stvx
        bne     cr5,LForwardVecUnal // handle unaligned operands
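
// vrsave bookkeeping: vrsave is a bitmask of live vector registers, with the
// most-significant bit standing for v0. The "oris w1,rv,0xFF00" above therefore
// marks v0-v7 as in use (0xFF000000 once shifted into the upper halfword) so a
// context switch inside the loop preserves them; the original mask saved in rv
// is restored on every exit path back to the scalar epilogues.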
1:                                  // loop over 64-byte chunks
        dcba    0,rd                // patched to NOP on some machines
        dcba    c32,rd              // patched to NOP on some machines

        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
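
// About the dcba hints in the loop above: "data cache block allocate" claims the
// destination cache line in the data cache without fetching its old contents
// from memory. Because the destination is 32-byte aligned and each chunk writes
// a full 64 bytes, every byte of the allocated lines is overwritten, so skipping
// the read-for-ownership saves memory bandwidth. As noted in the header, only
// the 7450 wins from this; the commpage code patches these instructions into
// NOPs on 7400/7455 parts (see the kCommPageDCBA flag in the descriptor at the
// end of this file).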
// Long, forward, unaligned vector loop.

        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,0,rs             // prefetch 1st source quadword

        .align  4                   // align inner loops
1:                                  // loop over 64-byte chunks
        dcba    0,rd                // patched to NOP on some machines
        dcba    c32,rd              // patched to NOP on some machines

        beq-    4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
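
// How the unaligned loop works: lvx always loads the 16-byte-aligned quadword
// containing its effective address (the low 4 address bits are ignored), so the
// loop keeps a one-quadword look-ahead in v1 and forms each misaligned result by
// permuting two adjacent aligned quadwords. lvsl turns the source misalignment
// into the permute control vector vp, and the permute then selects the 16 bytes
// that span the pair; "vor v1,v2,v2" simply slides the look-ahead register
// forward for the next iteration. The stores still target an aligned
// destination, so only the loads need this treatment.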
// Medium and long, reverse moves. We use altivec if the operands are long enough,
// else a lwz/stx loop.
//      w1 = (rd-rs), used to check for reverse and alignment

        add     rd,rd,rc            // point to end of operands
        andi.   w4,rd,0x1F          // w4 <- #bytes to 32-byte align destination
        rlwinm  w6,rd,0,0x3         // w6 <- #bytes to 4-byte align destination
        bge     cr7,LLongReverse    // long enough for vectors

//      w6 = #bytes to 4-byte align destination

        sub     rc,rc,w6            // decrement length remaining
        mtxer   w6                  // set up count for move
        sub     rs,rs,w6            // back up ptrs
        srwi    r0,rc,4             // get # 16-byte chunks (>=1)
        mtcrf   0x01,rc             // set remaining byte count here for LShortReverse16
        lswx    w1,0,rs             // move w6 bytes to align destination
        mtctr   r0                  // set up 16-byte loop

1:                                  // loop over 16-byte aligned chunks
// Reverse vector loops. First, we must 32-byte align the destination.
//      w1 = (rd-rs), used to check for reverse and alignment
//      w4/cr0 = #bytes to 32-byte align destination
//      rc = long enough for at least one vector loop

        sub     rc,rc,w4            // adjust length
        mtcrf   0x01,w4             // cr7 <- #bytes to align dest
        rlwinm  w2,w1,0,0xF         // relatively 16-byte aligned?
        mtcrf   0x02,w4             // finish moving #bytes to align to cr6 and cr7
        srwi    r0,rc,6             // get # 64-byte chunks to xfer (>=1)
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        beq     LReverseAligned     // dest is already aligned
// 32-byte align destination.

        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,4f               // doubleword?
        bf      27,LReverseAligned  // quadword?
// Destination is 32-byte aligned.
//      r0 = count of 64-byte chunks to move (not 0)
//      rd = 32-byte aligned
//      rc = bytes remaining
//      cr5 = beq if source is 16-byte aligned
// We set up many registers:
//      ctr = number of 64-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave           // get bitmap of live vector registers
        mtcrf   0x01,rc             // move leftover count to cr7 for LShortReverse16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w1,rv,0xFF00        // we use v0-v7
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        mtspr   vrsave,w1           // update mask
        li      cm1,-1              // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal // handle unaligned operands
        .align  4                   // align inner loops
1:                                  // loop over 64-byte chunks

        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.

        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,cm1,rs           // v1 always looks ahead

        .align  4                   // align the inner loops
1:                                  // loop over 64-byte chunks

        beq     3f                  // no leftover quadwords
2:                                  // loop over 1-3 quadwords
        vor     v1,v2,v2            // v1 <- v2
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
        COMMPAGE_DESCRIPTOR(bcopy_g4,_COMM_PAGE_BCOPY,kHasAltivec,k64Bit,kCommPageDCBA+kCommPage32)
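
// The COMMPAGE_DESCRIPTOR above registers this routine with the commpage
// machinery. Reading its arguments as (label, commpage address, required
// features, prohibited features, special flags), an interpretation from context
// (see machine/commpage.h for the authoritative macro): the code is installed
// at _COMM_PAGE_BCOPY only on CPUs that have Altivec (kHasAltivec) and are not
// 64-bit (k64Bit), and kCommPageDCBA tells the installer that this routine
// contains DCBA instructions to be patched into NOPs on processors where that
// hint does not help, matching the note at the top of this file. kCommPage32
// presumably marks it as belonging to the 32-bit commpage.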