/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage. Note the rather delicate way we assign multiple uses
 * to the same register. Beware.
 * r0 = temp (NB: cannot use r0 for any constant such as "c16")
 * r3 = not used, as memcpy and memmove return 1st parameter as a value
 * r4 = source ptr ("rs")
 * r5 = count of bytes to move ("rc")
 * r6 = "w1", "c16", or "cm17"
 * r7 = "w2", "c32", or "cm33"
 * r8 = "w3", "c48", or "cm49"
 * r12 = destination ptr ("rd")
 * v0 = permute vector ("vp")
 * v1-v8 = qw's loaded from source
 * v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done. For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas. It is perhaps surprising how well this simple
 * method works in practice.
 */
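/*
 * Illustrative note (not part of the original source): a minimal C sketch of the
 * mode-invariance rule above, using a hypothetical value x held in a GPR. Masking
 * low-order bits ("andi.") gives the same result whether the register is viewed as
 * 32 or 64 bits wide, but a logical right shift ("srw"/"srwi") does not once high
 * bits are set, which is why the porting step must rewrite "srwi[.]" as "srdi[.]".
 *
 *   #include <assert.h>
 *   #include <stdint.h>
 *
 *   int main(void) {
 *       uint64_t x = 0x100000030ULL;             // high bits visible only in 64-bit mode
 *       // low-order mask: identical in both register widths (safe to record)
 *       assert((x & 0xF) == ((uint32_t)x & 0xF));
 *       // logical right shift: differs once high bits participate (not mode-invariant)
 *       assert((x >> 4) != (uint64_t)((uint32_t)x >> 4));
 *       return 0;
 *   }
 */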
#define kVeryLong       (128*1024)


// Main entry points.
bcopy_970:                          // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort           // short or long?
        sub     w1,r4,r3            // must move in reverse if (rd-rs)<rc
        mr      rd,r4               // move registers to canonic spot
        blt     LShort              // handle short operands
        dcbt    0,rs                // touch in the first line of source
        dcbtst  0,rd                // touch in destination
        b       LLong1              // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
Lmemcpy_970:                        // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                       // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort           // short or long?
        sub     w1,r3,r4            // must move in reverse if (rd-rs)<rc
        mr      rd,r3               // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0              // handle long operands
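/*
 * Illustrative note (not part of the original source): a minimal C sketch, with
 * hypothetical names, of the overlap test used above. Computing (rd-rs) as an
 * unsigned quantity and comparing it against the length detects the one dangerous
 * kind of overlap (destination starting inside the source) in a single compare,
 * because a destination below the source wraps around to a huge unsigned difference.
 *
 *   #include <stdint.h>
 *   #include <stddef.h>
 *
 *   // returns nonzero if the copy must run backwards (high address to low)
 *   static int must_copy_reverse(const void *dst, const void *src, size_t len) {
 *       uintptr_t diff = (uintptr_t)dst - (uintptr_t)src;    // sub w1,r3,r4
 *       return diff < len;                                   // cmplw cr1,w1,rc ; blt
 *   }
 */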
// Handle short operands.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

        cmplw   cr1,w1,rc           // set cr1 blt if we must move reverse
        mtcrf   0x02,rc             // move length to cr6 and cr7 one at a time
        blt--   cr1,LShortReverse

// Forward short operands. This is the most frequent case, so it is inline.

        bf      26,0f               // 32-byte chunk to move?
        bf      27,1f               // quadword to move?
LShort16:                           // join here to xfer 0-15 bytes
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // skip if no odd byte
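/*
 * Illustrative note (not part of the original source): a minimal C model, with
 * hypothetical names, of the bit-driven short copy above. mtcrf moves the low
 * bits of the length into cr6/cr7, and each "bf" skips the chunk whose size
 * corresponds to that bit (CR bit 26 is the 32's bit, bit 31 the 1's bit), so a
 * 0-63 byte move is fully unrolled with no loop.
 *
 *   #include <stdint.h>
 *   #include <stddef.h>
 *   #include <string.h>
 *
 *   static void copy_short_model(uint8_t *dst, const uint8_t *src, size_t n) {
 *       if (n & 32) { memcpy(dst, src, 32); dst += 32; src += 32; }  // bit 26: 32-byte chunk
 *       if (n & 16) { memcpy(dst, src, 16); dst += 16; src += 16; }  // bit 27: quadword
 *       if (n & 8)  { memcpy(dst, src, 8);  dst += 8;  src += 8;  }  // bit 28: doubleword
 *       if (n & 4)  { memcpy(dst, src, 4);  dst += 4;  src += 4;  }  // bit 29: word
 *       if (n & 2)  { memcpy(dst, src, 2);  dst += 2;  src += 2;  }  // bit 30: halfword
 *       if (n & 1)  { *dst = *src; }                                 // bit 31: odd byte
 *   }
 */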
// Handle short reverse operands.
//      cr = length in bits 26-31

        add     rs,rs,rc            // adjust ptrs for reverse move
        bf      26,0f               // 32 bytes to move?
        bf      27,1f               // quadword to move?
LShortReverse16:                    // join here to xfer 0-15 bytes and return
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // done if no odd byte
        lbz     w1,-1(rs)           // no update
// Long operands, use Altivec in most cases.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                             // entry from memmove()
        dcbt    0,rs                // touch in source
        dcbtst  0,rd                // touch in destination
LLong1:                             // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        neg     w3,rd               // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF         // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF           // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse    // handle reverse moves
        sub     rc,rc,w4            // adjust length for aligning destination
        srwi    r0,rc,7             // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0            // set cr1 on #chunks
        beq     LFwdAligned         // dest is already aligned
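/*
 * Illustrative note (not part of the original source): a minimal C sketch, with
 * hypothetical names, of the alignment arithmetic above. Negate-and-mask computes
 * how many bytes must be copied before the destination reaches a 16-byte boundary,
 * and masking (rd-rs) tells whether source and destination are *relatively*
 * 16-byte aligned, i.e. whether the source will be aligned too once the
 * destination is.
 *
 *   #include <stdint.h>
 *
 *   static unsigned bytes_to_align16(const void *dst) {
 *       return (unsigned)(-(uintptr_t)dst & 0xF);       // neg w3,rd ; andi. w4,w3,0xF
 *   }
 *
 *   static int relatively_aligned16(const void *dst, const void *src) {
 *       return (((uintptr_t)dst - (uintptr_t)src) & 0xF) == 0;  // rlwinm w2,w1,0,0xF ; cmpwi cr5,w2,0
 *   }
 */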
// 16-byte align destination.

        mtcrf   0x01,w4             // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,LFwdAligned      // doubleword?
// Forward, destination is 16-byte aligned. There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops. This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks. This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks. This is the slowest case.
// (A C sketch of this dispatch follows the register summary below.)
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//          rs = alignment unknown
//          rd = 16-byte aligned
//          rc = bytes remaining
//          w2 = low 4 bits of (rd-rs), used to check alignment
//         cr5 = beq if source is also 16-byte aligned
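/*
 * Illustrative note (not part of the original source): a minimal C sketch of the
 * five-way dispatch described above, with hypothetical names. It only chooses a
 * strategy; the actual copy loops are the assembly below. Because the destination
 * is already 16-byte aligned here, the relative alignment (rd-rs) also gives the
 * source alignment.
 *
 *   #include <stdint.h>
 *   #include <stddef.h>
 *
 *   #define kVeryLong (128*1024)
 *
 *   enum strategy { BIGCOPY, LVX_STVX_128, LVX_VPERM_STVX_128, LD_STD_32, LVX_VPERM_STVX_32 };
 *
 *   static enum strategy choose_forward(const void *dst, const void *src, size_t len) {
 *       size_t rel = (uintptr_t)dst - (uintptr_t)src;                 // relative alignment
 *       if (len >= kVeryLong)  return BIGCOPY;                                    // case 1
 *       if (len >= 128)
 *           return (rel & 0xF) == 0 ? LVX_STVX_128 : LVX_VPERM_STVX_128;          // cases 2, 3
 *       return (rel & 0x7) == 0 ? LD_STD_32 : LVX_VPERM_STVX_32;                  // cases 4, 5
 *   }
 */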
        andi.   w3,w2,7             // is source at least 8-byte aligned?
        mtcrf   0x01,rc             // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5             // get 32-byte chunk count
        mtcrf   0x02,rc             // move bit 27 of length to cr6 for LShort32
        mtctr   w1                  // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned      // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave           // get bitmap of live vector registers
        oris    w4,rv,0xFFF8        // we use v0-v12
        li      c16,16              // get constant used in lvx
        mtspr   vrsave,w4           // update mask
        lvx     v1,0,rs             // prefetch 1st source quadword
        lvsl    vp,0,rs             // get permute vector to shift left
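/*
 * Illustrative note (not part of the original source): a minimal C model, with
 * hypothetical names, of the lvx/lvsl/vperm technique used by the unaligned loops
 * below. Only 16-byte aligned quadwords are ever loaded (lvx ignores the low 4
 * address bits); vperm then selects 16 consecutive bytes spanning two adjacent
 * aligned quadwords, steered by the lvsl-generated permute vector, so the store
 * side can always use aligned stvx. Assumes len is a multiple of 16 and tolerates
 * the same one-quadword lookahead read as the real loops.
 *
 *   #include <stdint.h>
 *   #include <string.h>
 *
 *   static void copy_unaligned_src_model(uint8_t *dst16, const uint8_t *src, size_t len) {
 *       const uint8_t *aligned = (const uint8_t *)((uintptr_t)src & ~(uintptr_t)0xF);
 *       unsigned shift = (unsigned)((uintptr_t)src & 0xF);   // what lvsl encodes in vp
 *       uint8_t prev[16], next[16], out[16];
 *       memcpy(prev, aligned, 16);                           // lvx v1,0,rs (prefetch)
 *       for (size_t off = 0; off < len; off += 16) {
 *           memcpy(next, aligned + off + 16, 16);            // lvx: next aligned quadword
 *           for (unsigned i = 0; i < 16; i++)                // vperm: select across the pair
 *               out[i] = (shift + i < 16) ? prev[shift + i] : next[shift + i - 16];
 *           memcpy(dst16 + off, out, 16);                    // stvx to aligned destination
 *           memcpy(prev, next, 16);                          // vor v1,v3,v3 (carry ahead)
 *       }
 *   }
 */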
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                  // loop over 32-byte chunks
        vor     v1,v3,v3            // v1 <- v3
        mtspr   vrsave,rv           // restore bitmap of live vr's

// Fewer than 128 bytes and doubleword aligned: use ld/std.

LFwdMedAligned:                     // loop over 32-byte chunks
// Forward, 128 bytes or more: use vectors. When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded
        mfspr   rv,vrsave           // get bitmap of live vector registers
        lis     w3,kVeryLong>>16    // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3           // very long operand?
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy        // handle big copies separately
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8        // we use v0-v12
        rlwinm. r0,rc,28,29,31      // get number of quadword leftovers (0-7) and set cr0
        li      c16,16              // get constants used in lvx/stvx
        mtspr   vrsave,w4           // update mask
        beq     cr5,LFwdLongAligned // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,0,rs             // prefetch 1st source quadword
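/*
 * Illustrative note (not part of the original source): the rlwinm decodes above
 * split the remaining length three ways. A minimal C sketch with hypothetical
 * names:
 *
 *   #include <stddef.h>
 *
 *   static void split_forward_length(size_t rc, size_t *chunks128,
 *                                    size_t *leftover_qws, size_t *leftover_bytes) {
 *       *chunks128      = rc >> 7;          // srwi r0,rc,7 (computed earlier)
 *       *leftover_qws   = (rc >> 4) & 7;    // rlwinm. r0,rc,28,29,31
 *       *leftover_bytes = rc & 0xF;         // rlwinm w3,rc,0,28,31
 *   }
 */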
// Forward, long, unaligned vector loop.

        .align  5                   // align inner loops
LFwdLongUnaligned:                  // loop over 128-byte chunks
        bdnz    LFwdLongUnaligned

        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Forward, long, 16-byte aligned vector loop.

LFwdLongAligned:                    // loop over 128-byte chunks
        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords (1-7)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Long, reverse moves.
//     cr5 = beq if relatively 16-byte aligned

        add     rd,rd,rc            // point to end of operands
        andi.   r0,rd,0xF           // #bytes to 16-byte align destination
        beq     2f                  // already aligned

// 16-byte align destination.

        mtctr   r0                  // set up for loop
// Prepare for reverse vector loop. When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
        mfspr   rv,vrsave           // get bitmap of live vector registers
        srwi    r0,rc,6             // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8        // we use v0-v12
        mtcrf   0x01,rc             // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0            // set cr1 on chunk count
        mtspr   vrsave,w1           // update mask
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1              // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal // handle unaligned operands
        beq     cr1,2f              // no chunks (if no chunks, must be leftover QWs)
// Long, reverse 16-byte-aligned vector loop.

        .align  5                   // align inner loops
1:                                  // loop over 64-byte chunks
        beq     4f                  // no leftover quadwords
2:                                  // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
3:                                  // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,cm1,rs           // v1 always looks ahead
        beq     cr1,2f              // no chunks (if no chunks, must be leftover QWs)

        .align  5                   // align the inner loops
1:                                  // loop over 64-byte chunks
        beq     4f                  // no leftover quadwords
2:                                  // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
3:                                  // loop over 1-3 quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Very Big Copy Path. Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r5 = length (at least several pages)

        lis     r2,0x4000           // r2 <- 0x40000000
        mflr    r0                  // get our return address
        add.    r2,r2,r2            // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)            // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY  // 32-bit mode, join big operand copy
        std     r0,16(r1)           // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY  // then join big operand code
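/*
 * Illustrative note (not part of the original source): the add.-based mode test
 * above relies on 0x40000000 + 0x40000000 = 0x80000000 having its sign bit set
 * only when the register is 32 bits wide. A minimal C sketch of the same idea,
 * with a hypothetical register-width parameter (the 32-bit cast is the usual
 * two's-complement wrap):
 *
 *   #include <stdint.h>
 *
 *   // returns nonzero when the doubled value is negative in the given width,
 *   // i.e. when we would take the blta (32-bit) path above
 *   static int cr0_lt_after_double(int reg_is_32bit) {
 *       if (reg_is_32bit) {
 *           int32_t r2 = (int32_t)((uint32_t)0x40000000 + (uint32_t)0x40000000);  // 0x80000000
 *           return r2 < 0;                       // sign bit set: 32-bit mode detected
 *       } else {
 *           int64_t r2 = 0x40000000LL + 0x40000000LL;   // 0x80000000, still positive
 *           return r2 < 0;                       // fall through to the 64-bit path
 *       }
 *   }
 */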
        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)