/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant;
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
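// As a minimal illustration of the porting model (a hypothetical fragment, not code
// taken from this routine), the kernel would mechanically rewrite a 32-bit sequence
// such as
//      cmplw   cr1,w1,rc           // word compare
//      srwi    r0,rc,7             // word shift right
// into the 64-bit form
//      cmpld   cr1,w1,rc           // doubleword compare
//      srdi    r0,rc,7             // doubleword shift right
// and change nothing else, which is why the rules above must be observed.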
#define kVeryLong       (128*1024)
// Main entry points.

bcopy_970:                          // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort           // short or long?
        sub     w1,r4,r3            // must move in reverse if (rd-rs)<rc
        mr      rd,r4               // move registers to canonical spots
        mr      rs,r3
        blt     LShort              // handle short operands
        dcbt    0,rs                // touch in the first line of source
        dcbtst  0,rd                // touch in destination
        b       LLong1              // join long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_970:                        // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                       // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort           // short or long?
        sub     w1,r3,r4            // must move in reverse if (rd-rs)<rc
        mr      rd,r3               // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0              // handle long operands
// Handle short operands.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc
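// (A worked example of the test, with hypothetical values: if rd = rs+8 and rc = 32,
// then w1 = rd-rs = 8 < 32 = rc, so the start of the destination overlaps the tail of
// the source and we must copy backward; if rd < rs, the unsigned difference wraps to a
// huge value >= rc and the forward path is taken.)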
LShort:
        cmplw   cr1,w1,rc           // set cr1 blt if we must move reverse
        mtcrf   0x02,rc             // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse
// Forward short operands.  This is the most frequent case, so it is inline.

        bf      26,0f               // 32-byte chunk to move?
        bf      27,1f               // quadword to move?
LShort16:                           // join here to xfer 0-15 bytes
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // skip if no odd byte
// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc            // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f               // 32 bytes to move?
        bf      27,1f               // quadword to move?
LShortReverse16:                    // join here to xfer 0-15 bytes and return
        bf      28,2f               // doubleword?
        bf      30,4f               // halfword to move?
        bflr    31                  // done if no odd byte
        lbz     w1,-1(rs)           // no update
        stb     w1,-1(rd)
        blr
// Long operands, use Altivec in most cases.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                             // entry from memmove()
        dcbt    0,rs                // touch in source
        dcbtst  0,rd                // touch in destination
LLong1:                             // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc           // set cr1 blt iff we must move reverse
        neg     w3,rd               // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF         // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF           // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0            // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse    // handle reverse moves
        sub     rc,rc,w4            // adjust length for aligning destination
        srwi    r0,rc,7             // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0            // set cr1 on #chunks
        beq     LFwdAligned         // dest is already aligned
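// (Example of the alignment arithmetic above, with a hypothetical pointer: if rd ends
// in 0x9, then w3 = -rd ends in 0x7, so w4 = 7 bytes must be moved before the
// destination becomes 16-byte aligned.)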
// 16-byte align destination.

        mtcrf   0x01,w4             // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f               // byte to move?
        bf      30,2f               // halfword?
        bf      28,LFwdAligned      // doubleword?
// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as any operand this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//          rs = alignment unknown
//          rd = 16-byte aligned
//          rc = bytes remaining
//          w2 = low 4 bits of (rd-rs), used to check alignment
//         cr5 = beq if source is also 16-byte aligned
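// In outline, the code below dispatches the five cases above (a restatement for
// readability, not new logic):
//      if (chunk count != 0)   branch to LFwdLongVectors, which branches to LBigCopy
//                              if rc>=kVeryLong, to LFwdLongAligned if the source is
//                              also 16-byte aligned, and otherwise falls into the
//                              LFwdLongUnaligned loop
//      else if (source is 8-byte aligned)  use the ld/std loop at LFwdMedAligned
//      else                                use the 32-byte lvx/vperm/stvx loop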
LFwdAligned:                        // dest is 16-byte aligned
        andi.   w3,w2,7             // is source at least 8-byte aligned?
        mtcrf   0x01,rc             // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5             // get 32-byte chunk count
        mtcrf   0x02,rc             // move bit 27 of length to cr6 for LShort32
        mtctr   w1                  // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned      // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave           // get bitmap of live vector registers
        oris    w4,rv,0xFFF8        // we use v0-v12
        li      c16,16              // get constant used in lvx
        mtspr   vrsave,w4           // update mask
        lvx     v1,0,rs             // prefetch 1st source quadword
        lvsl    vp,0,rs             // get permute vector to shift left
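// (Illustrative use of the permute vector, a hypothetical fragment rather than the loop
// below: with vp obtained from lvsl, two adjacent source quadwords are merged into one
// destination-aligned quadword:
//      lvx     v2,c16,rs           // next source quadword
//      vperm   vw,v1,v2,vp         // shift v1||v2 left so data lines up with rd
//      stvx    vw,0,rd             // store one aligned quadword
// The real loops unroll this pattern and advance rs and rd as they go.)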
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                  // loop over 32-byte chunks
        vor     v1,v3,v3            // v1 <- v3
        mtspr   vrsave,rv           // restore bitmap of live vr's
// Fewer than 128 bytes and doubleword aligned: use ld/std.

LFwdMedAligned:                     // loop over 32-byte chunks
// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//      cr5 = beq if source is 16-byte aligned
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//      ctr = number of 128-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded

LFwdLongVectors:
        mfspr   rv,vrsave           // get bitmap of live vector registers
        lis     w3,kVeryLong>>16    // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3           // very long operand?
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy        // handle big copies separately
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8        // we use v0-v12
        rlwinm. r0,rc,28,29,31      // get number of quadword leftovers (0-7) and set cr0
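// (The two rlwinm forms above are plain field extracts: "rlwinm w3,rc,0,28,31" keeps
// bits 28-31, ie w3 = rc & 0xF, the leftover 0-15 bytes; "rlwinm. r0,rc,28,29,31"
// rotates left 28 (= right 4) and keeps the low 3 bits, ie r0 = (rc>>4) & 7, the
// leftover quadword count, with cr0 set on the result.)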
        li      c16,16              // get constants used in lvx/stvx
        mtspr   vrsave,w4           // update mask
        beq     cr5,LFwdLongAligned // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,0,rs             // prefetch 1st source quadword
// Forward, long, unaligned vector loop.

        .align  5                   // align inner loops
LFwdLongUnaligned:                  // loop over 128-byte chunks
        bdnz    LFwdLongUnaligned
        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Forward, long, 16-byte aligned vector loop.

LFwdLongAligned:                    // loop over 128-byte chunks
        beq     4f                  // no leftover quadwords
3:                                  // loop over remaining quadwords (1-7)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShort16        // handle last 0-15 bytes if any
// Long, reverse moves.
//      cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc            // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF           // #bytes to 16-byte align destination
        beq     2f                  // already aligned

// 16-byte align destination.

        mtctr   r0                  // set up for loop
// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//      cr5 = beq if source also 16-byte aligned
// We set up many registers:
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave           // get bitmap of live vector registers
        srwi    r0,rc,6             // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8        // we use v0-v12
        mtcrf   0x01,rc             // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31       // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0            // set cr1 on chunk count
        mtspr   vrsave,w1           // update mask
        mtctr   r0                  // set up loop count
        cmpwi   cr6,w3,0            // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31      // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1              // get constants used in lvx/stvx
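// (cm1 = -1 biases the indexed lvx/stvx: those instructions ignore the low 4 bits of
// the effective address, so "lvx v1,cm1,rs" fetches the quadword containing the byte
// at rs-1, ie the one just below rs, which is what a reverse copy wants.)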
        bne     cr5,LReverseVecUnal // handle unaligned operands
        beq     cr1,2f              // no chunks (if no chunks, must be leftover QWs)

// Long, reverse 16-byte-aligned vector loop.

        .align  5                   // align inner loops
1:                                  // loop over 64-byte chunks
        beq     4f                  // no leftover quadwords
2:                                  // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
3:                                  // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

LReverseVecUnal:
        lvsl    vp,0,rs             // get permute vector to shift left
        lvx     v1,cm1,rs           // v1 always looks ahead
        beq     cr1,2f              // no chunks (if no chunks, must be leftover QWs)

        .align  5                   // align the inner loops
1:                                  // loop over 64-byte chunks
        beq     4f                  // no leftover quadwords
2:                                  // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
3:                                  // loop over 1-3 quadwords
        vor     v1,v2,v2            // v1 <- v2
        mtspr   vrsave,rv           // restore bitmap of live vr's
        bne     cr6,LShortReverse16 // handle last 0-15 bytes if any
// Very Big Copy Path.  Save our return address on the stack to help decode backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r5 = length (at least several pages)
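// (A note on the mode test below: 0x40000000 + 0x40000000 = 0x80000000, which the
// record form "add." sees as negative in 32-bit mode but as a positive 64-bit value
// in 64-bit mode, so cr0_lt is set only when running in 32-bit mode.)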
LBigCopy:
        lis     r2,0x4000           // r2 <- 0x40000000
        mflr    r0                  // get our return address
        add.    r2,r2,r2            // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)            // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY  // 32-bit mode, join big operand copy
        std     r0,16(r1)           // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY  // then join big operand code
        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)