/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the
 * License may not be used to create, or enable the creation or
 * redistribution of, unlawful or unlicensed copies of an Apple operating
 * system, or to circumvent, violate, or enable the circumvention or
 * violation of, any terms of an Apple operating system software license
 * agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        (for example, all "andi." and almost all "rlwinm." are fine)
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well this simple
 * method works in practice.
 */
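// A concrete illustration of the porting model above (an editor's sketch applying the
// listed transformations to instructions that appear later in this file): in the 64-bit
// commpage, a word compare such as
//      cmplwi  rc,kShort
// becomes the doubleword form
//      cmpldi  rc,kShort
// and a shift such as "srwi r0,rc,7" becomes "srdi r0,rc,7", while record-form masks
// like "andi. w4,w3,0xF" are left alone because their CR0 result is mode-invariant.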
#define kVeryLong       (128*1024)

// Main entry points.
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonical spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands
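// In C terms, the reverse-vs-forward test above is roughly (an illustrative sketch,
// not part of the commpage sources):
//
//      static int must_copy_backwards(const void *dst, const void *src, size_t len) {
//          return (size_t)((const char *)dst - (const char *)src) < len;
//      }
//
// Unsigned wraparound does the overlap analysis: the difference is < len only when the
// destination starts inside the source buffer; a destination below the source wraps to
// a huge unsigned value, compares >= len, and the forward path is taken.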
// Handle short operands.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse
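// With the low-order length bits now in cr6/cr7, the short paths below test one CR bit
// per power of two (PPC numbers bit 31 as the least-significant length bit): bit 26
// selects a 32-byte chunk, 27 a quadword, 28 a doubleword, 29 a word, 30 a halfword,
// and 31 a final odd byte, so a short move never needs a loop.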
// Forward short operands.  This is the most frequent case, so it is inline.

LShort32:
        bf      26,0f                   // 32-byte chunk to move?
        bf      27,1f                   // quadword to move?
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // skip if no odd byte
// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        bf      27,1f                   // quadword to move?
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr
// Long operands, use Altivec in most cases.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned?  (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        bf      30,2f                   // halfword?
        bf      28,LFwdAligned          // doubleword?
// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as operands this long are likely to be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
//     (The dispatch among these cases is sketched just below.)
// Registers at this point:
//  r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs = alignment unknown
//      rd = 16-byte aligned
//      rc = bytes remaining
//      w2 = low 4 bits of (rd-rs), used to check alignment
//     cr5 = beq if source is also 16-byte aligned
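// In rough C terms, the dispatch over the five cases above is (an illustrative sketch;
// the label names are invented for the sketch, not labels used in this file):
//
//      if (len >= kVeryLong)                                   goto bigcopy;           // case 1
//      else if (len >= 128) {
//          if ((((uintptr_t)dst - (uintptr_t)src) & 0xF) == 0) goto vec_aligned_128;   // case 2
//          else                                                goto vec_vperm_128;     // case 3
//      } else {
//          if ((((uintptr_t)dst - (uintptr_t)src) & 0x7) == 0) goto ld_std_32;         // case 4
//          else                                                goto vec_vperm_32;      // case 5
//      }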
LFwdAligned:                            // dest is 16-byte aligned
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left
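// The lvsl/lvx pair set up above is the usual AltiVec idiom for reading a misaligned
// stream: lvx ignores the low 4 bits of its address, so each iteration keeps the
// previously loaded quadword, loads the next aligned one, and merges the two through
// the permute vector, roughly (register names illustrative):
//      lvx     vNext,c16,rs            // next aligned source quadword
//      vperm   vOut,vPrev,vNext,vp     // the 16 misaligned bytes spanning both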
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                      // loop over 32-byte chunks
        vor     v1,v3,v3                // v1 <- v3
        mtspr   vrsave,rv               // restore bitmap of live vr's

// Fewer than 128 bytes and doubleword aligned: use ld/std.

LFwdMedAligned:                         // loop over 32-byte chunks
// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//   c16,c32,c48 = loaded

LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
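// (Both rlwinm forms above are plain bit-field extracts: "rlwinm w3,rc,0,28,31" is
// rc & 0xF, the final 0-15 bytes, and "rlwinm. r0,rc,28,29,31" is (rc >> 4) & 7, the
// number of leftover 16-byte quadwords after the last full 128-byte chunk.)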
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        bdnz    LFwdLongUnaligned
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any

// Forward, long, 16-byte aligned vector loop.

LFwdLongAligned:                        // loop over 128-byte chunks
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords (1-7)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
// Long, reverse moves.
//      cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop

// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx
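// In the reverse path the "constants" are negative offsets (cm1 here, and cm17/cm33/cm49
// in the register map at the top of the file), so each lvx/stvx indexed by them addresses
// the quadword just below the pointer and the loops walk down through memory.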
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
3:                                      // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
3:                                      // loop over 1-3 quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
// Very Big Copy Path.  Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r5 = length (at least several pages)

LBigCopy:
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
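// (The add.-to-itself trick above relies on 0x40000000 + 0x40000000 = 0x80000000: in
// 32-bit mode the record form sees a negative 32-bit result and sets cr0_lt, while in
// 64-bit mode the 64-bit sum is positive, so lt stays clear and we fall through to the
// 64-bit store below.)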
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code
        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)