/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage. Note the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp")
 *   v1-v8 = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done. For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant,
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas. It is perhaps surprising how well in practice
 * this simple method works.
 */
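/*
 * For illustration only (a descriptive sketch, not part of the build): under the
 * rules above, the kernel's mechanical port would rewrite, for example,
 *      cmplw   cr1,w1,rc   ->   cmpld   cr1,w1,rc     (word compare -> doubleword)
 *      srwi    r0,rc,7     ->   srdi    r0,rc,7       ("srwi[.]"    -> "srdi[.]")
 * while mode-invariant record forms such as "andi." and "rlwinm." pass through unchanged.
 */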
#define kVeryLong       (128*1024)
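// Note: kVeryLong is 128 KB (131072 bytes), i.e. several pages; operands at least this
// long take the _COMM_PAGE_BIGCOPY path (see LBigCopy below).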
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonical spot
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is the return value for memcpy etc.
        bge     LLong0                  // handle long operands
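// Note on the overlap test (a C-like sketch for illustration only):
//      if ((unsigned)(dst - src) < len)        // dst lies inside [src, src+len)
//          copy backwards;                     //   ...take the ...Reverse paths
//      else
//          copy forwards;
// A single unsigned compare suffices: when dst is below src, (dst-src) wraps to a huge
// unsigned value, so the forward path is chosen, which is safe for that kind of overlap.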
// Handle short operands.
//      w1 = (rd-rs), must move in reverse if (rd-rs)<rc

        cmplw   cr1,w1,rc               // set cr1 blt if we must move in reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        blt--   cr1,LShortReverse
// Forward short operands. This is the most frequent case, so it is inline.

        bf      26,0f                   // 32-byte chunk to move?
        bf      27,1f                   // quadword to move?
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // skip if no odd byte
// Handle short reverse operands.
//      cr = length in bits 26-31

        add     rs,rs,rc                // adjust ptrs for reverse move
        bf      26,0f                   // 32 bytes to move?
        bf      27,1f                   // quadword to move?
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
// Long operands, use Altivec in most cases.
//      w1 = (rd-rs), must move in reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move in reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned
// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        bf      30,2f                   // halfword?
        bf      28,LFwdAligned          // doubleword?
// Forward, destination is 16-byte aligned. There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops. This is the fastest
//     case for cold-cache operands, as operands this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks. This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks. This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//          rs = alignment unknown
//          rd = 16-byte aligned
//          rc = bytes remaining
//          w2 = low 4 bits of (rd-rs), used to check alignment
//         cr5 = beq if source is also 16-byte aligned
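// The five cases above, as a C-like sketch (illustrative only; the actual code tests
// the chunk count first and checks kVeryLong inside LFwdLongVectors):
//      if      (len >= kVeryLong)                   -> LBigCopy                       (case 1)
//      else if (len >= 128 && src 16-byte aligned)  -> LFwdLongAligned                (case 2)
//      else if (len >= 128)                         -> LFwdLongUnaligned              (case 3)
//      else if (src 8-byte aligned)                 -> LFwdMedAligned                 (case 4)
//      else                                         -> lvx/vperm/stvx 32-byte loop    (case 5)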
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left
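// A note on the misaligned-source technique used here and in LFwdLongUnaligned
// (descriptive comment only): "lvsl" builds a permute control vector from the low
// four bits of the source address, and each "vperm" (e.g. vperm vw,v1,v2,vp) then
// selects the 16 in-order source bytes that straddle the two aligned quadwords v1
// and v2. Carrying the last quadword forward (the "vor v1,v3,v3" style copies) lets
// each iteration issue only aligned loads, with no unaligned accesses at all.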
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                      // loop over 32-byte chunks
        vor     v1,v3,v3                // v1 <- v3
        mtspr   vrsave,rv               // restore bitmap of live vr's

// Fewer than 128 bytes and doubleword aligned: use ld/std.

LFwdMedAligned:                         // loop over 32-byte chunks
// Forward, 128 bytes or more: use vectors. When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        bdnz    LFwdLongUnaligned
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any

// Forward, long, 16-byte aligned vector loop.

LFwdLongAligned:                        // loop over 128-byte chunks
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords (1-7)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
// Long, reverse moves.
//     cr5 = beq if relatively 16-byte aligned

        add     rd,rd,rc                // point to end of operands
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop

// Prepare for reverse vector loop. When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
3:                                      // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any

// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
3:                                      // loop over 1-3 quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
// Very Big Copy Path. Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//      r0 = return address (also stored in caller's SF)
//      r5 = length (at least several pages)
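// How the mode test below works (descriptive note): r2 is loaded with 0x40000000, and
// "add. r2,r2,r2" doubles it to 0x80000000 while recording the sign of the result.
// In 32-bit mode that sum is negative, so cr0_lt is set and "blta" takes the 32-bit
// path; in 64-bit mode the 64-bit result is positive, cr0_lt stays clear, and execution
// falls through to the 64-bit save and branch.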
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code
        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)