/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0     = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3     = not used, as memcpy and memmove return 1st parameter as a value
 *   r4     = source ptr ("rs")
 *   r5     = count of bytes to move ("rc")
 *   r6     = "w1", "c16", or "cm17"
 *   r7     = "w2", "c32", or "cm33"
 *   r8     = "w3", "c48", or "cm49"
 *   r12    = destination ptr ("rd")
 *   v0     = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

#define kVeryLong       (128*1024)
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonic spot
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands
// Handle short operands.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        blt--   cr1,LShortReverse
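// In rough C terms, the forward/reverse decision above is the classic unsigned
// wraparound test (a sketch only; dst, src, and len stand for rd, rs, and rc,
// and copy_forward/copy_reverse are hypothetical helpers):
//
//      if ((unsigned long)(dst - src) < len)
//          copy_reverse(dst, src, len);    // dst overlaps not-yet-read source bytes
//      else
//          copy_forward(dst, src, len);    // safe to copy in ascending order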
// Forward short operands.  This is the most frequent case, so it is inline.

        bf      26,0f                   // 32-byte chunk to move?
        bf      27,1f                   // quadword to move?
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // skip if no odd byte
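// The branch chain above tests individual bits of the length (placed in cr6/cr7
// by the mtcrf instructions) and moves one chunk per set bit.  A rough C
// equivalent, assuming a hypothetical move(&dst,&src,n) that copies n bytes and
// advances both pointers:
//
//      if (len & 32) move(&dst, &src, 32);     // 32-byte chunk
//      if (len & 16) move(&dst, &src, 16);     // quadword
//      if (len & 8)  move(&dst, &src, 8);      // doubleword
//      if (len & 4)  move(&dst, &src, 4);      // word
//      if (len & 2)  move(&dst, &src, 2);      // halfword
//      if (len & 1)  move(&dst, &src, 1);      // odd byte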
// Handle short reverse operands.
//      cr = length in bits 26-31

        add     rs,rs,rc                // adjust ptrs for reverse move
        bf      26,0f                   // 32 bytes to move?
        bf      27,1f                   // quadword to move?
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
// Long operands, use Altivec in most cases.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned
// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        bf      30,2f                   // halfword?
        bf      28,LFwdAligned          // doubleword?
// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as any operands this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//      r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs     = alignment unknown
//      rd     = 16-byte aligned
//      rc     = bytes remaining
//      w2     = low 4 bits of (rd-rs), used to check alignment
//      cr5    = beq if source is also 16-byte aligned
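// Since rd is already 16-byte aligned here, "source is 16-byte aligned" is the
// same as "rd and rs are relatively aligned", which is what w2 and cr5 test.
// A C sketch of the five-way dispatch (the *_copy names are illustrative
// stand-ins for the paths described above, not labels in this file):
//
//      if (len >= kVeryLong)                       // case 1: several pages
//          bigcopy(dst, src, len);
//      else if (len >= 128) {
//          if (((dst - src) & 0xF) == 0)           // case 2: lvx/stvx
//              vector_copy_aligned(dst, src, len);
//          else                                    // case 3: lvx/vperm/stvx
//              vector_copy_unaligned(dst, src, len);
//      } else {
//          if (((dst - src) & 0x7) == 0)           // case 4: ld/std
//              scalar_copy_doublewords(dst, src, len);
//          else                                    // case 5: lvx/vperm/stvx, 32-byte chunks
//              vector_copy_unaligned_short(dst, src, len);
//      }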
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                      // loop over 32-byte chunks
        vor     v1,v3,v3                // v1 <- v3
        mtspr   vrsave,rv               // restore bitmap of live vr's
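// The lvx/vperm/stvx idiom used by these loops has a direct expression in
// AltiVec C intrinsics.  A minimal sketch, assuming a source that is NOT
// 16-byte aligned, a 16-byte-aligned destination, and illustrative names
// (src, dst16, nquads); the same technique as above, not the code above:
//
//      #include <altivec.h>
//      vector unsigned char vp   = vec_lvsl(0, src);   // permute vector, as lvsl builds
//      vector unsigned char prev = vec_ld(0, src);     // aligned quadword holding src[0]
//      for (long i = 0; i < nquads; i++) {
//          vector unsigned char next = vec_ld(15, src);    // aligned qw holding src[15]
//          vec_st(vec_perm(prev, next, vp), 0, dst16);     // store src[0..15], aligned
//          prev = next;                                    // carry the look-ahead quadword
//          src += 16;  dst16 += 16;
//      }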
// Fewer than 128 bytes and doubleword aligned: use ld/std.

LFwdMedAligned:                         // loop over 32-byte chunks
// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//      cr5 = beq if source is 16-byte aligned
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//      ctr = number of 128-byte chunks to move
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//      c16,c32,c48 = loaded
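// The shift-and-mask instructions below split the length three ways; in C:
//
//      unsigned long chunks    = len >> 7;         // 128-byte chunks         -> ctr
//      unsigned long qw_left   = (len >> 4) & 7;   // leftover quadwords, 0-7 -> r0/cr0
//      unsigned long byte_left = len & 0xF;        // leftover bytes, 0-15    -> w3/cr6, cr7
//      // chunks*128 + qw_left*16 + byte_left == len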
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bgea--  cr1,_COMM_PAGE_BIGCOPY  // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        bdnz    LFwdLongUnaligned
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
// Forward, long, 16-byte aligned vector loop.

LFwdLongAligned:                        // loop over 128-byte chunks
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords (1-7)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
// Long, reverse moves.
//      cr5 = beq if relatively 16-byte aligned

        add     rd,rd,rc                // point to end of operands
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//      cr5 = beq if source also 16-byte aligned
// We set up many registers:
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
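// The reverse path runs from the high addresses down, so an overlapping move
// with dst above src never reads a byte it has already overwritten.  A minimal
// C sketch of the overall shape (names are illustrative, not labels below):
//
//      unsigned char *d = dst + len;                   // point just past the operands
//      const unsigned char *s = src + len;
//      unsigned long a = (unsigned long)d & 0xF;       // bytes to 16-byte align dest end
//      len -= a;
//      while (a--) *--d = *--s;                        // byte-align the destination end
//      while (len >= 64) {                             // 64-byte chunks, highest first
//          for (int i = 0; i < 64; i++) *--d = *--s;   //  (lvx/stvx in the loops below)
//          len -= 64;
//      }
//      while (len--) *--d = *--s;                      // leftover QWs and last 0-15 bytes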
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & 0xF), cr6 set on cr7
3:                                      // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.
//      ctr/cr1 = number of 64-byte chunks to move (may be 0)
//      r0/cr0 = leftover QWs to move
//      cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//      cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & 0xF), cr6 set on cr7
3:                                      // loop over 1-3 quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any

        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0,kCommPageMTCRF)