/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage. Note the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *   r0     = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3     = not used, as memcpy and memmove return 1st parameter as a value
 *   r4     = source ptr ("rs")
 *   r5     = count of bytes to move ("rc")
 *   r6     = "w1", "c16", or "cm17"
 *   r7     = "w2", "c32", or "cm33"
 *   r8     = "w3", "c48", or "cm49"
 *   r12    = destination ptr ("rd")
 *   v0     = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

#define kVeryLong   (128*1024)
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonical spot
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code
// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands
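
// Illustrative only, not part of the commpage code: a rough C sketch of the dispatch
// performed by the entry points above. bcopy() takes (src,dst,len) while memcpy() and
// memmove() take (dst,src,len), which is why the operands of the "sub" differ between
// the two entries; both compute (rd-rs). The helper names and the KSHORT_SKETCH
// placeholder (standing in for the real kShort cutoff defined earlier in this file)
// are hypothetical.
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      #define KSHORT_SKETCH 64        // placeholder only; the real cutoff is kShort
//
//      // hypothetical helpers corresponding to the short/long, forward/reverse paths:
//      void short_forward(void *d, const void *s, size_t n);
//      void short_reverse(void *d, const void *s, size_t n);
//      void long_forward(void *d, const void *s, size_t n);
//      void long_reverse(void *d, const void *s, size_t n);
//
//      void *memcpy_sketch(void *dst, const void *src, size_t len) {
//          // If dst lands inside [src, src+len), a forward copy would overwrite
//          // source bytes before they are read, so the move must run backwards.
//          int reverse = ((uintptr_t)dst - (uintptr_t)src) < (uintptr_t)len;
//          if (len < KSHORT_SKETCH) {
//              if (reverse) short_reverse(dst, src, len);
//              else         short_forward(dst, src, len);
//          } else {
//              if (reverse) long_reverse(dst, src, len);
//              else         long_forward(dst, src, len);
//          }
//          return dst;                 // memcpy/memmove return the destination
//      }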
// Handle short operands.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        blt--   cr1,LShortReverse
// Forward short operands. This is the most frequent case, so it is inline.

        bf      26,0f                   // 32-byte chunk to move?
        bf      27,1f                   // quadword to move?
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // skip if no odd byte
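
// Illustrative only: the short forward path peels off 32-, 16-, 8-, 4-, 2-, and
// 1-byte pieces according to the individual bits of the length (the bf tests on CR
// bits 26-31, where bit 31 is the 1s bit). A rough C equivalent with hypothetical
// names; the real code uses unrolled loads and stores rather than memcpy():
//
//      #include <stddef.h>
//      #include <string.h>
//
//      static void short_forward_sketch(unsigned char *d, const unsigned char *s, size_t len) {
//          // len is small here, so its low six bits describe all the work to do
//          if (len & 32) { memcpy(d, s, 32); d += 32; s += 32; }
//          if (len & 16) { memcpy(d, s, 16); d += 16; s += 16; }
//          if (len &  8) { memcpy(d, s,  8); d +=  8; s +=  8; }
//          if (len &  4) { memcpy(d, s,  4); d +=  4; s +=  4; }
//          if (len &  2) { memcpy(d, s,  2); d +=  2; s +=  2; }
//          if (len &  1) { *d = *s; }  // odd byte; the code returns via bflr 31 when this bit is clear
//      }
//
// The reverse variant below does the same work but starts at the ends of the
// operands and moves downward.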
// Handle short reverse operands.
//      cr = length in bits 26-31

        add     rs,rs,rc                // adjust ptrs for reverse move
        bf      26,0f                   // 32 bytes to move?
        bf      27,1f                   // quadword to move?
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        bf      30,4f                   // halfword to move?
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
// Long operands, use Altivec in most cases.
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned
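
// Illustrative only: the alignment bookkeeping above, written out in C. The variable
// names are hypothetical; the assembly keeps these values in w2/w3/w4, r0, and the
// condition registers rather than in named variables.
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      static void long_forward_setup_sketch(void *dst, const void *src, size_t len) {
//          size_t to_align  = (size_t)(-(uintptr_t)dst) & 0xF;   // bytes to 16-byte align dst (neg/andi.)
//          int    same_phase = (((uintptr_t)dst - (uintptr_t)src) & 0xF) == 0;  // relatively 16-byte aligned?
//          size_t after     = len - to_align;                    // length left once dst is aligned
//          size_t chunks    = after >> 7;                        // 128-byte cache-line chunks (may be 0)
//          (void)same_phase; (void)chunks;                       // used below to pick a copy loop
//      }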
// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        bf      30,2f                   // halfword?
        bf      28,LFwdAligned          // doubleword?
// Forward, destination is 16-byte aligned. There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops. This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks. This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks. This is the slowest case.
// (The C sketch following this comment outlines the same dispatch.)
// Registers at this point:
//  r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs = alignment unknown
//      rd = 16-byte aligned
//      rc = bytes remaining
//      w2 = low 4 bits of (rd-rs), used to check alignment
//     cr5 = beq if source is also 16-byte aligned
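
// Illustrative only: the five-way dispatch above as a C sketch. The helper names and
// KVERYLONG_SKETCH are hypothetical; the real cutoff is kVeryLong, and the real paths
// are the labeled loops below plus the commpage "bigcopy" routine. Because rd is
// already 16-byte aligned here, the code's relative-alignment tests on (rd-rs) reduce
// to alignment tests on the source address, which is what the sketch checks.
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      #define KVERYLONG_SKETCH (128*1024)     // mirrors kVeryLong above
//
//      // hypothetical stand-ins for the labeled loops below and the commpage bigcopy:
//      void bigcopy_sketch(void *d, const void *s, size_t n);
//      void vec_aligned_128_sketch(void *d, const void *s, size_t n);
//      void vec_vperm_128_sketch(void *d, const void *s, size_t n);
//      void ld_std_32_sketch(void *d, const void *s, size_t n);
//      void vec_vperm_32_sketch(void *d, const void *s, size_t n);
//
//      static void forward_dispatch_sketch(void *dst, const void *src, size_t len) {
//          int src16 = ((uintptr_t)src & 0xF) == 0;    // source 16-byte aligned?
//          int src8  = ((uintptr_t)src & 0x7) == 0;    // source 8-byte aligned?
//          if (len >= KVERYLONG_SKETCH)   bigcopy_sketch(dst, src, len);         // case 1
//          else if (len >= 128 && src16)  vec_aligned_128_sketch(dst, src, len); // case 2
//          else if (len >= 128)           vec_vperm_128_sketch(dst, src, len);   // case 3
//          else if (src8)                 ld_std_32_sketch(dst, src, len);       // case 4
//          else                           vec_vperm_32_sketch(dst, src, len);    // case 5
//      }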
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constants used in lvx
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.

1:                                      // loop over 32-byte chunks
        vor     v1,v3,v3                // v1 <- v3
        mtspr   vrsave,rv               // restore bitmap of live vr's
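
// Illustrative only: the lvsl/lvx/vperm technique used by the unaligned vector loops.
// lvx always loads from a 16-byte-aligned address (it ignores the low 4 bits of the
// effective address), so two neighboring aligned quadwords are loaded and vperm
// shifts the wanted bytes into place using the pattern produced by lvsl. A rough
// C/AltiVec sketch, assuming a PowerPC compiler with <altivec.h>; the function and
// variable names are hypothetical:
//
//      #include <altivec.h>
//
//      // Copy one quadword from a possibly misaligned source to an aligned destination.
//      static void vperm_copy_qw_sketch(unsigned char *dst16, const unsigned char *src) {
//          vector unsigned char vp = vec_lvsl(0, src);      // permute pattern from src's low 4 bits
//          vector unsigned char v1 = vec_ld(0, src);        // aligned quadword covering src
//          vector unsigned char v2 = vec_ld(16, src);       // next aligned quadword
//          vector unsigned char vw = vec_perm(v1, v2, vp);  // the 16 bytes starting exactly at src
//          vec_st(vw, 0, dst16);                            // dst16 must be 16-byte aligned
//      }
//
// The real loops software-pipeline this: each iteration reuses the last quadword it
// loaded as the next iteration's "v1" (the "vor v1,v3,v3" above), so every source
// quadword is loaded only once.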
// Fewer than 128 bytes and doubleword aligned: use ld/std.

LFwdMedAligned:                         // loop over 32-byte chunks
// Forward, 128 bytes or more: use vectors. When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
//     c16,c32,c48 = loaded
// (See the C sketch after this comment for the chunk/leftover arithmetic.)
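
// Illustrative only: how the length decomposes into 128-byte chunks, leftover
// quadwords, and leftover bytes (computed by the srwi and rlwinm instructions in the
// surrounding code). Variable names are hypothetical:
//
//      #include <stddef.h>
//
//      static void forward_vector_counts_sketch(size_t len) {
//          size_t chunks      = len >> 7;          // 128-byte chunks (already in r0/ctr)
//          size_t leftover_qw = (len >> 4) & 7;    // 0-7 quadwords: rlwinm. r0,rc,28,29,31
//          size_t leftover_b  = len & 0xF;         // 0-15 bytes, finished by LShort16
//          (void)chunks; (void)leftover_qw; (void)leftover_b;
//      }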
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bgea--  cr1,_COMM_PAGE_BIGCOPY  // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        bdnz    LFwdLongUnaligned
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
// Forward, long, 16-byte aligned vector loop.

LFwdLongAligned:                        // loop over 128-byte chunks
        beq     4f                      // no leftover quadwords
3:                                      // loop over remaining quadwords (1-7)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
// Long, reverse moves.
//     cr5 = beq if relatively 16-byte aligned

        add     rd,rd,rc                // point to end of operands
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
// Prepare for reverse vector loop. When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
// (See the C sketch after this comment for the reverse chunk/leftover arithmetic.)
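
// Illustrative only: the reverse path walks down from the ends of the operands and
// uses 64-byte chunks, so the leftover quadword count is 0-3 rather than 0-7. A rough
// C sketch with hypothetical names; the real code does the quadword and byte moves
// with lvx/stvx and the LShortReverse16 code:
//
//      #include <stddef.h>
//
//      static void reverse_copy_counts_sketch(unsigned char *dst_end, const unsigned char *src_end, size_t len) {
//          size_t chunks      = len >> 6;          // 64-byte chunks (srwi r0,rc,6)
//          size_t leftover_qw = (len >> 4) & 3;    // 0-3 quadwords: rlwinm. r0,rc,28,30,31
//          size_t leftover_b  = len & 0xF;         // 0-15 bytes, finished by LShortReverse16
//          // pointers start just past the operands and are pre-decremented as bytes are moved
//          (void)dst_end; (void)src_end;
//          (void)chunks; (void)leftover_qw; (void)leftover_b;
//      }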
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx
        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
3:                                      // loop over remaining quadwords (1-3)
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave

        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
3:                                      // loop over 1-3 quadwords
        vor     v1,v2,v2                // v1 <- v2
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0,kCommPageMTCRF)