2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
22 /* ====================================
23 * Very Long Operand BCOPY for Mac OS X
24 * ====================================
26 * Version of 2/21/2004, tuned for the IBM 970. This is for operands at
27 * least several pages long. It is called from bcopy()/memcpy()/memmove(),
28 * and runs both in 32 and 64-bit mode.
30 * We use the following additional strategies not used by the shorter
31 * operand paths. Mostly, we try to optimize for memory bandwidth:
32 * 1. Use DCBZ128 to avoid reading destination lines. Because this code
33 * resides on the commpage, it can use a private interface with the
34 * kernel to minimize alignment exceptions if the destination is
35 * uncached. The kernel will clear cr7 whenever it emulates a DCBZ or
36 * DCBZ128 on the commpage. Thus we take at most one exception per call,
37 * which is amortized across the very long operand.
38 * 2. Copy larger chunks per iteration to minimize R/W bus turnaround
39 * and maximize DRAM page locality (opening a new page is expensive.)
40 * We use 256-byte chunks.
41 * 3. Touch in one source chunk ahead with DCBT. This is probably the
42 * least important change, and probably only helps restart the
43 * hardware stream at the start of each source page.
60 #define rv r12 // vrsave: holds the caller's VRSAVE from the mfspr that loads it until it is written back at exit
// NB: r12 is still live as an input when the routine is entered (it is negated
// to test destination alignment), so rv is only meaningful once the mfspr has
// loaded it.
62 // Offsets within the "red zone" (which is 224 bytes long):
76 #include <sys/appleapiopts.h>
78 #include <machine/cpu_capabilities.h>
79 #include <machine/commpage.h>
83 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
84 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
85 * simple transformations:
86 * - all word compares are changed to doubleword
87 * - all "srwi[.]" opcodes are changed to "srdi[.]"
88 * Nothing else is done. For this to work, the following rules must be
90 * - do not use carry or overflow
91 * - only use record mode if you are sure the results are mode-invariant
92 * for example, all "andi." and almost all "rlwinm." are fine
93 * - do not use "slwi", "slw", or "srw"
94 * An imaginative programmer could break the porting model in other ways, but the above
95 * are the most likely problem areas. It is perhaps surprising how well in practice
96 * this simple method works.
99 // Entry point. This is a subroutine of bcopy(). When called:
100 // r0 = return address (also stored in caller's SF)
102 // r5 = length (at least several pages)
105 // We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles
110 neg r2,r12 // r2 = -r12; its low 7 bits are the distance up to the next 128-byte boundary (is destination cache-line-aligned?)
111 std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy(); it is reloaded from rzR3 on the way out
112 std r13,rzR13(r1) // spill non-volatile regs we use to redzone
115 andi. r2,r2,0x7F // r2 = #bytes to align (0..127); record form, so the beq below tests "r2 == 0"
117 mr rs,r4 // copy parameters into nonvolatile registers (the source ptr arrives in r4)
120 mr rx,r0 // also save return address (passed in r0), since the branch-and-link calls to memcpy below replace LR
121 beq 1f // skip if already aligned
123 // Cache-line-align destination by handing the first r2 bytes to the commpage memcpy().
125 mr r3,rd // set up dest ptr for memcpy()
126 mr r5,r2 // number of bytes to copy
127 add rs,rs,r2 // then bump our parameters past initial copy
130 bla _COMM_PAGE_MEMCPY // 128-byte-align destination
133 // Load constant offsets and check whether source is 16-byte aligned.
134 // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
135 // and we dcbz only if cr7 beq is set.
138 dcbt 0,rs // touch in 1st line of source
139 andi. r0,rs,15 // check source alignment: the result is zero iff rs is 16-byte aligned (tested by the beq to LalignedLoop below)
140 mfspr rv,vrsave // save caller's bitmask; from here on r12 is rv
141 li c16,16 // load the constant offsets for x-form ops
143 srwi r2,rc,8 // get number of 256-byte chunks to xfer (rc / 256); rewritten to srdi by the 64-bit port
144 li r0,-256 // we use 24 VRs (ie, 0-23): -256 is 0xFFFFFF00, i.e. the top 24 bits set
148 or r0,r0,rv // add our bits to caller's
150 mtctr r2 // set up loop count: CTR = number of 256-byte chunks, consumed by the bdnz in each loop
152 cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128 (a register always compares equal to itself)
153 mtspr vrsave,r0 // say we use vr0..vr23
156 beq LalignedLoop // handle aligned sources
159 // Set up for unaligned loop.
161 lvsl v0,0,rs // get permute vector for left shift
162 lvxl v1,0,rs // prime the loop: v1 = first source quadword
163 li r0,rzV20 // save non-volatile VRs in redzone (r0 = offset of the v20 slot)
171 b LunalignedLoop // enter unaligned loop
174 // Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines).
175 // Destination is 128-byte aligned, source is unaligned.
// r2 is scratch here: it first addresses the second half of the source chunk,
// then the second half of the destination chunk.
179 dcbt c256,rs // touch in next chunk
181 addi r2,rs,128 // point to 2nd 128 bytes of source
190 addi rs,rs,256 // point to next source chunk
201 lvxl v1,0,rs // peek ahead at first source quad in next chunk (rs was already advanced above)
203 addi r2,rd,128 // point to 2nd 128 bytes of dest
204 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel; "--" hints that the branch is rarely taken
225 addi rd,rd,256 // point to next dest chunk
237 bdnz++ LunalignedLoop // loop if another 256 bytes to go (decrements the chunk count in CTR)
239 li r6,rzV20 // restore non-volatile VRs (r6 = offset of the v20 slot in the redzone)
250 // Aligned loop. Destination is 128-byte aligned, and source is 16-byte
251 // aligned. Loop over 256-byte chunks (2 cache lines.)
// r2 is scratch here: it first addresses the second half of the source chunk,
// then the second half of the destination chunk.
255 dcbt c256,rs // touch in next chunk
257 addi r2,rs,128 // point to 2nd 128 bytes of source
274 addi r2,rd,128 // point to 2nd 128 bytes of dest
275 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel; "--" hints that the branch is rarely taken
279 addi rs,rs,256 // point to next source chunk
288 addi rd,rd,256 // point to next dest chunk
297 bdnz++ LalignedLoop // loop if another 256 bytes to go (decrements the chunk count in CTR)
300 // Done, except for 0..255 leftover bytes at end.
303 // rc = remaining count in low 8 bits
304 // rv = caller's vrsave
305 // rx = caller's return address
308 andi. r5,rc,0xFF // any leftover bytes? (0..255); r5 becomes the length passed to memcpy
309 mtspr vrsave,rv // restore bitmap of live vr's
313 bnela _COMM_PAGE_MEMCPY // copy leftover bytes; conditional call, made only when the andi. above gave r5 != 0
315 mtlr rx // restore return address (LR may have been replaced by the memcpy calls)
316 ld r3,rzR3(r1) // restore non-volatile GPRs from redzone, starting with the caller's r3 saved at entry
// Commpage descriptor for bigcopy_970, naming _COMM_PAGE_BIGCOPY as its address.
// NOTE(review): kPort32to64 presumably requests the 32->64-bit port described in
// the WARNING comment earlier in this file, and kCommPageBoth presumably places
// the routine in both the 32- and 64-bit commpages; the two 0 arguments are not
// explained here -- confirm all of these against <machine/commpage.h>.
324 COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)