2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
28 /* ====================================
29 * Very Long Operand BCOPY for Mac OS X
30 * ====================================
32 * Version of 2/21/2004, tuned for the IBM 970. This is for operands at
33 * least several pages long. It is called from bcopy()/memcpy()/memmove(),
34 * and runs both in 32 and 64-bit mode.
36 * We use the following additional strategies not used by the shorter
37 * operand paths. Mostly, we try to optimize for memory bandwidth:
38 * 1. Use DCBZ128 to avoid reading destination lines. Because this code
39 * resides on the commpage, it can use a private interface with the
40 * kernel to minimize alignment exceptions if the destination is
41 * uncached. The kernel will clear cr7 whenever it emulates a DCBZ or
42 * DCBZ128 on the commpage. Thus we take at most one exception per call,
43 * which is amortized across the very long operand.
44 * 2. Copy larger chunks per iteration to minimize R/W bus turnaround
45 * and maximize DRAM page locality (opening a new page is expensive.)
46 * We use 256-byte chunks.
47 * 3. Touch in one source chunk ahead with DCBT. This is probably the
48 * least important change, and probably only helps restart the
49 * hardware stream at the start of each source page.
66 #define rv r12 // caller's vrsave: read from the SPR during setup, written back before we return
68 // Offsets within the "red zone" (which is 224 bytes long):
82 #include <sys/appleapiopts.h>
84 #include <machine/cpu_capabilities.h>
85 #include <machine/commpage.h>
89 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
90 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
91 * simple transformations:
92 * - all word compares are changed to doubleword
93 * - all "srwi[.]" opcodes are changed to "srdi[.]"
94 * Nothing else is done. For this to work, the following rules must be followed:
96 * - do not use carry or overflow
97 * - only use record mode if you are sure the results are mode-invariant
98 * for example, all "andi." and almost all "rlwinm." are fine
99 * - do not use "slwi", "slw", or "srw"
100 * An imaginative programmer could break the porting model in other ways, but the above
101 * are the most likely problem areas. It is perhaps surprising how well in practice
102 * this simple method works.
105 // Entry point: forward copy of a very long operand in 256-byte chunks. This is a subroutine of bcopy(). When called:
106 // r0 = return address (also stored in caller's SF)
// r4 = source ptr (copied into rs below)
108 // r5 = length (at least several pages)
// r12 = dest ptr (tested for cache-line alignment below)
111 // We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles
// and the caller's r3 restored from the red zone.
116 neg r2,r12 // r2 = -dest; its low 7 bits (masked below) are the distance to the next 128-byte boundary
117 std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy()
118 std r13,rzR13(r1) // spill non-volatile regs we use to redzone
121 andi. r2,r2,0x7F // #bytes to align (0 if dest is already 128-byte aligned); record form sets cr0 for the beq below
123 mr rs,r4 // copy parameters into nonvolatile registers (rs = source ptr)
126 mr rx,r0 // also save return address
127 beq 1f // skip if already aligned
129 // Cache-line-align destination by copying the first r2 bytes with the commpage memcpy().
131 mr r3,rd // set up dest ptr for memcpy()
132 mr r5,r2 // number of bytes to copy
133 add rs,rs,r2 // then bump our parameters past initial copy
136 bla _COMM_PAGE_MEMCPY // 128-byte-align destination
139 // Load constant offsets and check whether source is 16-byte aligned.
140 // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
141 // and we dcbz only if cr7 beq is set.
144 dcbt 0,rs // touch in 1st line of source
145 andi. r0,rs,15 // check source alignment (cr0 eq => source is 16-byte aligned)
146 mfspr rv,vrsave // save caller's bitmask
147 li c16,16 // load the constant offsets for x-form ops
149 srwi r2,rc,8 // get number of 256-byte chunks to xfer (the 64-bit port turns this into srdi)
150 li r0,-256 // 0xFFFFFF00: vrsave mask for the 24 VRs we use (ie, 0-23)
154 or r0,r0,rv // add our bits to caller's
156 mtctr r2 // set up loop count
158 cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128 (a register always compares equal to itself)
159 mtspr vrsave,r0 // say we use vr0..vr23
162 beq LalignedLoop // handle aligned sources (tests the source-alignment andi. above)
165 // Set up for unaligned loop.
167 lvsl v0,0,rs // get permute vector for left shift
168 lvxl v1,0,rs // prime the loop: v1 = first source quad
169 li r0,rzV20 // save non-volatile VRs in redzone
177 b LunalignedLoop // enter unaligned loop
180 // Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines).
181 // Destination is 128-byte aligned, source is unaligned.
185 dcbt c256,rs // touch in next chunk
187 addi r2,rs,128 // point to 2nd 128 bytes of source
196 addi rs,rs,256 // point to next source chunk
207 lvxl v1,0,rs // peek ahead at first source quad in next chunk
209 addi r2,rd,128 // point to 2nd 128 bytes of dest
210 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
231 addi rd,rd,256 // point to next dest chunk
243 bdnz++ LunalignedLoop // loop if another 256 bytes to go
245 li r6,rzV20 // restore non-volatile VRs
256 // Aligned loop. Destination is 128-byte aligned, and source is 16-byte
257 // aligned. Loop over 256-byte chunks (2 cache lines.)
261 dcbt c256,rs // touch in next chunk
263 addi r2,rs,128 // point to 2nd 128 bytes of source
280 addi r2,rd,128 // point to 2nd 128 bytes of dest
281 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
285 addi rs,rs,256 // point to next source chunk
294 addi rd,rd,256 // point to next dest chunk
303 bdnz++ LalignedLoop // loop if another 256 bytes to go
306 // Done, except for 0..255 leftover bytes at end.
309 // rc = remaining count in low 8 bits (masked with 0xFF below)
310 // rv = caller's vrsave
311 // rx = caller's return address
314 andi. r5,rc,0xFF // any leftover bytes? (0..255); sets cr0 for the conditional call below
315 mtspr vrsave,rv // restore bitmap of live vr's
319 bnela _COMM_PAGE_MEMCPY // copy leftover bytes (call is taken only when r5 != 0)
321 mtlr rx // restore return address
322 ld r3,rzR3(r1) // restore non-volatile GPRs from redzone
// Commpage descriptor for bigcopy_970, naming _COMM_PAGE_BIGCOPY as its commpage address.
// NOTE(review): kPort32to64 presumably requests the 32->64-bit port described in the WARNING
// comment earlier in this file, and kCommPageBoth presumably selects both the 32- and 64-bit
// commpages -- confirm against machine/commpage.h.
330 COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)