2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
30 /* ====================================
31 * Very Long Operand BCOPY for Mac OS X
32 * ====================================
34 * Version of 2/21/2004, tuned for the IBM 970. This is for operands at
35 * least several pages long. It is called from bcopy()/memcpy()/memmove(),
36 * and runs both in 32 and 64-bit mode.
38 * We use the following additional strategies not used by the shorter
39 * operand paths. Mostly, we try to optimize for memory bandwidth:
40 * 1. Use DCBZ128 to avoid reading destination lines. Because this code
41 * resides on the commpage, it can use a private interface with the
42 * kernel to minimize alignment exceptions if the destination is
43 * uncached. The kernel will clear cr7 whenever it emulates a DCBZ or
44 * DCBZ128 on the commpage. Thus we take at most one exception per call,
45 * which is amortized across the very long operand.
46 * 2. Copy larger chunks per iteration to minimize R/W bus turnaround
47 * and maximize DRAM page locality (opening a new page is expensive.)
48 * We use 256-byte chunks.
49 * 3. Touch in one source chunk ahead with DCBT. This is probably the
50 * least important change, and probably only helps restart the
51 * hardware stream at the start of each source page.
// NOTE(review): rv is the same physical register (r12) that the entry code
// reads as the destination pointer ("neg r2,r12"), so the destination has to
// be copied out of r12 before "mfspr rv,vrsave" overwrites it.
68 #define rv r12 // vrsave
70 // Offsets within the "red zone" (which is 224 bytes long):
84 #include <sys/appleapiopts.h>
86 #include <machine/cpu_capabilities.h>
87 #include <machine/commpage.h>
91 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
92 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
93 * simple transformations:
94 * - all word compares are changed to doubleword
95 * - all "srwi[.]" opcodes are changed to "srdi[.]"
96 * Nothing else is done. For this to work, the following rules must be followed:
98 * - do not use carry or overflow
99 * - only use record mode if you are sure the results are mode-invariant
100 * for example, all "andi." and almost all "rlwinm." are fine
101 * - do not use "slwi", "slw", or "srw"
102 * An imaginative programmer could break the porting model in other ways, but the above
103 * are the most likely problem areas. It is perhaps surprising how well in practice
104 * this simple method works.
107 // Entry point. This is a subroutine of bcopy(). When called:
108 // r0 = return address (also stored in caller's SF)
110 // r5 = length (at least several pages)
113 // We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles and r3 preserved.
// Entry and prologue: work out how many bytes separate the destination from
// the next 128-byte boundary, spill registers to the red zone, move the
// arguments into working registers, and (if needed) copy that short prefix
// with the commpage memcpy.
// NOTE(review): this listing skips lines, so only the visible instructions
// are described below.
118 neg r2,r12 // is destination cache-line-aligned?
119 std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy()
120 std r13,rzR13(r1) // spill non-volatile regs we use to redzone
// r2 = (-r12) & 0x7F, i.e. the byte count up to the next 128-byte boundary,
// or 0 when r12 is already aligned. The record form sets cr0, which the
// "beq 1f" below consumes (std and mr do not alter cr0).
123 andi. r2,r2,0x7F // #bytes to align
125 mr rs,r4 // copy parameters into nonvolatile registers
// The "bla" below (branch and link) overwrites LR, so the return address
// passed in r0 is parked in rx until the final "mtlr rx".
128 mr rx,r0 // also save return address
129 beq 1f // skip if already aligned
131 // Cache-line-align destination.
// Call memcpy with r3 = destination and r5 = the r2 prefix length computed
// above. NOTE(review): r4 is presumably still the source pointer from entry;
// the matching adjustments of the other working registers are not visible
// in this listing.
133 mr r3,rd // set up dest ptr for memcpy()
134 mr r5,r2 // number of bytes to copy
135 add rs,rs,r2 // then bump our parameters past initial copy
138 bla _COMM_PAGE_MEMCPY // 128-byte-align destination
// Setup before the main loops: prefetch the first source line, test source
// alignment, publish our vector-register usage in VRSAVE, load the chunk
// count into CTR, arm cr7 for the dcbz128 protocol, then dispatch to the
// aligned or unaligned loop.
141 // Load constant offsets and check whether source is 16-byte aligned.
142 // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
143 // and we dcbz only if cr7 beq is set.
146 dcbt 0,rs // touch in 1st line of source
// cr0 from this test is what "beq LalignedLoop" below branches on. None of
// the visible instructions in between writes cr0 (the cmpd targets cr7).
147 andi. r0,rs,15 // check source alignment
// rv is r12 (see its define), so from here on r12 holds the caller's VRSAVE.
148 mfspr rv,vrsave // save caller's bitmask
149 li c16,16 // load the constant offsets for x-form ops
// r2 = rc / 256. Per the porting WARNING above, the kernel rewrites srwi to
// srdi for the 64-bit commpage.
151 srwi r2,rc,8 // get number of 256-byte chunks to xfer
// -256 is 0xFFFFFF00: the 24 most-significant VRSAVE bits, one per register
// v0 through v23.
152 li r0,-256 // we use 24 VRs (ie, 0-23)
156 or r0,r0,rv // add our bits to caller's
158 mtctr r2 // set up loop count
// Comparing a register with itself always yields "equal", so cr7 EQ starts
// out set; the loops test it before issuing their dcbz128s.
160 cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128
161 mtspr vrsave,r0 // say we use vr0..vr23
164 beq LalignedLoop // handle aligned sources
167 // Set up for unaligned loop.
// v0 = permute control derived from the low 4 bits of rs; v1 = the aligned
// quadword containing rs (lvx-family loads ignore the low 4 address bits).
169 lvsl v0,0,rs // get permute vector for left shift
170 lvxl v1,0,rs // prime the loop
// NOTE(review): r0 is loaded with the red-zone offset rzV20; the vector
// stores that use it are not visible in this listing.
171 li r0,rzV20 // save non-volatile VRs in redzone
179 b LunalignedLoop // enter unaligned loop
182 // Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines).
183 // Destination is 128-byte aligned, source is unaligned.
// NOTE(review): only the address bookkeeping and branches of the loop are
// visible here; the vector loads, permutes, dcbz128s and stores sit on lines
// that this listing skips.
// NOTE(review): c256 presumably holds the constant 256 (its load is not
// visible), which would make this a prefetch one chunk ahead of rs.
187 dcbt c256,rs // touch in next chunk
189 addi r2,rs,128 // point to 2nd 128 bytes of source
198 addi rs,rs,256 // point to next source chunk
// rs already points at the next chunk here, so v1 is primed for the next
// iteration, mirroring the "prime the loop" load done before entry.
209 lvxl v1,0,rs // peek ahead at first source quad in next chunk
211 addi r2,rd,128 // point to 2nd 128 bytes of dest
// cr7 EQ was set before the loop and is cleared by the kernel if it had to
// emulate a dcbz128. The "--" suffix is a static hint: predict not taken.
212 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
233 addi rd,rd,256 // point to next dest chunk
// bdnz decrements CTR (the 256-byte chunk count loaded by mtctr) and loops
// while it is nonzero. The "++" suffix is a static hint: predict taken.
245 bdnz++ LunalignedLoop // loop if another 256 bytes to go
// Reached when CTR hits zero. NOTE(review): r6 receives the red-zone offset
// rzV20; the vector reloads that use it are not visible in this listing.
247 li r6,rzV20 // restore non-volatile VRs
258 // Aligned loop. Destination is 128-byte aligned, and source is 16-byte
259 // aligned. Loop over 256-byte chunks (2 cache lines.)
// NOTE(review): only the address bookkeeping and branches of the loop are
// visible here; the vector loads, dcbz128s and stores sit on lines that this
// listing skips.
// NOTE(review): c256 presumably holds the constant 256 (its load is not
// visible), which would make this a prefetch one chunk ahead of rs.
263 dcbt c256,rs // touch in next chunk
265 addi r2,rs,128 // point to 2nd 128 bytes of source
// r2 is reused: it now addresses the second destination cache line.
282 addi r2,rd,128 // point to 2nd 128 bytes of dest
// cr7 EQ was set before the loop and is cleared by the kernel if it had to
// emulate a dcbz128. The "--" suffix is a static hint: predict not taken.
283 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
287 addi rs,rs,256 // point to next source chunk
296 addi rd,rd,256 // point to next dest chunk
// bdnz decrements CTR (the 256-byte chunk count loaded by mtctr) and loops
// while it is nonzero. The "++" suffix is a static hint: predict taken.
305 bdnz++ LalignedLoop // loop if another 256 bytes to go
308 // Done, except for 0..255 leftover bytes at end.
311 // rc = remaining count in low 8 bits
312 // rv = caller's vrsave
313 // rx = caller's return address
// Tail: copy whatever is left after the whole 256-byte chunks, then restore
// state and return.
// r5 = rc & 0xFF, the 0..255 bytes not covered by the chunk loops (the mask
// is 8 bits wide). The record form sets cr0 for the conditional call below;
// the intervening mtspr does not alter cr0.
316 andi. r5,rc,0xFF // any leftover bytes? (0..255)
317 mtspr vrsave,rv // restore bitmap of live vr's
// Branch-and-link (absolute) to memcpy only when r5 is nonzero.
// NOTE(review): the r3/r4 setup for this call is not visible in this listing.
321 bnela _COMM_PAGE_MEMCPY // copy leftover bytes
// LR was overwritten by the memcpy calls; rx still holds the address passed
// in r0 at entry.
323 mtlr rx // restore return address
// Reloads the r3 value spilled to the red zone at entry.
324 ld r3,rzR3(r1) // restore non-volatile GPRs from redzone
// NOTE(review): presumably registers this routine for the commpage slot
// _COMM_PAGE_BIGCOPY, with kPort32to64 requesting the 32-to-64-bit port
// described in the WARNING comment above; confirm against the macro in
// machine/commpage.h.
332 COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)