/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 2/21/2004, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove(),
 * and runs both in 32 and 64-bit mode.
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth:
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive).
 *     We use 256-byte chunks.
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 */
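
// Rough arithmetic for the above: a 1 MB operand runs the main loop
// 1 MB / 256 = 4096 times, issuing two dcbz128's per iteration (unless the
// kernel has cleared cr7), while any dcbz alignment exception is taken at
// most once per call.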

#define rs      r13
#define rd      r14
#define rc      r15
#define rx      r16

#define c16     r3
#define c32     r4
#define c48     r5
#define c64     r6
#define c80     r7
#define c96     r8
#define c112    r9
#define c256    r10
#define c384    r11
#define rv      r12     // vrsave

// Offsets within the "red zone" (which is 224 bytes long):

#define rzR3    -8
#define rzR13   -16
#define rzR14   -24
#define rzR15   -32
#define rzR16   -40

#define rzV20   -64
#define rzV21   -80
#define rzV22   -96
#define rzV23   -112
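
// These offsets are negative displacements from the stack pointer (r1): the
// routine spills registers into the red zone below r1 rather than allocating
// a stack frame.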

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
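
// For example, the "srwi r2,rc,8" below (number of 256-byte chunks) is one of
// the opcodes rewritten by this port: in the 64-bit commpage it becomes
// "srdi r2,rc,8", so the chunk count is taken from the full 64-bit length.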

// Entry point.  This is a subroutine of bcopy().  When called:
//      r0  = return address (also stored in caller's SF)
//      r4  = source ptr
//      r5  = length (at least several pages)
//      r12 = dest ptr
//
// We only do "forward" moves, ie non-overlapping or toward 0.  We return with
// non-volatiles and r3 preserved.

        .align  5
bigcopy_970:
        neg     r2,r12              // is destination cache-line-aligned?
        std     r3,rzR3(r1)         // save caller's r3, which must be preserved for memcpy()
        std     r13,rzR13(r1)       // spill non-volatile regs we use to redzone
        std     r14,rzR14(r1)
        std     r15,rzR15(r1)
        andi.   r2,r2,0x7F          // #bytes to align
        std     r16,rzR16(r1)
        mr      rs,r4               // copy parameters into nonvolatile registers
        mr      rd,r12
        mr      rc,r5
        mr      rx,r0               // also save return address
        beq     1f                  // skip if already aligned

// Cache-line-align destination.

        mr      r3,rd               // set up dest ptr for memcpy()
        mr      r5,r2               // number of bytes to copy
        add     rs,rs,r2            // then bump our parameters past initial copy
        add     rd,rd,r2
        sub     rc,rc,r2
        bla     _COMM_PAGE_MEMCPY   // 128-byte-align destination
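
// Note that "bla" is an absolute branch-and-link into the commpage, so it
// clobbers LR; that is why the return address was copied into rx above and is
// restored with "mtlr rx" before the final blr.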


// Load constant offsets and check whether source is 16-byte aligned.
// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only if cr7 beq is set.

1:
        dcbt    0,rs                // touch in 1st line of source
        andi.   r0,rs,15            // check source alignment
        mfspr   rv,vrsave           // save caller's bitmask
        li      c16,16              // load the constant offsets for x-form ops
        li      c32,32
        srwi    r2,rc,8             // get number of 256-byte chunks to xfer
        li      r0,-256             // we use 24 VRs (ie, 0-23)
        li      c48,48
        li      c64,64
        li      c80,80
        or      r0,r0,rv            // add our bits to caller's
        li      c96,96
        mtctr   r2                  // set up loop count
        li      c112,112
        cmpd    cr7,r2,r2           // initialize cr7_eq to "on", so we dcbz128
        mtspr   vrsave,r0           // say we use vr0..vr23
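// (li r0,-256 leaves 0xFFFFFF00 in the low word; since the most significant
// bit of vrsave corresponds to v0, those top 24 bits mark v0..v23 as live,
// OR'd with the caller's own mask.)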
        li      c256,256
        li      c384,384
        beq     LalignedLoop        // handle aligned sources


// Set up for unaligned loop.

        lvsl    v0,0,rs             // get permute vector for left shift
        lvxl    v1,0,rs             // prime the loop
        li      r0,rzV20            // save non-volatile VRs in redzone
        stvx    v20,r1,r0
        li      r0,rzV21
        stvx    v21,r1,r0
        li      r0,rzV22
        stvx    v22,r1,r0
        li      r0,rzV23
        stvx    v23,r1,r0
        b       LunalignedLoop      // enter unaligned loop


// Main loop for unaligned operands.  We loop over 256-byte chunks (2 cache lines).
// Destination is 128-byte aligned, source is unaligned.

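// The loop software-pipelines the realignment: lvsl computed the permute
// vector v0 once, each vperm merges two adjacent source quadwords into one
// aligned destination quadword, and v1 carries the first quadword of the next
// chunk across iterations (the "peek ahead" load below).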
        .align  5
LunalignedLoop:
        dcbt    c256,rs             // touch in next chunk
        dcbt    c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        addi    rs,rs,256           // point to next source chunk
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        vperm   v17,v1,v2,v0
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        vperm   v18,v2,v3,v0
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        vperm   v19,v3,v4,v0
        lvxl    v16,c112,r2
        lvxl    v1,0,rs             // peek ahead at first source quad in next chunk
        vperm   v20,v4,v5,v0
        addi    r2,rd,128           // point to 2nd 128 bytes of dest
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd
        dcbz128 0,r2
1:
        vperm   v21,v5,v6,v0
        stvxl   v17,0,rd
        vperm   v22,v6,v7,v0
        stvxl   v18,c16,rd
        vperm   v23,v7,v8,v0
        stvxl   v19,c32,rd
        vperm   v17,v8,v9,v0
        stvxl   v20,c48,rd
        vperm   v18,v9,v10,v0
        stvxl   v21,c64,rd
        vperm   v19,v10,v11,v0
        stvxl   v22,c80,rd
        vperm   v20,v11,v12,v0
        stvxl   v23,c96,rd
        vperm   v21,v12,v13,v0
        stvxl   v17,c112,rd
        vperm   v22,v13,v14,v0
        addi    rd,rd,256           // point to next dest chunk
        stvxl   v18,0,r2
        vperm   v23,v14,v15,v0
        stvxl   v19,c16,r2
        vperm   v17,v15,v16,v0
        stvxl   v20,c32,r2
        vperm   v18,v16,v1,v0
        stvxl   v21,c48,r2
        stvxl   v22,c64,r2
        stvxl   v23,c80,r2
        stvxl   v17,c96,r2
        stvxl   v18,c112,r2
        bdnz++  LunalignedLoop      // loop if another 256 bytes to go

        li      r6,rzV20            // restore non-volatile VRs
        li      r7,rzV21
        li      r8,rzV22
        li      r9,rzV23
        lvx     v20,r1,r6
        lvx     v21,r1,r7
        lvx     v22,r1,r8
        lvx     v23,r1,r9
        b       Ldone


// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 256-byte chunks (2 cache lines).

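// With both pointers 16-byte aligned there is no need for the lvsl/vperm
// realignment used above; this loop simply streams sixteen lvxl/stvxl pairs
// per 256-byte chunk.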
        .align  5
LalignedLoop:
        dcbt    c256,rs             // touch in next chunk
        dcbt    c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl    v1,0,rs
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        lvxl    v16,c112,r2
        addi    r2,rd,128           // point to 2nd 128 bytes of dest
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd
        dcbz128 0,r2
1:
        addi    rs,rs,256           // point to next source chunk
        stvxl   v1,0,rd
        stvxl   v2,c16,rd
        stvxl   v3,c32,rd
        stvxl   v4,c48,rd
        stvxl   v5,c64,rd
        stvxl   v6,c80,rd
        stvxl   v7,c96,rd
        stvxl   v8,c112,rd
        addi    rd,rd,256           // point to next dest chunk
        stvxl   v9,0,r2
        stvxl   v10,c16,r2
        stvxl   v11,c32,r2
        stvxl   v12,c48,r2
        stvxl   v13,c64,r2
        stvxl   v14,c80,r2
        stvxl   v15,c96,r2
        stvxl   v16,c112,r2
        bdnz++  LalignedLoop        // loop if another 256 bytes to go


// Done, except for 0..255 leftover bytes at end.
//      rs = source ptr
//      rd = dest ptr
//      rc = remaining count in low 8 bits
//      rv = caller's vrsave
//      rx = caller's return address

Ldone:
        andi.   r5,rc,0xFF          // any leftover bytes? (0..255)
        mtspr   vrsave,rv           // restore bitmap of live vr's

        mr      r3,rd
        mr      r4,rs
        bnela   _COMM_PAGE_MEMCPY   // copy leftover bytes

        mtlr    rx                  // restore return address
        ld      r3,rzR3(r1)         // restore non-volatile GPRs from redzone
        ld      r13,rzR13(r1)
        ld      r14,rzR14(r1)
        ld      r15,rzR15(r1)
        ld      r16,rzR16(r1)
        blr


        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
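
// Presumably kPort32to64 asks the commpage-populating code to apply the
// 32-bit -> 64-bit opcode translation described in the WARNING comment above,
// and kCommPageBoth requests installation in both the 32- and 64-bit commpages.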