/*
 * Provenance (was web-viewer residue): apple/xnu.git — osfmk/ppc/commpage/bigcopy_970.s
 * blob c7b033a5ff06c61ecce0fe50b964cbbe2f3de0e3 (git.saurik.com mirror of Apple XNU)
 */
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the
10 * License may not be used to create, or enable the creation or
11 * redistribution of, unlawful or unlicensed copies of an Apple operating
12 * system, or to circumvent, violate, or enable the circumvention or
13 * violation of, any terms of an Apple operating system software license
14 * agreement.
15 *
16 * Please obtain a copy of the License at
17 * http://www.opensource.apple.com/apsl/ and read it before using this
18 * file.
19 *
20 * The Original Code and all software distributed under the License are
21 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
22 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
23 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
24 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
25 * Please see the License for the specific language governing rights and
26 * limitations under the License.
27 *
28 * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
29 */
30 /* ====================================
31 * Very Long Operand BCOPY for Mac OS X
32 * ====================================
33 *
34 * Version of 2/21/2004, tuned for the IBM 970. This is for operands at
35 * least several pages long. It is called from bcopy()/memcpy()/memmove(),
36 * and runs both in 32 and 64-bit mode.
37 *
38 * We use the following additional strategies not used by the shorter
39 * operand paths. Mostly, we try to optimize for memory bandwidth:
40 * 1. Use DCBZ128 to avoid reading destination lines. Because this code
41 *    resides on the commpage, it can use a private interface with the
42 * kernel to minimize alignment exceptions if the destination is
43 * uncached. The kernel will clear cr7 whenever it emulates a DCBZ or
44 * DCBZ128 on the commpage. Thus we take at most one exception per call,
45 * which is amortized across the very long operand.
46 * 2. Copy larger chunks per iteration to minimize R/W bus turnaround
47 * and maximize DRAM page locality (opening a new page is expensive.)
48 * We use 256-byte chunks.
49 * 3. Touch in one source chunk ahead with DCBT. This is probably the
50 * least important change, and probably only helps restart the
51 * hardware stream at the start of each source page.
52 */
53
54 #define rs r13
55 #define rd r14
56 #define rc r15
57 #define rx r16
58
59 #define c16 r3
60 #define c32 r4
61 #define c48 r5
62 #define c64 r6
63 #define c80 r7
64 #define c96 r8
65 #define c112 r9
66 #define c256 r10
67 #define c384 r11
68 #define rv r12 // vrsave
69
70 // Offsets within the "red zone" (which is 224 bytes long):
71
72 #define rzR3 -8
73 #define rzR13 -16
74 #define rzR14 -24
75 #define rzR15 -32
76 #define rzR16 -40
77
78 #define rzV20 -64
79 #define rzV21 -80
80 #define rzV22 -96
81 #define rzV23 -112
82
83
84 #include <sys/appleapiopts.h>
85 #include <ppc/asm.h>
86 #include <machine/cpu_capabilities.h>
87 #include <machine/commpage.h>
88
89 .text
90 /*
91 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
92 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
93 * simple transformations:
94 * - all word compares are changed to doubleword
95 * - all "srwi[.]" opcodes are changed to "srdi[.]"
96 * Nothing else is done. For this to work, the following rules must be
97 * carefully followed:
98 * - do not use carry or overflow
99 * - only use record mode if you are sure the results are mode-invariant
100 * for example, all "andi." and almost all "rlwinm." are fine
101 * - do not use "slwi", "slw", or "srw"
102 * An imaginative programmer could break the porting model in other ways, but the above
103 * are the most likely problem areas. It is perhaps surprising how well in practice
104 * this simple method works.
105 */
106
107 // Entry point. This is a subroutine of bcopy(). When called:
108 // r0 = return address (also stored in caller's SF)
109 // r4 = source ptr
110 // r5 = length (at least several pages)
111 // r12 = dest ptr
112 //
113 // We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles
114 // and r3 preserved.
115
116 .align 5
117 bigcopy_970:
118 neg r2,r12 // is destination cache-line-aligned?
119 std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy()
120 std r13,rzR13(r1) // spill non-volatile regs we use to redzone
121 std r14,rzR14(r1)
122 std r15,rzR15(r1)
123 andi. r2,r2,0x7F // #bytes to align
124 std r16,rzR16(r1)
125 mr rs,r4 // copy parameters into nonvolatile registers
126 mr rd,r12
127 mr rc,r5
128 mr rx,r0 // also save return address
129 beq 1f // skip if already aligned
130
131 // Cache-line-align destination.
132
133 mr r3,rd // set up dest ptr for memcpy()
134 mr r5,r2 // number of bytes to copy
135 add rs,rs,r2 // then bump our parameters past initial copy
136 add rd,rd,r2
137 sub rc,rc,r2
138 bla _COMM_PAGE_MEMCPY // 128-byte-align destination
139
140
141 // Load constant offsets and check whether source is 16-byte aligned.
142 // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
143 // and we dcbz only if cr7 beq is set.
144
145 1:
146 dcbt 0,rs // touch in 1st line of source
147 andi. r0,rs,15 // check source alignment
148 mfspr rv,vrsave // save caller's bitmask
149 li c16,16 // load the constant offsets for x-form ops
150 li c32,32
151 srwi r2,rc,8 // get number of 256-byte chunks to xfer
152 li r0,-256 // we use 24 VRs (ie, 0-23)
153 li c48,48
154 li c64,64
155 li c80,80
156 or r0,r0,rv // add our bits to caller's
157 li c96,96
158 mtctr r2 // set up loop count
159 li c112,112
160 cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128
161 mtspr vrsave,r0 // say we use vr0..vr23
162 li c256,256
163 li c384,384
164 beq LalignedLoop // handle aligned sources
165
166
167 // Set up for unaligned loop.
168
169 lvsl v0,0,rs // get permute vector for left shift
170 lvxl v1,0,rs // prime the loop
171 li r0,rzV20 // save non-volatile VRs in redzone
172 stvx v20,r1,r0
173 li r0,rzV21
174 stvx v21,r1,r0
175 li r0,rzV22
176 stvx v22,r1,r0
177 li r0,rzV23
178 stvx v23,r1,r0
179 b LunalignedLoop // enter unaligned loop
180
181
182 // Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines).
183 // Destination is 128-byte aligned, source is unaligned.
184
185 .align 5
186 LunalignedLoop:
187 dcbt c256,rs // touch in next chunk
188 dcbt c384,rs
189 addi r2,rs,128 // point to 2nd 128 bytes of source
190 lvxl v2,c16,rs
191 lvxl v3,c32,rs
192 lvxl v4,c48,rs
193 lvxl v5,c64,rs
194 lvxl v6,c80,rs
195 lvxl v7,c96,rs
196 lvxl v8,c112,rs
197 lvxl v9,0,r2
198 addi rs,rs,256 // point to next source chunk
199 lvxl v10,c16,r2
200 lvxl v11,c32,r2
201 vperm v17,v1,v2,v0
202 lvxl v12,c48,r2
203 lvxl v13,c64,r2
204 vperm v18,v2,v3,v0
205 lvxl v14,c80,r2
206 lvxl v15,c96,r2
207 vperm v19,v3,v4,v0
208 lvxl v16,c112,r2
209 lvxl v1,0,rs // peek ahead at first source quad in next chunk
210 vperm v20,v4,v5,v0
211 addi r2,rd,128 // point to 2nd 128 bytes of dest
212 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
213 dcbz128 0,rd
214 dcbz128 0,r2
215 1:
216 vperm v21,v5,v6,v0
217 stvxl v17,0,rd
218 vperm v22,v6,v7,v0
219 stvxl v18,c16,rd
220 vperm v23,v7,v8,v0
221 stvxl v19,c32,rd
222 vperm v17,v8,v9,v0
223 stvxl v20,c48,rd
224 vperm v18,v9,v10,v0
225 stvxl v21,c64,rd
226 vperm v19,v10,v11,v0
227 stvxl v22,c80,rd
228 vperm v20,v11,v12,v0
229 stvxl v23,c96,rd
230 vperm v21,v12,v13,v0
231 stvxl v17,c112,rd
232 vperm v22,v13,v14,v0
233 addi rd,rd,256 // point to next dest chunk
234 stvxl v18,0,r2
235 vperm v23,v14,v15,v0
236 stvxl v19,c16,r2
237 vperm v17,v15,v16,v0
238 stvxl v20,c32,r2
239 vperm v18,v16,v1,v0
240 stvxl v21,c48,r2
241 stvxl v22,c64,r2
242 stvxl v23,c80,r2
243 stvxl v17,c96,r2
244 stvxl v18,c112,r2
245 bdnz++ LunalignedLoop // loop if another 256 bytes to go
246
247 li r6,rzV20 // restore non-volatile VRs
248 li r7,rzV21
249 li r8,rzV22
250 li r9,rzV23
251 lvx v20,r1,r6
252 lvx v21,r1,r7
253 lvx v22,r1,r8
254 lvx v23,r1,r9
255 b Ldone
256
257
258 // Aligned loop. Destination is 128-byte aligned, and source is 16-byte
259 // aligned. Loop over 256-byte chunks (2 cache lines.)
260
261 .align 5
262 LalignedLoop:
263 dcbt c256,rs // touch in next chunk
264 dcbt c384,rs
265 addi r2,rs,128 // point to 2nd 128 bytes of source
266 lvxl v1,0,rs
267 lvxl v2,c16,rs
268 lvxl v3,c32,rs
269 lvxl v4,c48,rs
270 lvxl v5,c64,rs
271 lvxl v6,c80,rs
272 lvxl v7,c96,rs
273 lvxl v8,c112,rs
274 lvxl v9,0,r2
275 lvxl v10,c16,r2
276 lvxl v11,c32,r2
277 lvxl v12,c48,r2
278 lvxl v13,c64,r2
279 lvxl v14,c80,r2
280 lvxl v15,c96,r2
281 lvxl v16,c112,r2
282 addi r2,rd,128 // point to 2nd 128 bytes of dest
283 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
284 dcbz128 0,rd
285 dcbz128 0,r2
286 1:
287 addi rs,rs,256 // point to next source chunk
288 stvxl v1,0,rd
289 stvxl v2,c16,rd
290 stvxl v3,c32,rd
291 stvxl v4,c48,rd
292 stvxl v5,c64,rd
293 stvxl v6,c80,rd
294 stvxl v7,c96,rd
295 stvxl v8,c112,rd
296 addi rd,rd,256 // point to next dest chunk
297 stvxl v9,0,r2
298 stvxl v10,c16,r2
299 stvxl v11,c32,r2
300 stvxl v12,c48,r2
301 stvxl v13,c64,r2
302 stvxl v14,c80,r2
303 stvxl v15,c96,r2
304 stvxl v16,c112,r2
305 bdnz++ LalignedLoop // loop if another 256 bytes to go
306
307
308 // Done, except for 0..255 leftover bytes at end.
309 // rs = source ptr
310 // rd = dest ptr
311 // rc = remaining count in low 7 bits
312 // rv = caller's vrsave
313 // rx = caller's return address
314
315 Ldone:
316 andi. r5,rc,0xFF // any leftover bytes? (0..255)
317 mtspr vrsave,rv // restore bitmap of live vr's
318
319 mr r3,rd
320 mr r4,rs
321 bnela _COMM_PAGE_MEMCPY // copy leftover bytes
322
323 mtlr rx // restore return address
324 ld r3,rzR3(r1) // restore non-volatile GPRs from redzone
325 ld r13,rzR13(r1)
326 ld r14,rzR14(r1)
327 ld r15,rzR15(r1)
328 ld r16,rzR16(r1)
329 blr
330
331
332 COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
333