/* osfmk/ppc/commpage/bigcopy_970.s (Apple xnu-1228) */
/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 2/21/2004, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove(),
 * and runs both in 32 and 64-bit mode.
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth:
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *     We use 256-byte chunks.
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 */
51
52 #define rs r13
53 #define rd r14
54 #define rc r15
55 #define rx r16
56
57 #define c16 r3
58 #define c32 r4
59 #define c48 r5
60 #define c64 r6
61 #define c80 r7
62 #define c96 r8
63 #define c112 r9
64 #define c256 r10
65 #define c384 r11
66 #define rv r12 // vrsave
67
68 // Offsets within the "red zone" (which is 224 bytes long):
69
70 #define rzR3 -8
71 #define rzR13 -16
72 #define rzR14 -24
73 #define rzR15 -32
74 #define rzR16 -40
75
76 #define rzV20 -64
77 #define rzV21 -80
78 #define rzV22 -96
79 #define rzV23 -112
80
81
82 #include <sys/appleapiopts.h>
83 #include <ppc/asm.h>
84 #include <machine/cpu_capabilities.h>
85 #include <machine/commpage.h>
86
87 .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */

105 // Entry point. This is a subroutine of bcopy(). When called:
106 // r0 = return address (also stored in caller's SF)
107 // r4 = source ptr
108 // r5 = length (at least several pages)
109 // r12 = dest ptr
110 //
111 // We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles
112 // and r3 preserved.
113
114 .align 5
115 bigcopy_970:
116 neg r2,r12 // is destination cache-line-aligned?
117 std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy()
118 std r13,rzR13(r1) // spill non-volatile regs we use to redzone
119 std r14,rzR14(r1)
120 std r15,rzR15(r1)
121 andi. r2,r2,0x7F // #bytes to align
122 std r16,rzR16(r1)
123 mr rs,r4 // copy parameters into nonvolatile registers
124 mr rd,r12
125 mr rc,r5
126 mr rx,r0 // also save return address
127 beq 1f // skip if already aligned
128
129 // Cache-line-align destination.
130
131 mr r3,rd // set up dest ptr for memcpy()
132 mr r5,r2 // number of bytes to copy
133 add rs,rs,r2 // then bump our parameters past initial copy
134 add rd,rd,r2
135 sub rc,rc,r2
136 bla _COMM_PAGE_MEMCPY // 128-byte-align destination
137
138
139 // Load constant offsets and check whether source is 16-byte aligned.
140 // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
141 // and we dcbz only if cr7 beq is set.
142
143 1:
144 dcbt 0,rs // touch in 1st line of source
145 andi. r0,rs,15 // check source alignment
146 mfspr rv,vrsave // save caller's bitmask
147 li c16,16 // load the constant offsets for x-form ops
148 li c32,32
149 srwi r2,rc,8 // get number of 256-byte chunks to xfer
150 li r0,-256 // we use 24 VRs (ie, 0-23)
151 li c48,48
152 li c64,64
153 li c80,80
154 or r0,r0,rv // add our bits to caller's
155 li c96,96
156 mtctr r2 // set up loop count
157 li c112,112
158 cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128
159 mtspr vrsave,r0 // say we use vr0..vr23
160 li c256,256
161 li c384,384
162 beq LalignedLoop // handle aligned sources
163
164
165 // Set up for unaligned loop.
166
167 lvsl v0,0,rs // get permute vector for left shift
168 lvxl v1,0,rs // prime the loop
169 li r0,rzV20 // save non-volatile VRs in redzone
170 stvx v20,r1,r0
171 li r0,rzV21
172 stvx v21,r1,r0
173 li r0,rzV22
174 stvx v22,r1,r0
175 li r0,rzV23
176 stvx v23,r1,r0
177 b LunalignedLoop // enter unaligned loop
178
179
180 // Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines).
181 // Destination is 128-byte aligned, source is unaligned.
182
183 .align 5
184 LunalignedLoop:
185 dcbt c256,rs // touch in next chunk
186 dcbt c384,rs
187 addi r2,rs,128 // point to 2nd 128 bytes of source
188 lvxl v2,c16,rs
189 lvxl v3,c32,rs
190 lvxl v4,c48,rs
191 lvxl v5,c64,rs
192 lvxl v6,c80,rs
193 lvxl v7,c96,rs
194 lvxl v8,c112,rs
195 lvxl v9,0,r2
196 addi rs,rs,256 // point to next source chunk
197 lvxl v10,c16,r2
198 lvxl v11,c32,r2
199 vperm v17,v1,v2,v0
200 lvxl v12,c48,r2
201 lvxl v13,c64,r2
202 vperm v18,v2,v3,v0
203 lvxl v14,c80,r2
204 lvxl v15,c96,r2
205 vperm v19,v3,v4,v0
206 lvxl v16,c112,r2
207 lvxl v1,0,rs // peek ahead at first source quad in next chunk
208 vperm v20,v4,v5,v0
209 addi r2,rd,128 // point to 2nd 128 bytes of dest
210 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
211 dcbz128 0,rd
212 dcbz128 0,r2
213 1:
214 vperm v21,v5,v6,v0
215 stvxl v17,0,rd
216 vperm v22,v6,v7,v0
217 stvxl v18,c16,rd
218 vperm v23,v7,v8,v0
219 stvxl v19,c32,rd
220 vperm v17,v8,v9,v0
221 stvxl v20,c48,rd
222 vperm v18,v9,v10,v0
223 stvxl v21,c64,rd
224 vperm v19,v10,v11,v0
225 stvxl v22,c80,rd
226 vperm v20,v11,v12,v0
227 stvxl v23,c96,rd
228 vperm v21,v12,v13,v0
229 stvxl v17,c112,rd
230 vperm v22,v13,v14,v0
231 addi rd,rd,256 // point to next dest chunk
232 stvxl v18,0,r2
233 vperm v23,v14,v15,v0
234 stvxl v19,c16,r2
235 vperm v17,v15,v16,v0
236 stvxl v20,c32,r2
237 vperm v18,v16,v1,v0
238 stvxl v21,c48,r2
239 stvxl v22,c64,r2
240 stvxl v23,c80,r2
241 stvxl v17,c96,r2
242 stvxl v18,c112,r2
243 bdnz++ LunalignedLoop // loop if another 256 bytes to go
244
245 li r6,rzV20 // restore non-volatile VRs
246 li r7,rzV21
247 li r8,rzV22
248 li r9,rzV23
249 lvx v20,r1,r6
250 lvx v21,r1,r7
251 lvx v22,r1,r8
252 lvx v23,r1,r9
253 b Ldone
254
255
256 // Aligned loop. Destination is 128-byte aligned, and source is 16-byte
257 // aligned. Loop over 256-byte chunks (2 cache lines.)
258
259 .align 5
260 LalignedLoop:
261 dcbt c256,rs // touch in next chunk
262 dcbt c384,rs
263 addi r2,rs,128 // point to 2nd 128 bytes of source
264 lvxl v1,0,rs
265 lvxl v2,c16,rs
266 lvxl v3,c32,rs
267 lvxl v4,c48,rs
268 lvxl v5,c64,rs
269 lvxl v6,c80,rs
270 lvxl v7,c96,rs
271 lvxl v8,c112,rs
272 lvxl v9,0,r2
273 lvxl v10,c16,r2
274 lvxl v11,c32,r2
275 lvxl v12,c48,r2
276 lvxl v13,c64,r2
277 lvxl v14,c80,r2
278 lvxl v15,c96,r2
279 lvxl v16,c112,r2
280 addi r2,rd,128 // point to 2nd 128 bytes of dest
281 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
282 dcbz128 0,rd
283 dcbz128 0,r2
284 1:
285 addi rs,rs,256 // point to next source chunk
286 stvxl v1,0,rd
287 stvxl v2,c16,rd
288 stvxl v3,c32,rd
289 stvxl v4,c48,rd
290 stvxl v5,c64,rd
291 stvxl v6,c80,rd
292 stvxl v7,c96,rd
293 stvxl v8,c112,rd
294 addi rd,rd,256 // point to next dest chunk
295 stvxl v9,0,r2
296 stvxl v10,c16,r2
297 stvxl v11,c32,r2
298 stvxl v12,c48,r2
299 stvxl v13,c64,r2
300 stvxl v14,c80,r2
301 stvxl v15,c96,r2
302 stvxl v16,c112,r2
303 bdnz++ LalignedLoop // loop if another 256 bytes to go
304
305
306 // Done, except for 0..255 leftover bytes at end.
307 // rs = source ptr
308 // rd = dest ptr
309 // rc = remaining count in low 7 bits
310 // rv = caller's vrsave
311 // rx = caller's return address
312
313 Ldone:
314 andi. r5,rc,0xFF // any leftover bytes? (0..255)
315 mtspr vrsave,rv // restore bitmap of live vr's
316
317 mr r3,rd
318 mr r4,rs
319 bnela _COMM_PAGE_MEMCPY // copy leftover bytes
320
321 mtlr rx // restore return address
322 ld r3,rzR3(r1) // restore non-volatile GPRs from redzone
323 ld r13,rzR13(r1)
324 ld r14,rzR14(r1)
325 ld r15,rzR15(r1)
326 ld r16,rzR16(r1)
327 blr
328
329
330 COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
331