/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 2/21/2004, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove(),
 * and runs in both 32- and 64-bit mode.
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth
 * (a rough per-chunk sketch follows this comment):
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive).
 *     We use 256-byte chunks.
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 */
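
// Illustrative sketch only: roughly what one iteration of the main loops
// below does, expressed as C-like pseudocode.  "dcbz_ok" stands for the cr7
// flag that the kernel may clear, and copy_256() for the sixteen vector
// loads and stores; both names are ours, not part of this file's interface.
//
//      while (chunks--) {
//          dcbt(src + 256);  dcbt(src + 384);      // touch in next chunk
//          if (dcbz_ok) {                          // avoid reading dest lines
//              dcbz128(dst);  dcbz128(dst + 128);
//          }
//          copy_256(dst, src);                     // two full cache lines
//          src += 256;  dst += 256;
//      }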

#define rs      r13
#define rd      r14
#define rc      r15
#define rx      r16

#define c16     r3
#define c32     r4
#define c48     r5
#define c64     r6
#define c80     r7
#define c96     r8
#define c112    r9
#define c256    r10
#define c384    r11
#define rv      r12     // vrsave

// Offsets within the "red zone" (which is 224 bytes long):

#define rzR3    -8
#define rzR13   -16
#define rzR14   -24
#define rzR15   -32
#define rzR16   -40

#define rzV20   -64
#define rzV21   -80
#define rzV22   -96
#define rzV23   -112

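// (These are negative offsets from r1: the Mac OS X PowerPC ABI reserves a
// 224-byte "red zone" below the stack pointer, so registers can be spilled
// there without adjusting r1 or allocating a frame.)
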

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations (see the example below):
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        (for example, all "andi." and almost all "rlwinm." are fine)
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
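
// For example, under this port the chunk-count computation used below,
//
//      srwi    r2,rc,8             // 32-bit commpage: rc/256
//
// is rewritten by the kernel as
//
//      srdi    r2,rc,8             // 64-bit commpage: same shift, doubleword
//
// and any "cmpw"/"cmpwi" would likewise become "cmpd"/"cmpdi".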

// Entry point.  This is a subroutine of bcopy().  When called:
//      r0  = return address (also stored in caller's SF)
//      r4  = source ptr
//      r5  = length (at least several pages)
//      r12 = dest ptr
//
// We only do "forward" moves, ie non-overlapping or toward 0.  We return with non-volatiles
// and r3 preserved.

        .align  5
bigcopy_970:
        neg     r2,r12              // is destination cache-line-aligned?
        std     r3,rzR3(r1)         // save caller's r3, which must be preserved for memcpy()
        std     r13,rzR13(r1)       // spill non-volatile regs we use to redzone
        std     r14,rzR14(r1)
        std     r15,rzR15(r1)
        andi.   r2,r2,0x7F          // #bytes to align
        std     r16,rzR16(r1)
        mr      rs,r4               // copy parameters into nonvolatile registers
        mr      rd,r12
        mr      rc,r5
        mr      rx,r0               // also save return address
        beq     1f                  // skip if already aligned
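
// ("neg" followed by "andi. ...,0x7F" computes (-dest) mod 128, ie the number
// of bytes from the destination up to the next 128-byte boundary; zero means
// it is already aligned.)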

// Cache-line-align destination.

        mr      r3,rd               // set up dest ptr for memcpy()
        mr      r5,r2               // number of bytes to copy
        add     rs,rs,r2            // then bump our parameters past initial copy
        add     rd,rd,r2
        sub     rc,rc,r2
        bla     _COMM_PAGE_MEMCPY   // 128-byte-align destination
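
// ("bla" is a branch-and-link to an absolute address; that is safe here
// because the commpage is mapped at a fixed address in every process.)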


// Load constant offsets and check whether source is 16-byte aligned.
// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only if cr7 beq is set.

1:
        dcbt    0,rs                // touch in 1st line of source
        andi.   r0,rs,15            // check source alignment
        mfspr   rv,vrsave           // save caller's bitmask
        li      c16,16              // load the constant offsets for x-form ops
        li      c32,32
        srwi    r2,rc,8             // get number of 256-byte chunks to xfer
        li      r0,-256             // we use 24 VRs (ie, 0-23)
        li      c48,48
        li      c64,64
        li      c80,80
        or      r0,r0,rv            // add our bits to caller's
        li      c96,96
        mtctr   r2                  // set up loop count
        li      c112,112
        cmpd    cr7,r2,r2           // initialize cr7_eq to "on", so we dcbz128
        mtspr   vrsave,r0           // say we use vr0..vr23
        li      c256,256
        li      c384,384
        beq     LalignedLoop        // handle aligned sources
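
// (-256 is 0xFFFFFF00: in vrsave the most-significant bit corresponds to v0,
// so the 24 high bits mark v0-v23 as live.  "cmpd r2,r2" always sets cr7_eq,
// arming the dcbz128 path until the kernel turns it off.)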


// Set up for unaligned loop.

        lvsl    v0,0,rs             // get permute vector for left shift
        lvxl    v1,0,rs             // prime the loop
        li      r0,rzV20            // save non-volatile VRs in redzone
        stvx    v20,r1,r0
        li      r0,rzV21
        stvx    v21,r1,r0
        li      r0,rzV22
        stvx    v22,r1,r0
        li      r0,rzV23
        stvx    v23,r1,r0
        b       LunalignedLoop      // enter unaligned loop


// Main loop for unaligned operands.  We loop over 256-byte chunks (2 cache lines).
// Destination is 128-byte aligned, source is unaligned.
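//
// (This is the standard AltiVec misaligned-load idiom: lvsl built a permute
// control vector from the low four bits of the source address, and each
// "vperm vD,vA,vB,v0" below extracts one aligned 16-byte window from the
// concatenation vA:vB.)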

        .align  5
LunalignedLoop:
        dcbt    c256,rs             // touch in next chunk
        dcbt    c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        addi    rs,rs,256           // point to next source chunk
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        vperm   v17,v1,v2,v0
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        vperm   v18,v2,v3,v0
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        vperm   v19,v3,v4,v0
        lvxl    v16,c112,r2
        lvxl    v1,0,rs             // peek ahead at first source quad in next chunk
        vperm   v20,v4,v5,v0
        addi    r2,rd,128           // point to 2nd 128 bytes of dest
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd
        dcbz128 0,r2
1:
        vperm   v21,v5,v6,v0
        stvxl   v17,0,rd
        vperm   v22,v6,v7,v0
        stvxl   v18,c16,rd
        vperm   v23,v7,v8,v0
        stvxl   v19,c32,rd
        vperm   v17,v8,v9,v0
        stvxl   v20,c48,rd
        vperm   v18,v9,v10,v0
        stvxl   v21,c64,rd
        vperm   v19,v10,v11,v0
        stvxl   v22,c80,rd
        vperm   v20,v11,v12,v0
        stvxl   v23,c96,rd
        vperm   v21,v12,v13,v0
        stvxl   v17,c112,rd
        vperm   v22,v13,v14,v0
        addi    rd,rd,256           // point to next dest chunk
        stvxl   v18,0,r2
        vperm   v23,v14,v15,v0
        stvxl   v19,c16,r2
        vperm   v17,v15,v16,v0
        stvxl   v20,c32,r2
        vperm   v18,v16,v1,v0
        stvxl   v21,c48,r2
        stvxl   v22,c64,r2
        stvxl   v23,c80,r2
        stvxl   v17,c96,r2
        stvxl   v18,c112,r2
        bdnz++  LunalignedLoop      // loop if another 256 bytes to go
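
// ("++" and "--" are static branch-prediction hints: the loop branch is
// predicted taken, and the cr7 test above is predicted not taken.)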

        li      r6,rzV20            // restore non-volatile VRs
        li      r7,rzV21
        li      r8,rzV22
        li      r9,rzV23
        lvx     v20,r1,r6
        lvx     v21,r1,r7
        lvx     v22,r1,r8
        lvx     v23,r1,r9
        b       Ldone


// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 256-byte chunks (2 cache lines.)
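//
// (Both loops use the "l" forms, lvxl/stvxl, which hint that the touched
// cache lines are least-recently-used, limiting cache pollution while
// streaming through a very long operand.)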

        .align  5
LalignedLoop:
        dcbt    c256,rs             // touch in next chunk
        dcbt    c384,rs
        addi    r2,rs,128           // point to 2nd 128 bytes of source
        lvxl    v1,0,rs
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        lvxl    v5,c64,rs
        lvxl    v6,c80,rs
        lvxl    v7,c96,rs
        lvxl    v8,c112,rs
        lvxl    v9,0,r2
        lvxl    v10,c16,r2
        lvxl    v11,c32,r2
        lvxl    v12,c48,r2
        lvxl    v13,c64,r2
        lvxl    v14,c80,r2
        lvxl    v15,c96,r2
        lvxl    v16,c112,r2
        addi    r2,rd,128           // point to 2nd 128 bytes of dest
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd
        dcbz128 0,r2
1:
        addi    rs,rs,256           // point to next source chunk
        stvxl   v1,0,rd
        stvxl   v2,c16,rd
        stvxl   v3,c32,rd
        stvxl   v4,c48,rd
        stvxl   v5,c64,rd
        stvxl   v6,c80,rd
        stvxl   v7,c96,rd
        stvxl   v8,c112,rd
        addi    rd,rd,256           // point to next dest chunk
        stvxl   v9,0,r2
        stvxl   v10,c16,r2
        stvxl   v11,c32,r2
        stvxl   v12,c48,r2
        stvxl   v13,c64,r2
        stvxl   v14,c80,r2
        stvxl   v15,c96,r2
        stvxl   v16,c112,r2
        bdnz++  LalignedLoop        // loop if another 256 bytes to go


// Done, except for 0..255 leftover bytes at end.
//      rs = source ptr
//      rd = dest ptr
//      rc = remaining count in low 8 bits
//      rv = caller's vrsave
//      rx = caller's return address

Ldone:
        andi.   r5,rc,0xFF          // any leftover bytes? (0..255)
        mtspr   vrsave,rv           // restore bitmap of live vr's

        mr      r3,rd
        mr      r4,rs
        bnela   _COMM_PAGE_MEMCPY   // copy leftover bytes
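
// ("bnela" is a conditional branch-and-link to an absolute address: the
// leftover copy runs only when the andi. above left cr0 "not equal", ie
// when 1..255 bytes remain.)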

        mtlr    rx                  // restore return address
        ld      r3,rzR3(r1)         // restore non-volatile GPRs from redzone
        ld      r13,rzR13(r1)
        ld      r14,rzR14(r1)
        ld      r15,rzR15(r1)
        ld      r16,rzR16(r1)
        blr


        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)