]>
Commit | Line | Data |
---|---|---|
55e303ae A |
1 | /* |
2 | * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. | |
3 | * | |
2d21ac55 | 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
55e303ae | 5 | * |
2d21ac55 A |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the License | |
10 | * may not be used to create, or enable the creation or redistribution of, | |
11 | * unlawful or unlicensed copies of an Apple operating system, or to | |
12 | * circumvent, violate, or enable the circumvention or violation of, any | |
13 | * terms of an Apple operating system software license agreement. | |
8f6c56a5 | 14 | * |
2d21ac55 A |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. | |
17 | * | |
18 | * The Original Code and all software distributed under the License are | |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
8f6c56a5 A |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
2d21ac55 A |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and | |
24 | * limitations under the License. | |
8f6c56a5 | 25 | * |
2d21ac55 | 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
55e303ae A |
27 | */ |
28 | /* ==================================== | |
29 | * Very Long Operand BCOPY for Mac OS X | |
30 | * ==================================== | |
31 | * | |
91447636 A |
32 | * Version of 2/21/2004, tuned for the IBM 970. This is for operands at |
33 | * least several pages long. It is called from bcopy()/memcpy()/memmove(), | |
34 | * and runs both in 32 and 64-bit mode. | |
55e303ae A |
35 | * |
36 | * We use the following additional strategies not used by the shorter | |
37 | * operand paths. Mostly, we try to optimize for memory bandwidth: | |
38 | * 1. Use DCBZ128 to avoid reading destination lines. Because this code | |
39 | * resides on the commpage, it can use a private interface with the | |
40 | * kernel to minimize alignment exceptions if the destination is | |
41 | * uncached. The kernel will clear cr7 whenever it emulates a DCBZ or | |
42 | * DCBZ128 on the commpage. Thus we take at most one exception per call, | |
43 | * which is amortized across the very long operand. | |
44 | * 2. Copy larger chunks per iteration to minimize R/W bus turnaround | |
45 | * and maximize DRAM page locality (opening a new page is expensive.) | |
91447636 | 46 | * We use 256-byte chunks. |
55e303ae A |
47 | * 3. Touch in one source chunk ahead with DCBT. This is probably the |
48 | * least important change, and probably only helps restart the | |
49 | * hardware stream at the start of each source page. | |
55e303ae | 50 | */ |
91447636 A |
51 | |
52 | #define rs r13 | |
53 | #define rd r14 | |
54 | #define rc r15 | |
55 | #define rx r16 | |
56 | ||
57 | #define c16 r3 | |
58 | #define c32 r4 | |
59 | #define c48 r5 | |
60 | #define c64 r6 | |
61 | #define c80 r7 | |
62 | #define c96 r8 | |
63 | #define c112 r9 | |
64 | #define c256 r10 | |
65 | #define c384 r11 | |
66 | #define rv r12 // vrsave | |
55e303ae A |
67 | |
68 | // Offsets within the "red zone" (which is 224 bytes long): | |
69 | ||
91447636 A |
70 | #define rzR3 -8 |
71 | #define rzR13 -16 | |
72 | #define rzR14 -24 | |
73 | #define rzR15 -32 | |
74 | #define rzR16 -40 | |
75 | ||
76 | #define rzV20 -64 | |
77 | #define rzV21 -80 | |
78 | #define rzV22 -96 | |
79 | #define rzV23 -112 | |
55e303ae A |
80 | |
81 | ||
82 | #include <sys/appleapiopts.h> | |
83 | #include <ppc/asm.h> | |
84 | #include <machine/cpu_capabilities.h> | |
85 | #include <machine/commpage.h> | |
86 | ||
87 | .text | |
91447636 A |
88 | /* |
89 | * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary | |
90 | * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following | |
91 | * simple transformations: | |
92 | * - all word compares are changed to doubleword | |
93 | * - all "srwi[.]" opcodes are changed to "srdi[.]" | |
94 | * Nothing else is done. For this to work, the following rules must be | |
95 | * carefully followed: | |
96 | * - do not use carry or overflow | |
97 | * - only use record mode if you are sure the results are mode-invariant | |
98 | * for example, all "andi." and almost all "rlwinm." are fine | |
99 | * - do not use "slwi", "slw", or "srw" | |
100 | * An imaginative programmer could break the porting model in other ways, but the above | |
101 | * are the most likely problem areas. It is perhaps surprising how well in practice | |
102 | * this simple method works. | |
103 | */ | |
55e303ae A |
104 | |
105 | // Entry point. This is a subroutine of bcopy(). When called: | |
91447636 A |
106 | // r0 = return address (also stored in caller's SF); kept in rx (r16) across the memcpy() calls |
107 | // r4 = source ptr (copied to rs = r13) | |
108 | // r5 = length (at least several pages) (copied to rc = r15) | |
109 | // r12 = dest ptr (copied to rd = r14) | |
55e303ae | 110 | // |
91447636 A |
111 | // We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles |
112 | // and r3 preserved: r3 and r13-r16 are spilled to the red zone on entry and reloaded before the blr. | |
55e303ae A |
113 | |
114 | .align 5 | |
115 | bigcopy_970: | |
91447636 A |
116 | neg r2,r12 // is destination cache-line-aligned? (r2 = -dest) |
117 | std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy() (r3 is reused below as c16 and as a memcpy() argument) | |
118 | std r13,rzR13(r1) // spill non-volatile regs we use to redzone (r13..r16 = rs, rd, rc, rx) | |
119 | std r14,rzR14(r1) | |
120 | std r15,rzR15(r1) | |
121 | andi. r2,r2,0x7F // #bytes to align = (-dest) & 127; cr0 set here is tested by the beq below | |
122 | std r16,rzR16(r1) | |
123 | mr rs,r4 // copy parameters into nonvolatile registers (rs = source, rd = dest, rc = length) | |
124 | mr rd,r12 | |
125 | mr rc,r5 | |
126 | mr rx,r0 // also save return address (restored with mtlr at Ldone) | |
127 | beq 1f // skip if already aligned (r2 == 0) | |
55e303ae A |
128 | |
129 | // Cache-line-align destination by copying the first r2 bytes (1..127) with the commpage memcpy(). | |
91447636 A |
130 | |
131 | mr r3,rd // set up dest ptr for memcpy() | |
132 | mr r5,r2 // number of bytes to copy (r4 still holds the source ptr) | |
133 | add rs,rs,r2 // then bump our parameters past initial copy | |
134 | add rd,rd,r2 | |
135 | sub rc,rc,r2 | |
136 | bla _COMM_PAGE_MEMCPY // 128-byte-align destination (absolute branch-and-link to the commpage routine) | |
55e303ae A |
137 | |
138 | ||
91447636 A |
139 | // Load constant offsets and check whether source is 16-byte aligned. |
140 | // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage, | |
141 | // and we dcbz only if cr7 beq is set. | |
55e303ae | 142 | |
91447636 A |
143 | 1: |
144 | dcbt 0,rs // touch in 1st line of source | |
55e303ae A |
145 | andi. r0,rs,15 // check source alignment; cr0 is consumed by the beq to LalignedLoop below |
146 | mfspr rv,vrsave // save caller's bitmask (restored at Ldone) | |
55e303ae A |
147 | li c16,16 // load the constant offsets for x-form ops (c16..c384 alias r3..r11) |
148 | li c32,32 | |
91447636 A |
149 | srwi r2,rc,8 // get number of 256-byte chunks to xfer (rc >> 8); expected nonzero given the several-page minimum length |
150 | li r0,-256 // we use 24 VRs (ie, 0-23): -256 = 0xFFFFFF00 sets the 24 high-order vrsave bits | |
55e303ae | 151 | li c48,48 |
91447636 A |
152 | li c64,64 |
153 | li c80,80 | |
154 | or r0,r0,rv // add our bits to caller's | |
155 | li c96,96 | |
156 | mtctr r2 // set up loop count (ctr = number of 256-byte chunks) | |
157 | li c112,112 | |
158 | cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128 (comparing r2 with itself is always equal) | |
159 | mtspr vrsave,r0 // say we use vr0..vr23 | |
55e303ae A |
160 | li c256,256 |
161 | li c384,384 | |
91447636 | 162 | beq LalignedLoop // handle aligned sources (cr0 eq from the andi. above) |
55e303ae | 163 | |
55e303ae | 164 | |
91447636 | 165 | // Set up for unaligned loop. Only this path uses v20-v23, so they are saved here and restored after the loop. |
55e303ae | 166 | |
55e303ae A |
167 | lvsl v0,0,rs // get permute vector for left shift (derived from the low 4 bits of rs) |
168 | lvxl v1,0,rs // prime the loop (lvxl ignores the low 4 address bits, so this is the aligned quad containing rs) | |
91447636 A |
169 | li r0,rzV20 // save non-volatile VRs in redzone (stvx takes its offset from a register, hence the li's) |
170 | stvx v20,r1,r0 | |
171 | li r0,rzV21 | |
172 | stvx v21,r1,r0 | |
173 | li r0,rzV22 | |
174 | stvx v22,r1,r0 | |
175 | li r0,rzV23 | |
176 | stvx v23,r1,r0 | |
55e303ae A |
177 | b LunalignedLoop // enter unaligned loop |
178 | ||
179 | ||
91447636 A |
180 | // Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines). |
181 | // Destination is 128-byte aligned, source is unaligned. At the top of each iteration v0 holds the lvsl permute vector and v1 the first source quad of the chunk. | |
55e303ae A |
182 | |
183 | .align 5 | |
184 | LunalignedLoop: | |
91447636 A |
185 | dcbt c256,rs // touch in next chunk (the lines at rs+256 and rs+384) |
186 | dcbt c384,rs | |
187 | addi r2,rs,128 // point to 2nd 128 bytes of source | |
55e303ae A |
188 | lvxl v2,c16,rs |
189 | lvxl v3,c32,rs | |
91447636 A |
190 | lvxl v4,c48,rs |
191 | lvxl v5,c64,rs | |
192 | lvxl v6,c80,rs | |
193 | lvxl v7,c96,rs | |
194 | lvxl v8,c112,rs | |
195 | lvxl v9,0,r2 | |
196 | addi rs,rs,256 // point to next source chunk (r2 still addresses the 2nd half of the current one) | |
197 | lvxl v10,c16,r2 | |
198 | lvxl v11,c32,r2 | |
199 | vperm v17,v1,v2,v0 | |
200 | lvxl v12,c48,r2 | |
201 | lvxl v13,c64,r2 | |
202 | vperm v18,v2,v3,v0 | |
203 | lvxl v14,c80,r2 | |
204 | lvxl v15,c96,r2 | |
205 | vperm v19,v3,v4,v0 | |
206 | lvxl v16,c112,r2 | |
207 | lvxl v1,0,rs // peek ahead at first source quad in next chunk (needed for the last vperm of this chunk, then carried into the next iteration) | |
208 | vperm v20,v4,v5,v0 | |
209 | addi r2,rd,128 // point to 2nd 128 bytes of dest | |
55e303ae | 210 | bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel |
91447636 A |
211 | dcbz128 0,rd |
212 | dcbz128 0,r2 | |
55e303ae | 213 | 1: |
91447636 A |
214 | vperm v21,v5,v6,v0 |
215 | stvxl v17,0,rd | |
216 | vperm v22,v6,v7,v0 | |
217 | stvxl v18,c16,rd | |
218 | vperm v23,v7,v8,v0 | |
219 | stvxl v19,c32,rd | |
220 | vperm v17,v8,v9,v0 | |
221 | stvxl v20,c48,rd | |
222 | vperm v18,v9,v10,v0 | |
223 | stvxl v21,c64,rd | |
224 | vperm v19,v10,v11,v0 | |
225 | stvxl v22,c80,rd | |
226 | vperm v20,v11,v12,v0 | |
227 | stvxl v23,c96,rd | |
228 | vperm v21,v12,v13,v0 | |
229 | stvxl v17,c112,rd | |
230 | vperm v22,v13,v14,v0 | |
231 | addi rd,rd,256 // point to next dest chunk (remaining stores go through r2, the 2nd half of the current chunk) | |
232 | stvxl v18,0,r2 | |
233 | vperm v23,v14,v15,v0 | |
234 | stvxl v19,c16,r2 | |
235 | vperm v17,v15,v16,v0 | |
236 | stvxl v20,c32,r2 | |
237 | vperm v18,v16,v1,v0 | |
238 | stvxl v21,c48,r2 | |
239 | stvxl v22,c64,r2 | |
240 | stvxl v23,c80,r2 | |
241 | stvxl v17,c96,r2 | |
242 | stvxl v18,c112,r2 | |
243 | bdnz++ LunalignedLoop // loop if another 256 bytes to go | |
244 | ||
245 | li r6,rzV20 // restore non-volatile VRs (r6..r9 were the c64..c112 offsets, no longer needed) | |
246 | li r7,rzV21 | |
247 | li r8,rzV22 | |
248 | li r9,rzV23 | |
249 | lvx v20,r1,r6 | |
250 | lvx v21,r1,r7 | |
251 | lvx v22,r1,r8 | |
252 | lvx v23,r1,r9 | |
253 | b Ldone | |
55e303ae A |
254 | |
255 | ||
256 | // Aligned loop. Destination is 128-byte aligned, and source is 16-byte | |
91447636 | 257 | // aligned. Loop over 256-byte chunks (2 cache lines.) No vperm is needed here, so only v1-v16 are used and v20-v23 are never touched. |
55e303ae A |
258 | |
259 | .align 5 | |
260 | LalignedLoop: | |
91447636 A |
261 | dcbt c256,rs // touch in next chunk (the lines at rs+256 and rs+384) |
262 | dcbt c384,rs | |
263 | addi r2,rs,128 // point to 2nd 128 bytes of source | |
55e303ae A |
264 | lvxl v1,0,rs |
265 | lvxl v2,c16,rs | |
55e303ae A |
266 | lvxl v3,c32,rs |
267 | lvxl v4,c48,rs | |
91447636 A |
268 | lvxl v5,c64,rs |
269 | lvxl v6,c80,rs | |
270 | lvxl v7,c96,rs | |
271 | lvxl v8,c112,rs | |
272 | lvxl v9,0,r2 | |
273 | lvxl v10,c16,r2 | |
274 | lvxl v11,c32,r2 | |
275 | lvxl v12,c48,r2 | |
276 | lvxl v13,c64,r2 | |
277 | lvxl v14,c80,r2 | |
278 | lvxl v15,c96,r2 | |
279 | lvxl v16,c112,r2 | |
280 | addi r2,rd,128 // point to 2nd 128 bytes of dest (all 16 source quads are already loaded) | |
281 | bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel | |
282 | dcbz128 0,rd | |
283 | dcbz128 0,r2 | |
284 | 1: | |
285 | addi rs,rs,256 // point to next source chunk | |
55e303ae A |
286 | stvxl v1,0,rd |
287 | stvxl v2,c16,rd | |
288 | stvxl v3,c32,rd | |
289 | stvxl v4,c48,rd | |
91447636 A |
290 | stvxl v5,c64,rd |
291 | stvxl v6,c80,rd | |
292 | stvxl v7,c96,rd | |
293 | stvxl v8,c112,rd | |
294 | addi rd,rd,256 // point to next dest chunk (remaining stores go through r2, the 2nd half of the current chunk) | |
295 | stvxl v9,0,r2 | |
296 | stvxl v10,c16,r2 | |
297 | stvxl v11,c32,r2 | |
298 | stvxl v12,c48,r2 | |
299 | stvxl v13,c64,r2 | |
300 | stvxl v14,c80,r2 | |
301 | stvxl v15,c96,r2 | |
302 | stvxl v16,c112,r2 | |
303 | bdnz++ LalignedLoop // loop if another 256 bytes to go; otherwise fall through to Ldone | |
304 | ||
305 | ||
306 | // Done, except for 0..255 leftover bytes at end. | |
55e303ae A |
307 | // rs = source ptr |
308 | // rd = dest ptr | |
91447636 | 309 | // rc = remaining count in low 8 bits |
55e303ae | 310 | // rv = caller's vrsave |
91447636 | 311 | // rx = caller's return address |
55e303ae A |
312 | |
313 | Ldone: | |
91447636 A |
314 | andi. r5,rc,0xFF // any leftover bytes? (0..255); r5 is the memcpy() length and cr0 drives the bnela below |
315 | mtspr vrsave,rv // restore bitmap of live vr's | |
55e303ae | 316 | |
91447636 A |
317 | mr r3,rd |
318 | mr r4,rs | |
319 | bnela _COMM_PAGE_MEMCPY // copy leftover bytes (call is skipped when r5 == 0) | |
320 | |
321 | mtlr rx // restore return address | |
322 | ld r3,rzR3(r1) // restore caller's r3 and the non-volatile GPRs from redzone | |
323 | ld r13,rzR13(r1) | |
324 | ld r14,rzR14(r1) | |
325 | ld r15,rzR15(r1) | |
326 | ld r16,rzR16(r1) | |
55e303ae A |
327 | blr |
328 | ||
329 | ||
91447636 | 330 | COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth) |
55e303ae | 331 |