]>
Commit | Line | Data |
---|---|---|
55e303ae A |
1 | /* |
2 | * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. | |
3 | * | |
8ad349bb | 4 | * @APPLE_LICENSE_OSREFERENCE_HEADER_START@ |
55e303ae | 5 | * |
8ad349bb A |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License | |
8 | * Version 2.0 (the 'License'). You may not use this file except in | |
9 | * compliance with the License. The rights granted to you under the | |
10 | * License may not be used to create, or enable the creation or | |
11 | * redistribution of, unlawful or unlicensed copies of an Apple operating | |
12 | * system, or to circumvent, violate, or enable the circumvention or | |
13 | * violation of, any terms of an Apple operating system software license | |
14 | * agreement. | |
15 | * | |
16 | * Please obtain a copy of the License at | |
17 | * http://www.opensource.apple.com/apsl/ and read it before using this | |
18 | * file. | |
19 | * | |
20 | * The Original Code and all software distributed under the License are | |
21 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
22 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, | |
23 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
24 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. | |
25 | * Please see the License for the specific language governing rights and | |
26 | * limitations under the License. | |
27 | * | |
28 | * @APPLE_LICENSE_OSREFERENCE_HEADER_END@ | |
55e303ae A |
29 | */ |
30 | /* ==================================== | |
31 | * Very Long Operand BCOPY for Mac OS X | |
32 | * ==================================== | |
33 | * | |
91447636 A |
34 | * Version of 2/21/2004, tuned for the IBM 970. This is for operands at |
35 | * least several pages long. It is called from bcopy()/memcpy()/memmove(), | |
36 | * and runs both in 32 and 64-bit mode. | |
55e303ae A |
37 | * |
38 | * We use the following additional strategies not used by the shorter | |
39 | * operand paths. Mostly, we try to optimize for memory bandwidth: | |
40 | * 1. Use DCBZ128 to avoid reading destination lines. Because this code | |
41 | * resides on the commpage, it can use a private interface with the | |
42 | * kernel to minimize alignment exceptions if the destination is | |
43 | * uncached. The kernel will clear cr7 whenever it emulates a DCBZ or | |
44 | * DCBZ128 on the commpage. Thus we take at most one exception per call, | |
45 | * which is amortized across the very long operand. | |
46 | * 2. Copy larger chunks per iteration to minimize R/W bus turnaround | |
47 | * and maximize DRAM page locality (opening a new page is expensive.) | |
91447636 | 48 | * We use 256-byte chunks. |
55e303ae A |
49 | * 3. Touch in one source chunk ahead with DCBT. This is probably the |
50 | * least important change, and probably only helps restart the | |
51 | * hardware stream at the start of each source page. | |
55e303ae | 52 | */ |
91447636 A |
53 | |
54 | #define rs r13 | |
55 | #define rd r14 | |
56 | #define rc r15 | |
57 | #define rx r16 | |
58 | ||
59 | #define c16 r3 | |
60 | #define c32 r4 | |
61 | #define c48 r5 | |
62 | #define c64 r6 | |
63 | #define c80 r7 | |
64 | #define c96 r8 | |
65 | #define c112 r9 | |
66 | #define c256 r10 | |
67 | #define c384 r11 | |
68 | #define rv r12 // holds caller's vrsave until it is restored at Ldone | |
55e303ae A |
69 | |
70 | // Offsets from r1 into the "red zone" (which is 224 bytes long), used as register spill slots: | |
71 | ||
91447636 A |
72 | #define rzR3 -8 |
73 | #define rzR13 -16 | |
74 | #define rzR14 -24 | |
75 | #define rzR15 -32 | |
76 | #define rzR16 -40 | |
77 | ||
78 | #define rzV20 -64 | |
79 | #define rzV21 -80 | |
80 | #define rzV22 -96 | |
81 | #define rzV23 -112 | |
55e303ae A |
82 | |
83 | ||
84 | #include <sys/appleapiopts.h> | |
85 | #include <ppc/asm.h> | |
86 | #include <machine/cpu_capabilities.h> | |
87 | #include <machine/commpage.h> | |
88 | ||
89 | .text | |
91447636 A |
90 | /* |
91 | * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary | |
92 | * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following | |
93 | * simple transformations: | |
94 | * - all word compares are changed to doubleword | |
95 | * - all "srwi[.]" opcodes are changed to "srdi[.]" | |
96 | * Nothing else is done. For this to work, the following rules must be | |
97 | * carefully followed: | |
98 | * - do not use carry or overflow | |
99 | * - only use record mode if you are sure the results are mode-invariant | |
100 | * for example, all "andi." and almost all "rlwinm." are fine | |
101 | * - do not use "slwi", "slw", or "srw" | |
102 | * An imaginative programmer could break the porting model in other ways, but the above | |
103 | * are the most likely problem areas. It is perhaps surprising how well in practice | |
104 | * this simple method works. | |
105 | */ | |
55e303ae A |
106 | |
107 | // Entry point. This is a subroutine of bcopy(). When called: | |
91447636 A |
108 | // r0 = return address (also stored in caller's SF) |
109 | // r4 = source ptr | |
110 | // r5 = length (at least several pages) | |
111 | // r12 = dest ptr | |
55e303ae | 112 | // |
91447636 A |
113 | // We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles |
114 | // and r3 preserved. | |
55e303ae A |
115 | |
116 | .align 5 | |
117 | bigcopy_970: | |
91447636 A |
118 | neg r2,r12 // r2 = -dest; its low 7 bits = #bytes needed to 128-byte-align dest |
119 | std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy() | |
120 | std r13,rzR13(r1) // spill non-volatile regs we use to redzone | |
121 | std r14,rzR14(r1) | |
122 | std r15,rzR15(r1) | |
123 | andi. r2,r2,0x7F // #bytes to align (0..127); sets cr0 for the beq below | |
124 | std r16,rzR16(r1) | |
125 | mr rs,r4 // copy parameters into nonvolatile registers | |
126 | mr rd,r12 | |
127 | mr rc,r5 | |
128 | mr rx,r0 // also save return address | |
129 | beq 1f // skip if already aligned | |
55e303ae A |
130 | |
131 | // Cache-line-align destination. | |
91447636 A |
132 | |
133 | mr r3,rd // set up dest ptr for memcpy() (r4 still holds the source ptr) | |
134 | mr r5,r2 // number of bytes to copy | |
135 | add rs,rs,r2 // then bump our parameters past initial copy | |
136 | add rd,rd,r2 | |
137 | sub rc,rc,r2 | |
138 | bla _COMM_PAGE_MEMCPY // copy the 1..127 leading bytes so dest becomes 128-byte aligned | |
55e303ae A |
139 | |
140 | ||
91447636 A |
141 | // Load constant offsets and check whether source is 16-byte aligned. |
142 | // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage, | |
143 | // and we dcbz only if cr7 beq is set. | |
55e303ae | 144 |
91447636 A |
145 | 1: |
146 | dcbt 0,rs // touch in 1st line of source | |
55e303ae A |
147 | andi. r0,rs,15 // check source alignment (cr0 is tested by "beq LalignedLoop" below) |
148 | mfspr rv,vrsave // save caller's bitmask | |
55e303ae A |
149 | li c16,16 // load the constant offsets for x-form ops |
150 | li c32,32 | |
91447636 A |
151 | srwi r2,rc,8 // get number of 256-byte chunks to xfer |
152 | li r0,-256 // 0xFFFFFF00: vrsave mask for the 24 VRs we use (ie, 0-23) | |
55e303ae | 153 | li c48,48 |
91447636 A |
154 | li c64,64 |
155 | li c80,80 | |
156 | or r0,r0,rv // add our bits to caller's | |
157 | li c96,96 | |
158 | mtctr r2 // set up loop count | |
159 | li c112,112 | |
160 | cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128 | |
161 | mtspr vrsave,r0 // say we use vr0..vr23 | |
55e303ae A |
162 | li c256,256 |
163 | li c384,384 | |
91447636 | 164 | beq LalignedLoop // handle aligned sources |
55e303ae | 165 |
55e303ae | 166 |
91447636 | 167 | // Set up for unaligned loop. |
55e303ae | 168 |
55e303ae A |
169 | lvsl v0,0,rs // get permute vector for left shift |
170 | lvxl v1,0,rs // prime the loop: v1 = first source quadword, which the loop expects preloaded | |
91447636 A |
171 | li r0,rzV20 // save non-volatile VRs in redzone |
172 | stvx v20,r1,r0 | |
173 | li r0,rzV21 | |
174 | stvx v21,r1,r0 | |
175 | li r0,rzV22 | |
176 | stvx v22,r1,r0 | |
177 | li r0,rzV23 | |
178 | stvx v23,r1,r0 | |
55e303ae A |
179 | b LunalignedLoop // enter unaligned loop |
180 | ||
181 | ||
91447636 A |
182 | // Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines). |
183 | // Destination is 128-byte aligned, source is unaligned. | |
55e303ae A |
184 | |
185 | .align 5 | |
186 | LunalignedLoop: | |
91447636 A |
187 | dcbt c256,rs // touch in next chunk (offsets +256 and +384 from rs) |
188 | dcbt c384,rs | |
189 | addi r2,rs,128 // point to 2nd 128 bytes of source | |
55e303ae A |
190 | lvxl v2,c16,rs |
191 | lvxl v3,c32,rs | |
91447636 A |
192 | lvxl v4,c48,rs |
193 | lvxl v5,c64,rs | |
194 | lvxl v6,c80,rs | |
195 | lvxl v7,c96,rs | |
196 | lvxl v8,c112,rs | |
197 | lvxl v9,0,r2 | |
198 | addi rs,rs,256 // point to next source chunk | |
199 | lvxl v10,c16,r2 | |
200 | lvxl v11,c32,r2 | |
201 | vperm v17,v1,v2,v0 | |
202 | lvxl v12,c48,r2 | |
203 | lvxl v13,c64,r2 | |
204 | vperm v18,v2,v3,v0 | |
205 | lvxl v14,c80,r2 | |
206 | lvxl v15,c96,r2 | |
207 | vperm v19,v3,v4,v0 | |
208 | lvxl v16,c112,r2 | |
209 | lvxl v1,0,rs // peek ahead at first source quad in next chunk (it is v1 for the next iteration) | |
210 | vperm v20,v4,v5,v0 | |
211 | addi r2,rd,128 // point to 2nd 128 bytes of dest | |
55e303ae | 212 | bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel |
91447636 A |
213 | dcbz128 0,rd |
214 | dcbz128 0,r2 | |
55e303ae | 215 | 1: |
91447636 A |
216 | vperm v21,v5,v6,v0 |
217 | stvxl v17,0,rd | |
218 | vperm v22,v6,v7,v0 | |
219 | stvxl v18,c16,rd | |
220 | vperm v23,v7,v8,v0 | |
221 | stvxl v19,c32,rd | |
222 | vperm v17,v8,v9,v0 | |
223 | stvxl v20,c48,rd | |
224 | vperm v18,v9,v10,v0 | |
225 | stvxl v21,c64,rd | |
226 | vperm v19,v10,v11,v0 | |
227 | stvxl v22,c80,rd | |
228 | vperm v20,v11,v12,v0 | |
229 | stvxl v23,c96,rd | |
230 | vperm v21,v12,v13,v0 | |
231 | stvxl v17,c112,rd | |
232 | vperm v22,v13,v14,v0 | |
233 | addi rd,rd,256 // point to next dest chunk (r2 still addresses 2nd half of this one) | |
234 | stvxl v18,0,r2 | |
235 | vperm v23,v14,v15,v0 | |
236 | stvxl v19,c16,r2 | |
237 | vperm v17,v15,v16,v0 | |
238 | stvxl v20,c32,r2 | |
239 | vperm v18,v16,v1,v0 | |
240 | stvxl v21,c48,r2 | |
241 | stvxl v22,c64,r2 | |
242 | stvxl v23,c80,r2 | |
243 | stvxl v17,c96,r2 | |
244 | stvxl v18,c112,r2 | |
245 | bdnz++ LunalignedLoop // loop if another 256 bytes to go | |
246 | |
247 | li r6,rzV20 // restore non-volatile VRs v20-v23 from their redzone slots | |
248 | li r7,rzV21 | |
249 | li r8,rzV22 | |
250 | li r9,rzV23 | |
251 | lvx v20,r1,r6 | |
252 | lvx v21,r1,r7 | |
253 | lvx v22,r1,r8 | |
254 | lvx v23,r1,r9 | |
255 | b Ldone | |
55e303ae A |
256 | |
257 | ||
258 | // Aligned loop. Destination is 128-byte aligned, and source is 16-byte | |
91447636 | 259 | // aligned. Loop over 256-byte chunks (2 cache lines.) |
55e303ae A |
260 | |
261 | .align 5 | |
262 | LalignedLoop: | |
91447636 A |
263 | dcbt c256,rs // touch in next chunk (offsets +256 and +384 from rs) |
264 | dcbt c384,rs | |
265 | addi r2,rs,128 // point to 2nd 128 bytes of source | |
55e303ae A |
266 | lvxl v1,0,rs |
267 | lvxl v2,c16,rs | |
55e303ae A |
268 | lvxl v3,c32,rs |
269 | lvxl v4,c48,rs | |
91447636 A |
270 | lvxl v5,c64,rs |
271 | lvxl v6,c80,rs | |
272 | lvxl v7,c96,rs | |
273 | lvxl v8,c112,rs | |
274 | lvxl v9,0,r2 | |
275 | lvxl v10,c16,r2 | |
276 | lvxl v11,c32,r2 | |
277 | lvxl v12,c48,r2 | |
278 | lvxl v13,c64,r2 | |
279 | lvxl v14,c80,r2 | |
280 | lvxl v15,c96,r2 | |
281 | lvxl v16,c112,r2 | |
282 | addi r2,rd,128 // point to 2nd 128 bytes of dest (r2 is reused now that source loads are issued) | |
283 | bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel | |
284 | dcbz128 0,rd | |
285 | dcbz128 0,r2 | |
286 | 1: | |
287 | addi rs,rs,256 // point to next source chunk | |
55e303ae A |
288 | stvxl v1,0,rd |
289 | stvxl v2,c16,rd | |
290 | stvxl v3,c32,rd | |
291 | stvxl v4,c48,rd | |
91447636 A |
292 | stvxl v5,c64,rd |
293 | stvxl v6,c80,rd | |
294 | stvxl v7,c96,rd | |
295 | stvxl v8,c112,rd | |
296 | addi rd,rd,256 // point to next dest chunk (r2 still addresses 2nd half of this one) | |
297 | stvxl v9,0,r2 | |
298 | stvxl v10,c16,r2 | |
299 | stvxl v11,c32,r2 | |
300 | stvxl v12,c48,r2 | |
301 | stvxl v13,c64,r2 | |
302 | stvxl v14,c80,r2 | |
303 | stvxl v15,c96,r2 | |
304 | stvxl v16,c112,r2 | |
305 | bdnz++ LalignedLoop // loop if another 256 bytes to go; otherwise fall through to Ldone | |
306 | ||
307 | ||
308 | // Done, except for 0..255 leftover bytes at end. | |
55e303ae A |
309 | // rs = source ptr |
310 | // rd = dest ptr | |
91447636 | 311 | // rc = remaining count in low 8 bits |
55e303ae | 312 | // rv = caller's vrsave |
91447636 | 313 | // rx = caller's return address |
55e303ae A |
314 | |
315 | Ldone: | |
91447636 A |
316 | andi. r5,rc,0xFF // r5 = leftover byte count (0..255); sets cr0 for the bnela below |
317 | mtspr vrsave,rv // restore bitmap of live vr's | |
55e303ae | 318 |
91447636 A |
319 | mr r3,rd |
320 | mr r4,rs | |
321 | bnela _COMM_PAGE_MEMCPY // copy leftover bytes (the call is skipped when r5 is zero) | |
322 | |
323 | mtlr rx // restore return address | |
324 | ld r3,rzR3(r1) // restore caller's r3 and the non-volatile GPRs from redzone | |
325 | ld r13,rzR13(r1) | |
326 | ld r14,rzR14(r1) | |
327 | ld r15,rzR15(r1) | |
328 | ld r16,rzR16(r1) | |
55e303ae A |
329 | blr |
330 | ||
331 | ||
91447636 | 332 | COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth) |
55e303ae | 333 |