]>
Commit | Line | Data |
---|---|---|
55e303ae A |
1 | /* |
2 | * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. | |
3 | * | |
6601e61a | 4 | * @APPLE_LICENSE_HEADER_START@ |
55e303ae | 5 | * |
6601e61a A |
6 | * The contents of this file constitute Original Code as defined in and |
7 | * are subject to the Apple Public Source License Version 1.1 (the | |
8 | * "License"). You may not use this file except in compliance with the | |
9 | * License. Please obtain a copy of the License at | |
10 | * http://www.apple.com/publicsource and read it before using this file. | |
8f6c56a5 | 11 | * |
6601e61a A |
12 | * This Original Code and all software distributed under the License are |
13 | * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER | |
8f6c56a5 A |
14 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
15 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, | |
6601e61a A |
16 | * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the |
17 | * License for the specific language governing rights and limitations | |
18 | * under the License. | |
8f6c56a5 | 19 | * |
6601e61a | 20 | * @APPLE_LICENSE_HEADER_END@ |
55e303ae A |
21 | */ |
22 | /* ==================================== | |
23 | * Very Long Operand BCOPY for Mac OS X | |
24 | * ==================================== | |
25 | * | |
91447636 A |
26 | * Version of 2/21/2004, tuned for the IBM 970. This is for operands at |
27 | * least several pages long. It is called from bcopy()/memcpy()/memmove(), | |
28 | * and runs both in 32 and 64-bit mode. | |
55e303ae A |
29 | * |
30 | * We use the following additional strategies not used by the shorter | |
31 | * operand paths. Mostly, we try to optimize for memory bandwidth: | |
32 | * 1. Use DCBZ128 to avoid reading destination lines. Because this code | |
33 | * resides on the commpage, it can use a private interface with the | |
34 | * kernel to minimize alignment exceptions if the destination is | |
35 | * uncached. The kernel will clear cr7 whenever it emulates a DCBZ or | |
36 | * DCBZ128 on the commpage. Thus we take at most one exception per call, | |
37 | * which is amortized across the very long operand. | |
38 | * 2. Copy larger chunks per iteration to minimize R/W bus turnaround | |
39 | * and maximize DRAM page locality (opening a new page is expensive.) | |
91447636 | 40 | * We use 256-byte chunks. |
55e303ae A |
41 | * 3. Touch in one source chunk ahead with DCBT. This is probably the |
42 | * least important change, and probably only helps restart the | |
43 | * hardware stream at the start of each source page. | |
55e303ae | 44 | */ |
91447636 A |
45 | |
46 | #define rs r13 | |
47 | #define rd r14 | |
48 | #define rc r15 | |
49 | #define rx r16 | |
50 | ||
51 | #define c16 r3 | |
52 | #define c32 r4 | |
53 | #define c48 r5 | |
54 | #define c64 r6 | |
55 | #define c80 r7 | |
56 | #define c96 r8 | |
57 | #define c112 r9 | |
58 | #define c256 r10 | |
59 | #define c384 r11 | |
60 | #define rv r12 // vrsave | |
55e303ae A |
61 | |
62 | // Offsets within the "red zone" (which is 224 bytes long): | |
63 | ||
91447636 A |
64 | #define rzR3 -8 |
65 | #define rzR13 -16 | |
66 | #define rzR14 -24 | |
67 | #define rzR15 -32 | |
68 | #define rzR16 -40 | |
69 | ||
70 | #define rzV20 -64 | |
71 | #define rzV21 -80 | |
72 | #define rzV22 -96 | |
73 | #define rzV23 -112 | |
55e303ae A |
74 | |
75 | ||
76 | #include <sys/appleapiopts.h> | |
77 | #include <ppc/asm.h> | |
78 | #include <machine/cpu_capabilities.h> | |
79 | #include <machine/commpage.h> | |
80 | ||
81 | .text | |
91447636 A |
82 | /* |
83 | * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary | |
84 | * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following | |
85 | * simple transformations: | |
86 | * - all word compares are changed to doubleword | |
87 | * - all "srwi[.]" opcodes are changed to "srdi[.]" | |
88 | * Nothing else is done. For this to work, the following rules must be | |
89 | * carefully followed: | |
90 | * - do not use carry or overflow | |
91 | * - only use record mode if you are sure the results are mode-invariant | |
92 | * for example, all "andi." and almost all "rlwinm." are fine | |
93 | * - do not use "slwi", "slw", or "srw" | |
94 | * An imaginative programmer could break the porting model in other ways, but the above | |
95 | * are the most likely problem areas. It is perhaps surprising how well in practice | |
96 | * this simple method works. | |
97 | */ | |
55e303ae A |
98 | |
99 | // Entry point. This is a subroutine of bcopy(). When called: | |
91447636 A |
100 | // r0 = return address (also stored in caller's SF) |
101 | // r4 = source ptr | |
102 | // r5 = length (at least several pages) | |
103 | // r12 = dest ptr | |
55e303ae | 104 | // |
91447636 A |
105 | // We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles |
106 | // and r3 preserved. | |
55e303ae A |
107 | |
108 | .align 5 | |
109 | bigcopy_970: | |
91447636 A |
110 | neg r2,r12 // negate dest ptr; low 7 bits then give distance to next 128-byte boundary |
111 | std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy() (r3 is reused below as c16) | |
112 | std r13,rzR13(r1) // spill non-volatile regs we use (r13-r16) to redzone | |
113 | std r14,rzR14(r1) | |
114 | std r15,rzR15(r1) | |
115 | andi. r2,r2,0x7F // r2 = #bytes needed to 128-byte-align dest; cr0_eq set if already aligned | |
116 | std r16,rzR16(r1) | |
117 | mr rs,r4 // copy parameters into nonvolatile registers | |
118 | mr rd,r12 | |
119 | mr rc,r5 | |
120 | mr rx,r0 // also save return address | |
121 | beq 1f // skip if already aligned (cr0 from the andi. above) | |
55e303ae A |
122 | |
123 | // Cache-line-align destination. | |
91447636 A |
124 | |
125 | mr r3,rd // set up dest ptr for memcpy(); r4 still holds the source ptr from entry | |
126 | mr r5,r2 // number of bytes to copy | |
127 | add rs,rs,r2 // then bump our parameters past initial copy | |
128 | add rd,rd,r2 | |
129 | sub rc,rc,r2 | |
130 | bla _COMM_PAGE_MEMCPY // 128-byte-align destination | |
55e303ae A |
131 | |
132 | ||
91447636 A |
133 | // Load constant offsets and check whether source is 16-byte aligned. |
134 | // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage, | |
135 | // and we dcbz only if cr7 beq is set. | |
55e303ae | 136 | |
91447636 A |
137 | 1: |
138 | dcbt 0,rs // touch in 1st line of source | |
55e303ae A |
139 | andi. r0,rs,15 // check source alignment; cr0 is not tested until the beq LalignedLoop below |
140 | mfspr rv,vrsave // save caller's bitmask | |
55e303ae A |
141 | li c16,16 // load the constant offsets for x-form ops |
142 | li c32,32 | |
91447636 A |
143 | srwi r2,rc,8 // get number of 256-byte chunks to xfer |
144 | li r0,-256 // mask bits for the 24 VRs we use (ie, 0-23) | |
55e303ae | 145 | li c48,48 |
91447636 A |
146 | li c64,64 |
147 | li c80,80 | |
148 | or r0,r0,rv // add our bits to caller's | |
149 | li c96,96 | |
150 | mtctr r2 // set up loop count | |
151 | li c112,112 | |
152 | cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128 | |
153 | mtspr vrsave,r0 // say we use vr0..vr23 | |
55e303ae A |
154 | li c256,256 |
155 | li c384,384 | |
91447636 | 156 | beq LalignedLoop // source is 16-byte aligned (cr0 from "andi. r0,rs,15" above) |
55e303ae | 157 | |
55e303ae | 158 | |
91447636 | 159 | // Set up for unaligned loop. |
55e303ae | 160 | |
55e303ae A |
161 | lvsl v0,0,rs // get permute vector for left shift |
162 | lvxl v1,0,rs // prime the loop: v1 = first source quad | |
91447636 A |
163 | li r0,rzV20 // save non-volatile VRs (v20-v23) in redzone |
164 | stvx v20,r1,r0 | |
165 | li r0,rzV21 | |
166 | stvx v21,r1,r0 | |
167 | li r0,rzV22 | |
168 | stvx v22,r1,r0 | |
169 | li r0,rzV23 | |
170 | stvx v23,r1,r0 | |
55e303ae A |
171 | b LunalignedLoop // enter unaligned loop |
172 | ||
173 | ||
91447636 A |
174 | // Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines). |
175 | // Destination is 128-byte aligned, source is unaligned. | |
55e303ae A |
176 | |
177 | .align 5 | |
178 | LunalignedLoop: | |
91447636 A |
179 | dcbt c256,rs // touch in next 256-byte chunk (both of its 128-byte lines) |
180 | dcbt c384,rs | |
181 | addi r2,rs,128 // point to 2nd 128 bytes of source | |
55e303ae A |
182 | lvxl v2,c16,rs |
183 | lvxl v3,c32,rs | |
91447636 A |
184 | lvxl v4,c48,rs |
185 | lvxl v5,c64,rs | |
186 | lvxl v6,c80,rs | |
187 | lvxl v7,c96,rs | |
188 | lvxl v8,c112,rs | |
189 | lvxl v9,0,r2 | |
190 | addi rs,rs,256 // point to next source chunk (r2 still addresses the current one) | |
191 | lvxl v10,c16,r2 | |
192 | lvxl v11,c32,r2 | |
193 | vperm v17,v1,v2,v0 | |
194 | lvxl v12,c48,r2 | |
195 | lvxl v13,c64,r2 | |
196 | vperm v18,v2,v3,v0 | |
197 | lvxl v14,c80,r2 | |
198 | lvxl v15,c96,r2 | |
199 | vperm v19,v3,v4,v0 | |
200 | lvxl v16,c112,r2 | |
201 | lvxl v1,0,rs // peek ahead at first source quad in next chunk (needed by the last vperm below) | |
202 | vperm v20,v4,v5,v0 | |
203 | addi r2,rd,128 // point to 2nd 128 bytes of dest | |
55e303ae | 204 | bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel |
91447636 A |
205 | dcbz128 0,rd |
206 | dcbz128 0,r2 | |
55e303ae | 207 | 1: |
91447636 A |
208 | vperm v21,v5,v6,v0 |
209 | stvxl v17,0,rd | |
210 | vperm v22,v6,v7,v0 | |
211 | stvxl v18,c16,rd | |
212 | vperm v23,v7,v8,v0 | |
213 | stvxl v19,c32,rd | |
214 | vperm v17,v8,v9,v0 | |
215 | stvxl v20,c48,rd | |
216 | vperm v18,v9,v10,v0 | |
217 | stvxl v21,c64,rd | |
218 | vperm v19,v10,v11,v0 | |
219 | stvxl v22,c80,rd | |
220 | vperm v20,v11,v12,v0 | |
221 | stvxl v23,c96,rd | |
222 | vperm v21,v12,v13,v0 | |
223 | stvxl v17,c112,rd | |
224 | vperm v22,v13,v14,v0 | |
225 | addi rd,rd,256 // point to next dest chunk (r2 still addresses 2nd half of this one) | |
226 | stvxl v18,0,r2 | |
227 | vperm v23,v14,v15,v0 | |
228 | stvxl v19,c16,r2 | |
229 | vperm v17,v15,v16,v0 | |
230 | stvxl v20,c32,r2 | |
231 | vperm v18,v16,v1,v0 | |
232 | stvxl v21,c48,r2 | |
233 | stvxl v22,c64,r2 | |
234 | stvxl v23,c80,r2 | |
235 | stvxl v17,c96,r2 | |
236 | stvxl v18,c112,r2 | |
237 | bdnz++ LunalignedLoop // loop if another 256 bytes to go | |
238 | ||
239 | li r6,rzV20 // restore non-volatile VRs (v20-v23) from redzone; r6-r9 (c64..c112) are free now | |
240 | li r7,rzV21 | |
241 | li r8,rzV22 | |
242 | li r9,rzV23 | |
243 | lvx v20,r1,r6 | |
244 | lvx v21,r1,r7 | |
245 | lvx v22,r1,r8 | |
246 | lvx v23,r1,r9 | |
247 | b Ldone | |
55e303ae A |
248 | |
249 | ||
250 | // Aligned loop. Destination is 128-byte aligned, and source is 16-byte | |
91447636 | 251 | // aligned. Loop over 256-byte chunks (2 cache lines.) |
55e303ae A |
252 | |
253 | .align 5 | |
254 | LalignedLoop: | |
91447636 A |
255 | dcbt c256,rs // touch in next 256-byte chunk (both of its 128-byte lines) |
256 | dcbt c384,rs | |
257 | addi r2,rs,128 // point to 2nd 128 bytes of source | |
55e303ae A |
258 | lvxl v1,0,rs |
259 | lvxl v2,c16,rs | |
55e303ae A |
260 | lvxl v3,c32,rs |
261 | lvxl v4,c48,rs | |
91447636 A |
262 | lvxl v5,c64,rs |
263 | lvxl v6,c80,rs | |
264 | lvxl v7,c96,rs | |
265 | lvxl v8,c112,rs | |
266 | lvxl v9,0,r2 | |
267 | lvxl v10,c16,r2 | |
268 | lvxl v11,c32,r2 | |
269 | lvxl v12,c48,r2 | |
270 | lvxl v13,c64,r2 | |
271 | lvxl v14,c80,r2 | |
272 | lvxl v15,c96,r2 | |
273 | lvxl v16,c112,r2 | |
274 | addi r2,rd,128 // point to 2nd 128 bytes of dest (all 16 source quads are loaded by now) | |
275 | bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel | |
276 | dcbz128 0,rd | |
277 | dcbz128 0,r2 | |
278 | 1: | |
279 | addi rs,rs,256 // point to next source chunk | |
55e303ae A |
280 | stvxl v1,0,rd |
281 | stvxl v2,c16,rd | |
282 | stvxl v3,c32,rd | |
283 | stvxl v4,c48,rd | |
91447636 A |
284 | stvxl v5,c64,rd |
285 | stvxl v6,c80,rd | |
286 | stvxl v7,c96,rd | |
287 | stvxl v8,c112,rd | |
288 | addi rd,rd,256 // point to next dest chunk (r2 still addresses 2nd half of this one) | |
289 | stvxl v9,0,r2 | |
290 | stvxl v10,c16,r2 | |
291 | stvxl v11,c32,r2 | |
292 | stvxl v12,c48,r2 | |
293 | stvxl v13,c64,r2 | |
294 | stvxl v14,c80,r2 | |
295 | stvxl v15,c96,r2 | |
296 | stvxl v16,c112,r2 | |
297 | bdnz++ LalignedLoop // loop if another 256 bytes to go; otherwise fall through to Ldone | |
298 | ||
299 | ||
300 | // Done, except for 0..255 leftover bytes at end. | |
55e303ae A |
301 | // rs = source ptr |
302 | // rd = dest ptr | |
91447636 | 303 | // rc = remaining count in low 8 bits |
55e303ae | 304 | // rv = caller's vrsave |
91447636 | 305 | // rx = caller's return address |
55e303ae A |
306 | |
307 | Ldone: | |
91447636 A |
308 | andi. r5,rc,0xFF // r5 = leftover byte count (0..255); cr0 is tested by the bnela below |
309 | mtspr vrsave,rv // restore bitmap of live vr's | |
55e303ae | 310 | |
91447636 A |
311 | mr r3,rd |
312 | mr r4,rs | |
313 | bnela _COMM_PAGE_MEMCPY // copy leftover bytes, if any (skipped when r5 == 0) | |
314 | ||
315 | mtlr rx // restore return address | |
316 | ld r3,rzR3(r1) // restore caller's r3 and our non-volatile GPRs (r13-r16) from redzone | |
317 | ld r13,rzR13(r1) | |
318 | ld r14,rzR14(r1) | |
319 | ld r15,rzR15(r1) | |
320 | ld r16,rzR16(r1) | |
55e303ae A |
321 | blr |
322 | ||
323 | ||
91447636 | 324 | COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth) |
55e303ae | 325 |