/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove().
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth:
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = constant 16 ("c16")
 *   r7  = constant 32 ("c32")
 *   r8  = constant 48 ("c48")
 *   r9  = constant 128 ("c128")
 *   r10 = vrsave ("rv")
 *   r11 = constant 256 ("c256")
 *   r12 = destination ptr ("rd")
 *   r13 = constant 384 ("c384")
 *   r14 = temp ("rx")
 *   r15 = temp ("rt")
 */
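
// At a high level, the code below is equivalent to this rough sketch
// (illustrative only; CHUNK is 384 bytes when the source is misaligned
// and 512 bytes when it is 16-byte aligned):
//      while (rd & 127) { *rd++ = *rs++; rc--; }       // cache-line-align dest
//      while (rc >= CHUNK) {                           // main vector loop
//          touch in next chunk; dcbz128 dest lines if safe;
//          copy CHUNK bytes with lvxl/stvxl (+vperm if misaligned);
//          rc -= CHUNK;
//      }
//      while (rc >= 32) { copy 32 bytes with lvx/stvx; rc -= 32; }
//      while (rc--) *rd++ = *rs++;                     // final leftovers
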
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10
#define rx      r14
#define rt      r15

#define c16     r6
#define c32     r7
#define c48     r8
#define c128    r9
#define c256    r11
#define c384    r13

// Offsets within the "red zone" (which is 224 bytes long):

#define rzR13   -8
#define rzR14   -12
#define rzR15   -16
#define rzV20   -32
#define rzV21   -48
#define rzV22   -64
#define rzV23   -80
#define rzV24   -96
#define rzV25   -112
#define rzV26   -128
#define rzV27   -144
#define rzV28   -160
#define rzV29   -176
#define rzV30   -192
#define rzV31   -208
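
// The red zone lies below the stack pointer (r1) and may be used without
// allocating a stack frame, so the non-volatile GPRs and VRs we use are
// simply spilled at negative offsets from r1 and reloaded before returning.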

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl  EXT(bigcopy_970)


// Entry point.  This is a subroutine of bcopy().  When called:
//      r4  = source ptr (aka "rs")
//      r12 = dest ptr (aka "rd")
//      r5  = length (>= 16K bytes) (aka "rc")
//
// We only do "forward" moves, i.e. non-overlapping or toward 0.
//
// We return with non-volatiles and r3 preserved.

        .align  5
bigcopy_970:
        stw     r13,rzR13(r1)           // spill non-volatile regs we use to redzone
        stw     r14,rzR14(r1)
        stw     r15,rzR15(r1)
        li      r0,rzV20
        neg     rt,rd                   // start to cache-line-align destination
        stvx    v20,r1,r0               // we use all 32 VRs
        li      r0,rzV21
        stvx    v21,r1,r0
        li      r0,rzV22
        stvx    v22,r1,r0
        li      r0,rzV23
        stvx    v23,r1,r0
        li      r0,rzV24
        andi.   rt,rt,127               // get #bytes to 128-byte align
        stvx    v24,r1,r0
        li      r0,rzV25
        stvx    v25,r1,r0
        li      r0,rzV26
        sub     rc,rc,rt                // adjust length by #bytes to align destination
        stvx    v26,r1,r0
        li      r0,rzV27
        stvx    v27,r1,r0
        li      r0,rzV28
        mtctr   rt                      // #bytes to align destination
        stvx    v28,r1,r0
        li      r0,rzV29
        stvx    v29,r1,r0
        li      r0,rzV30
        stvx    v30,r1,r0
        li      r0,rzV31
        stvx    v31,r1,r0
        beq     2f                      // dest already 128-byte aligned
        b       1f                      // enter byte-copy loop (skips ".align" padding)


// Cache-line-align destination.

        .align  5
1:
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        bdnz    1b
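
// (CTR was loaded above with the 0..127 alignment byte count, and rc was
// already decremented by that amount.)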

// Is source 16-byte aligned?  Load constant offsets.

2:
        andi.   r0,rs,15                // check source alignment
        mfspr   rv,vrsave               // save caller's bitmask
        li      r0,-1                   // we use all 32 VRs
        li      c16,16                  // load the constant offsets for x-form ops
        li      c32,32
        li      c48,48
        li      c128,128
        li      c256,256
        li      c384,384
        mtspr   vrsave,r0
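
// (Setting vrsave to -1 marks all 32 vector registers as live, so the
// kernel preserves them all across interrupts and context switches.)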

// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only while cr7 "beq" remains set.  We must also make sure
// the dcbz's cannot zero source bytes before we load them: we zero the
// destination before loading (rather than after loading and before
// storing) because it is faster.
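// Concretely: an iteration of the aligned loop dcbz's up to 512 bytes
// (4 cache lines) of destination before the corresponding source bytes are
// loaded, so a downward overlapping move with (rs-rd) < 512 must leave
// dcbz disabled; the cr7 "not equal" setting below handles that case.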

        cmpw    cr7,r0,r0               // initialize cr7 beq to use dcbz128
        sub     rt,rs,rd                // get (rs-rd)
        cmplwi  cr1,rt,512              // are we moving down less than 512 bytes?

// Start fetching in source cache lines.

        dcbt    c128,rs                 // first line already touched in
        dcbt    c256,rs
        dcbt    c384,rs

        bge++   cr1,3f                  // skip if not moving down less than 512 bytes
        cmpw    cr7,c16,c32             // cannot dcbz since it would zero source bytes
3:
        beq     LalignedLoop            // handle aligned sources
        lvsl    v0,0,rs                 // get permute vector for left shift
        lvxl    v1,0,rs                 // prime the loop
        b       LunalignedLoop          // enter unaligned loop


// Main loop for unaligned operands.  We loop over 384-byte chunks (3 cache lines)
// since we need a few VRs for permuted destination QWs and the permute vector.
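//
// The misalignment is handled with the standard AltiVec idiom: v0 = lvsl(0,rs)
// turns the low four bits of the source address into a permute control, and
// each destination quadword is produced as vperm(vN,vN+1,v0) from two
// adjacent aligned source quadwords.  v1 carries the last quadword loaded
// into the next iteration ("get 1st qw of next chunk" below).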

        .align  5
LunalignedLoop:
        subi    rc,rc,384               // decrement byte count
        addi    rx,rs,384               // get address of next chunk
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        bne--   cr7,1f                  // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd                    // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f                  // catch it first time through
        dcbz128 c128,rd
        dcbz128 c256,rd
1:
        addi    rt,rs,64
        dcbt    0,rx                    // touch in next chunk
        dcbt    c128,rx
        dcbt    c256,rx
        lvxl    v4,c48,rs
        addi    rs,rs,128
        lvxl    v5,0,rt
        cmplwi  rc,384                  // another chunk to go?
        lvxl    v6,c16,rt
        lvxl    v7,c32,rt
        lvxl    v8,c48,rt
        addi    rt,rs,64
        vperm   v25,v1,v2,v0
        lvxl    v9,0,rs
        lvxl    v10,c16,rs
        vperm   v26,v2,v3,v0
        lvxl    v11,c32,rs
        lvxl    v12,c48,rs
        vperm   v27,v3,v4,v0
        addi    rs,rs,128
        lvxl    v13,0,rt
        lvxl    v14,c16,rt
        vperm   v28,v4,v5,v0
        lvxl    v15,c32,rt
        lvxl    v16,c48,rt
        vperm   v29,v5,v6,v0
        addi    rt,rs,64
        lvxl    v17,0,rs
        lvxl    v18,c16,rs
        vperm   v30,v6,v7,v0
        lvxl    v19,c32,rs
        lvxl    v20,c48,rs
        vperm   v31,v7,v8,v0
        addi    rs,rs,128
        lvxl    v21,0,rt
        lvxl    v22,c16,rt
        vperm   v2,v8,v9,v0
        lvxl    v23,c32,rt
        lvxl    v24,c48,rt
        vperm   v3,v9,v10,v0
        lvx     v1,0,rs                 // get 1st qw of next chunk
        vperm   v4,v10,v11,v0

        addi    rt,rd,64
        stvxl   v25,0,rd
        stvxl   v26,c16,rd
        vperm   v5,v11,v12,v0
        stvxl   v27,c32,rd
        stvxl   v28,c48,rd
        vperm   v6,v12,v13,v0
        addi    rd,rd,128
        stvxl   v29,0,rt
        stvxl   v30,c16,rt
        vperm   v7,v13,v14,v0
        stvxl   v31,c32,rt
        stvxl   v2,c48,rt
        vperm   v8,v14,v15,v0
        addi    rt,rd,64
        stvxl   v3,0,rd
        stvxl   v4,c16,rd
        vperm   v9,v15,v16,v0
        stvxl   v5,c32,rd
        stvxl   v6,c48,rd
        vperm   v10,v16,v17,v0
        addi    rd,rd,128
        stvxl   v7,0,rt
        vperm   v11,v17,v18,v0
        stvxl   v8,c16,rt
        stvxl   v9,c32,rt
        vperm   v12,v18,v19,v0
        stvxl   v10,c48,rt
        addi    rt,rd,64
        vperm   v13,v19,v20,v0
        stvxl   v11,0,rd
        stvxl   v12,c16,rd
        vperm   v14,v20,v21,v0
        stvxl   v13,c32,rd
        vperm   v15,v21,v22,v0
        stvxl   v14,c48,rd
        vperm   v16,v22,v23,v0
        addi    rd,rd,128
        stvxl   v15,0,rt
        vperm   v17,v23,v24,v0
        stvxl   v16,c16,rt
        vperm   v18,v24,v1,v0
        stvxl   v17,c32,rt
        stvxl   v18,c48,rt
        bge++   LunalignedLoop          // loop if another 384 bytes to go

// End of unaligned main loop.  Handle up to 383 leftover bytes.

        srwi.   r0,rc,5                 // get count of 32-byte chunks remaining
        beq     Ldone                   // none
        rlwinm  rc,rc,0,0x1F            // mask count down to 0..31 leftover bytes
        mtctr   r0
1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   v8,v1,v2,v0
        vperm   v9,v2,v3,v0
        vor     v1,v3,v3                // v1 <- v3
        stvx    v8,0,rd
        stvx    v9,c16,rd
        addi    rd,rd,32
        bdnz    1b

        b       Ldone


// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 512-byte chunks (4 cache lines.)
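//
// With a 16-byte-aligned source no permute is needed, so all 32 VRs carry
// data: each iteration loads 32 quadwords (512 bytes) with lvxl and stores
// them with stvxl, after zeroing the four destination lines with dcbz128
// when that is safe.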

        .align  5
LalignedLoop:
        subi    rc,rc,512               // decrement count
        addi    rx,rs,512               // address of next chunk
        lvxl    v1,0,rs
        lvxl    v2,c16,rs
        bne--   cr7,1f                  // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd                    // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f                  // catch it first time through
        dcbz128 c128,rd
        dcbz128 c256,rd
        dcbz128 c384,rd
1:
        addi    rt,rs,64
        dcbt    0,rx                    // touch in next chunk
        dcbt    c128,rx
        dcbt    c256,rx
        dcbt    c384,rx
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        addi    rs,rs,128
        lvxl    v5,0,rt
        cmplwi  rc,512                  // another chunk to go?
        lvxl    v6,c16,rt
        lvxl    v7,c32,rt
        lvxl    v8,c48,rt
        addi    rt,rs,64
        lvxl    v9,0,rs
        lvxl    v10,c16,rs
        lvxl    v11,c32,rs
        lvxl    v12,c48,rs
        addi    rs,rs,128
        lvxl    v13,0,rt
        lvxl    v14,c16,rt
        lvxl    v15,c32,rt
        lvxl    v16,c48,rt
        addi    rt,rs,64
        lvxl    v17,0,rs
        lvxl    v18,c16,rs
        lvxl    v19,c32,rs
        lvxl    v20,c48,rs
        addi    rs,rs,128
        lvxl    v21,0,rt
        lvxl    v22,c16,rt
        lvxl    v23,c32,rt
        lvxl    v24,c48,rt
        addi    rt,rs,64
        lvxl    v25,0,rs
        lvxl    v26,c16,rs
        lvxl    v27,c32,rs
        lvxl    v28,c48,rs
        addi    rs,rs,128
        lvxl    v29,0,rt
        lvxl    v30,c16,rt
        lvxl    v31,c32,rt
        lvxl    v0,c48,rt

        addi    rt,rd,64
        stvxl   v1,0,rd
        stvxl   v2,c16,rd
        stvxl   v3,c32,rd
        stvxl   v4,c48,rd
        addi    rd,rd,128
        stvxl   v5,0,rt
        stvxl   v6,c16,rt
        stvxl   v7,c32,rt
        stvxl   v8,c48,rt
        addi    rt,rd,64
        stvxl   v9,0,rd
        stvxl   v10,c16,rd
        stvxl   v11,c32,rd
        stvxl   v12,c48,rd
        addi    rd,rd,128
        stvxl   v13,0,rt
        stvxl   v14,c16,rt
        stvxl   v15,c32,rt
        stvxl   v16,c48,rt
        addi    rt,rd,64
        stvxl   v17,0,rd
        stvxl   v18,c16,rd
        stvxl   v19,c32,rd
        stvxl   v20,c48,rd
        addi    rd,rd,128
        stvxl   v21,0,rt
        stvxl   v22,c16,rt
        stvxl   v23,c32,rt
        stvxl   v24,c48,rt
        addi    rt,rd,64
        stvxl   v25,0,rd
        stvxl   v26,c16,rd
        stvxl   v27,c32,rd
        stvxl   v28,c48,rd
        addi    rd,rd,128
        stvxl   v29,0,rt
        stvxl   v30,c16,rt
        stvxl   v31,c32,rt
        stvxl   v0,c48,rt
        bge++   LalignedLoop            // loop if another 512 bytes to go

// End of aligned main loop.  Handle up to 511 leftover bytes.

        srwi.   r0,rc,5                 // get count of 32-byte chunks remaining
        beq     Ldone                   // none
        rlwinm  rc,rc,0,0x1F            // mask count down to 0..31 leftover bytes
        mtctr   r0
1:                                      // loop over 32-byte chunks
        lvx     v1,0,rs
        lvx     v2,c16,rs
        addi    rs,rs,32
        stvx    v1,0,rd
        stvx    v2,c16,rd
        addi    rd,rd,32
        bdnz    1b
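
// The aligned path falls through into Ldone below; the unaligned path
// branches here after its own 32-byte cleanup loop.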

// Done, except for 0..31 leftovers at end.  Restore non-volatiles.
//      rs = source ptr
//      rd = dest ptr
//      rc = count (0..31)
//      rv = caller's vrsave

Ldone:
        cmpwi   rc,0                    // any leftover bytes?
        lwz     r13,rzR13(r1)           // restore non-volatiles from redzone
        lwz     r14,rzR14(r1)
        lwz     r15,rzR15(r1)
        li      r0,rzV20
        lvx     v20,r1,r0
        li      r0,rzV21
        lvx     v21,r1,r0
        li      r0,rzV22
        lvx     v22,r1,r0
        li      r0,rzV23
        lvx     v23,r1,r0
        li      r0,rzV24
        lvx     v24,r1,r0
        li      r0,rzV25
        lvx     v25,r1,r0
        li      r0,rzV26
        lvx     v26,r1,r0
        li      r0,rzV27
        lvx     v27,r1,r0
        li      r0,rzV28
        lvx     v28,r1,r0
        li      r0,rzV29
        lvx     v29,r1,r0
        li      r0,rzV30
        lvx     v30,r1,r0
        li      r0,rzV31
        lvx     v31,r1,r0
        mtspr   vrsave,rv               // restore caller's bitmask
        beqlr                           // done if no leftover bytes


// Handle 1..31 leftover bytes at end.
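// (Note that the non-volatile registers have already been restored; this
// loop uses only r0, rs, rd, and the CTR, all of which are volatile.)
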
        mtctr   rc                      // set up loop count
        b       1f

        .align  5
1:
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        bdnz    1b

        blr


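// Register this routine for inclusion on the commpage at _COMM_PAGE_BIGCOPY.
// The trailing zero arguments presumably encode CPU-capability matching
// criteria; all zero here, hence the "load on all machines" note.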
        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,0)      // load on all machines for now