/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove().
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth:
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = constant 16 ("c16")
 *   r7  = constant 32 ("c32")
 *   r8  = constant 48 ("c48")
 *   r9  = constant 128 ("c128")
 *   r10 = vrsave ("rv")
 *   r11 = constant 256 ("c256")
 *   r12 = destination ptr ("rd")
 *   r13 = constant 384 ("c384")
 *   r14 = temp ("rx")
 *   r15 = temp ("rt")
 */
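
/*
 * For orientation, a rough C sketch of the control flow implemented below.
 * (The helpers copy_chunk and copy32 are hypothetical names standing in for
 * the unrolled vector code in this file; this is a sketch of the structure,
 * not the implementation.)
 *
 *      void bigcopy_970(char *rd, const char *rs, unsigned long rc) {
 *          unsigned long n = (0 - (unsigned long)rd) & 127;    // dest to 128-byte boundary
 *          rc -= n;
 *          while (n--) { *rd++ = *rs++; }                      // byte loop "1:" below
 *          unsigned long chunk = ((unsigned long)rs & 15) ? 384 : 512;
 *          do {                                                // LunalignedLoop / LalignedLoop
 *              copy_chunk(rd, rs, chunk);                      // dcbz128 + lvxl/(vperm)/stvxl
 *              rd += chunk; rs += chunk; rc -= chunk;
 *          } while (rc >= chunk);
 *          while (rc >= 32) {                                  // 32-byte leftover loops
 *              copy32(rd, rs); rd += 32; rs += 32; rc -= 32;
 *          }
 *          while (rc--) { *rd++ = *rs++; }                     // final 0..31 bytes
 *      }
 */
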
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10
#define rx      r14
#define rt      r15

#define c16     r6
#define c32     r7
#define c48     r8
#define c128    r9
#define c256    r11
#define c384    r13

// Offsets within the "red zone" (which is 224 bytes long):

#define rzR13   -8
#define rzR14   -12
#define rzR15   -16
#define rzV20   -32
#define rzV21   -48
#define rzV22   -64
#define rzV23   -80
#define rzV24   -96
#define rzV25   -112
#define rzV26   -128
#define rzV27   -144
#define rzV28   -160
#define rzV29   -176
#define rzV30   -192
#define rzV31   -208
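
// These negative offsets address the red zone below the stack pointer (r1):
// since this is a leaf routine, it spills r13-r15 and v20-v31 there without
// allocating a stack frame, and the deepest slot (-208) stays inside the
// 224-byte zone.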


#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl  EXT(bigcopy_970)


// Entry point.  This is a subroutine of bcopy().  When called:
//      r4 = source ptr (aka "rs")
//     r12 = dest ptr (aka "rd")
//      r5 = length (>= 16K bytes) (aka "rc")
//
// We only do "forward" moves, i.e., non-overlapping or toward 0.
//
// We return with non-volatiles and r3 preserved.

        .align  5
bigcopy_970:
        stw     r13,rzR13(r1)           // spill non-volatile regs we use to redzone
        stw     r14,rzR14(r1)
        stw     r15,rzR15(r1)
        li      r0,rzV20
        neg     rt,rd                   // start to cache-line-align destination
        stvx    v20,r1,r0               // we use all 32 VRs
        li      r0,rzV21
        stvx    v21,r1,r0
        li      r0,rzV22
        stvx    v22,r1,r0
        li      r0,rzV23
        stvx    v23,r1,r0
        li      r0,rzV24
        andi.   rt,rt,127               // get #bytes to 128-byte align
        stvx    v24,r1,r0
        li      r0,rzV25
        stvx    v25,r1,r0
        li      r0,rzV26
        sub     rc,rc,rt                // adjust length by #bytes to align destination
        stvx    v26,r1,r0
        li      r0,rzV27
        stvx    v27,r1,r0
        li      r0,rzV28
        mtctr   rt                      // #bytes to align destination
        stvx    v28,r1,r0
        li      r0,rzV29
        stvx    v29,r1,r0
        li      r0,rzV30
        stvx    v30,r1,r0
        li      r0,rzV31
        stvx    v31,r1,r0
        beq     2f                      // dest already 128-byte aligned
        b       1f
149 | ||
150 | ||
151 | // Cache-line-align destination. | |
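// ctr was loaded above with (0 - rd) & 127, the number of bytes up to the
// next 128-byte boundary; e.g. a destination ending in 0x78 yields
// (-0x78) & 127 == 8, so eight single-byte moves are done here.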

        .align  5
1:
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        bdnz    1b


// Is source 16-byte aligned?  Load constant offsets.

2:
        andi.   r0,rs,15                // check source alignment
        mfspr   rv,vrsave               // save caller's bitmask
        li      r0,-1                   // we use all 32 VRs
        li      c16,16                  // load the constant offsets for x-form ops
        li      c32,32
        li      c48,48
        li      c128,128
        li      c256,256
        li      c384,384
        mtspr   vrsave,r0
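
// (Setting vrsave to all ones marks every vector register as live, so the
// kernel will save and restore all 32 VRs across any preemption.)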

// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only while cr7 beq is set.  We must make sure the dcbz's
// cannot zero source bytes before we load them; we zero before loading
// because that is faster than zeroing after the loads and before the stores.

        cmpw    cr7,r0,r0               // initialize cr7 beq to use dcbz128
        sub     rt,rs,rd                // get (rs-rd)
        cmplwi  cr1,rt,512              // are we moving down less than 512 bytes?
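
// Note on the cr1 test above: rt = rs - rd as an unsigned value.  If it is
// less than 512, the dcbz128's issued per chunk would land on source bytes
// that have not been loaded yet, so cr7 is forced off below to suppress them.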

// Start fetching in source cache lines.

        dcbt    c128,rs                 // first line already touched in
        dcbt    c256,rs
        dcbt    c384,rs
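
// (dcbt is architecturally a hint and cannot fault, so touching ahead is
// harmless even where the source is uncached or the touch runs past the
// end of the operand.)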

        bge++   cr1,3f                  // skip if not moving down less than 512 bytes
        cmpw    cr7,c16,c32             // cannot dcbz since it would zero source bytes
3:
        beq     LalignedLoop            // handle aligned sources
        lvsl    v0,0,rs                 // get permute vector for left shift
        lvxl    v1,0,rs                 // prime the loop
        b       LunalignedLoop          // enter unaligned loop


// Main loop for unaligned operands.  We loop over 384-byte chunks (3 cache lines)
// since we need a few VRs for permuted destination QWs and the permute vector.
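//
// The realignment idiom: lvsl computes a permute vector v0 that encodes the
// source misalignment k = (rs & 15); each vperm vD,vA,vB,v0 then selects the
// 16 bytes beginning at offset k within the 32-byte concatenation vA:vB,
// producing one aligned destination quadword from two adjacent aligned loads.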

        .align  5
LunalignedLoop:
        subi    rc,rc,384               // decrement byte count
        addi    rx,rs,384               // get address of next chunk
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        bne--   cr7,1f                  // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd                    // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f                  // catch it first time through
        dcbz128 c128,rd
        dcbz128 c256,rd
1:
        addi    rt,rs,64
        dcbt    0,rx                    // touch in next chunk
        dcbt    c128,rx
        dcbt    c256,rx
        lvxl    v4,c48,rs
        addi    rs,rs,128
        lvxl    v5,0,rt
        cmplwi  rc,384                  // another chunk to go?
        lvxl    v6,c16,rt
        lvxl    v7,c32,rt
        lvxl    v8,c48,rt
        addi    rt,rs,64
        vperm   v25,v1,v2,v0
        lvxl    v9,0,rs
        lvxl    v10,c16,rs
        vperm   v26,v2,v3,v0
        lvxl    v11,c32,rs
        lvxl    v12,c48,rs
        vperm   v27,v3,v4,v0
        addi    rs,rs,128
        lvxl    v13,0,rt
        lvxl    v14,c16,rt
        vperm   v28,v4,v5,v0
        lvxl    v15,c32,rt
        lvxl    v16,c48,rt
        vperm   v29,v5,v6,v0
        addi    rt,rs,64
        lvxl    v17,0,rs
        lvxl    v18,c16,rs
        vperm   v30,v6,v7,v0
        lvxl    v19,c32,rs
        lvxl    v20,c48,rs
        vperm   v31,v7,v8,v0
        addi    rs,rs,128
        lvxl    v21,0,rt
        lvxl    v22,c16,rt
        vperm   v2,v8,v9,v0
        lvxl    v23,c32,rt
        lvxl    v24,c48,rt
        vperm   v3,v9,v10,v0
        lvx     v1,0,rs                 // get 1st qw of next chunk
        vperm   v4,v10,v11,v0

        addi    rt,rd,64
        stvxl   v25,0,rd
        stvxl   v26,c16,rd
        vperm   v5,v11,v12,v0
        stvxl   v27,c32,rd
        stvxl   v28,c48,rd
        vperm   v6,v12,v13,v0
        addi    rd,rd,128
        stvxl   v29,0,rt
        stvxl   v30,c16,rt
        vperm   v7,v13,v14,v0
        stvxl   v31,c32,rt
        stvxl   v2,c48,rt
        vperm   v8,v14,v15,v0
        addi    rt,rd,64
        stvxl   v3,0,rd
        stvxl   v4,c16,rd
        vperm   v9,v15,v16,v0
        stvxl   v5,c32,rd
        stvxl   v6,c48,rd
        vperm   v10,v16,v17,v0
        addi    rd,rd,128
        stvxl   v7,0,rt
        vperm   v11,v17,v18,v0
        stvxl   v8,c16,rt
        stvxl   v9,c32,rt
        vperm   v12,v18,v19,v0
        stvxl   v10,c48,rt
        addi    rt,rd,64
        vperm   v13,v19,v20,v0
        stvxl   v11,0,rd
        stvxl   v12,c16,rd
        vperm   v14,v20,v21,v0
        stvxl   v13,c32,rd
        vperm   v15,v21,v22,v0
        stvxl   v14,c48,rd
        vperm   v16,v22,v23,v0
        addi    rd,rd,128
        stvxl   v15,0,rt
        vperm   v17,v23,v24,v0
        stvxl   v16,c16,rt
        vperm   v18,v24,v1,v0
        stvxl   v17,c32,rt
        stvxl   v18,c48,rt
        bge++   LunalignedLoop          // loop if another 384 bytes to go

// End of unaligned main loop.  Handle up to 383 leftover bytes.

        srwi.   r0,rc,5                 // get count of 32-byte chunks remaining
        beq     Ldone                   // none
        rlwinm  rc,rc,0,0x1F            // mask count down to 0..31 leftover bytes
        mtctr   r0
1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   v8,v1,v2,v0
        vperm   v9,v2,v3,v0
        vor     v1,v3,v3                // v1 <- v3
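// (v1 must carry the last QW loaded, since the next iteration's first
// vperm uses it as the left-hand half of its 32-byte window)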
        stvx    v8,0,rd
        stvx    v9,c16,rd
        addi    rd,rd,32
        bdnz    1b

        b       Ldone


// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 512-byte chunks (4 cache lines.)
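// Since no permute vector or staging registers are needed here, all 32 VRs
// can hold source data, so each iteration moves 512 bytes rather than the
// 384 moved per iteration by the unaligned path.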

        .align  5
LalignedLoop:
        subi    rc,rc,512               // decrement count
        addi    rx,rs,512               // address of next chunk
        lvxl    v1,0,rs
        lvxl    v2,c16,rs
        bne--   cr7,1f                  // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd                    // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f                  // catch it first time through
        dcbz128 c128,rd
        dcbz128 c256,rd
        dcbz128 c384,rd
1:
        addi    rt,rs,64
        dcbt    0,rx                    // touch in next chunk
        dcbt    c128,rx
        dcbt    c256,rx
        dcbt    c384,rx
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        addi    rs,rs,128
        lvxl    v5,0,rt
        cmplwi  rc,512                  // another chunk to go?
        lvxl    v6,c16,rt
        lvxl    v7,c32,rt
        lvxl    v8,c48,rt
        addi    rt,rs,64
        lvxl    v9,0,rs
        lvxl    v10,c16,rs
        lvxl    v11,c32,rs
        lvxl    v12,c48,rs
        addi    rs,rs,128
        lvxl    v13,0,rt
        lvxl    v14,c16,rt
        lvxl    v15,c32,rt
        lvxl    v16,c48,rt
        addi    rt,rs,64
        lvxl    v17,0,rs
        lvxl    v18,c16,rs
        lvxl    v19,c32,rs
        lvxl    v20,c48,rs
        addi    rs,rs,128
        lvxl    v21,0,rt
        lvxl    v22,c16,rt
        lvxl    v23,c32,rt
        lvxl    v24,c48,rt
        addi    rt,rs,64
        lvxl    v25,0,rs
        lvxl    v26,c16,rs
        lvxl    v27,c32,rs
        lvxl    v28,c48,rs
        addi    rs,rs,128
        lvxl    v29,0,rt
        lvxl    v30,c16,rt
        lvxl    v31,c32,rt
        lvxl    v0,c48,rt

        addi    rt,rd,64
        stvxl   v1,0,rd
        stvxl   v2,c16,rd
        stvxl   v3,c32,rd
        stvxl   v4,c48,rd
        addi    rd,rd,128
        stvxl   v5,0,rt
        stvxl   v6,c16,rt
        stvxl   v7,c32,rt
        stvxl   v8,c48,rt
        addi    rt,rd,64
        stvxl   v9,0,rd
        stvxl   v10,c16,rd
        stvxl   v11,c32,rd
        stvxl   v12,c48,rd
        addi    rd,rd,128
        stvxl   v13,0,rt
        stvxl   v14,c16,rt
        stvxl   v15,c32,rt
        stvxl   v16,c48,rt
        addi    rt,rd,64
        stvxl   v17,0,rd
        stvxl   v18,c16,rd
        stvxl   v19,c32,rd
        stvxl   v20,c48,rd
        addi    rd,rd,128
        stvxl   v21,0,rt
        stvxl   v22,c16,rt
        stvxl   v23,c32,rt
        stvxl   v24,c48,rt
        addi    rt,rd,64
        stvxl   v25,0,rd
        stvxl   v26,c16,rd
        stvxl   v27,c32,rd
        stvxl   v28,c48,rd
        addi    rd,rd,128
        stvxl   v29,0,rt
        stvxl   v30,c16,rt
        stvxl   v31,c32,rt
        stvxl   v0,c48,rt
        bge++   LalignedLoop            // loop if another 512 bytes to go

// End of aligned main loop.  Handle up to 511 leftover bytes.

        srwi.   r0,rc,5                 // get count of 32-byte chunks remaining
        beq     Ldone                   // none
        rlwinm  rc,rc,0,0x1F            // mask count down to 0..31 leftover bytes
        mtctr   r0
1:                                      // loop over 32-byte chunks
        lvx     v1,0,rs
        lvx     v2,c16,rs
        addi    rs,rs,32
        stvx    v1,0,rd
        stvx    v2,c16,rd
        addi    rd,rd,32
        bdnz    1b


// Done, except for 0..31 leftovers at end.  Restore non-volatiles.
//    rs = source ptr
//    rd = dest ptr
//    rc = count (0..31)
//    rv = caller's vrsave

Ldone:
        cmpwi   rc,0                    // any leftover bytes?
        lwz     r13,rzR13(r1)           // restore non-volatiles from redzone
        lwz     r14,rzR14(r1)
        lwz     r15,rzR15(r1)
        li      r0,rzV20
        lvx     v20,r1,r0
        li      r0,rzV21
        lvx     v21,r1,r0
        li      r0,rzV22
        lvx     v22,r1,r0
        li      r0,rzV23
        lvx     v23,r1,r0
        li      r0,rzV24
        lvx     v24,r1,r0
        li      r0,rzV25
        lvx     v25,r1,r0
        li      r0,rzV26
        lvx     v26,r1,r0
        li      r0,rzV27
        lvx     v27,r1,r0
        li      r0,rzV28
        lvx     v28,r1,r0
        li      r0,rzV29
        lvx     v29,r1,r0
        li      r0,rzV30
        lvx     v30,r1,r0
        li      r0,rzV31
        lvx     v31,r1,r0
        mtspr   vrsave,rv               // restore caller's bitmask
        beqlr                           // done if no leftover bytes


// Handle 1..31 leftover bytes at end.

        mtctr   rc                      // set up loop count
        b       1f

        .align  5
1:
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        bdnz    1b

        blr


        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,0)      // load on all machines for now
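
// (The descriptor above appears to register this routine for placement on
// the commpage at _COMM_PAGE_BIGCOPY; the zero masks impose no CPU-capability
// requirements, matching the "load on all machines" note.)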