/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove().
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth:
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = constant 16 ("c16")
 *   r7  = constant 32 ("c32")
 *   r8  = constant 48 ("c48")
 *   r9  = constant 128 ("c128")
 *   r10 = vrsave ("rv")
 *   r11 = constant 256 ("c256")
 *   r12 = destination ptr ("rd")
 *   r13 = constant 384 ("c384")
 *   r14 = temp ("rx")
 *   r15 = temp ("rt")
 */
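// Illustrative sketch of the dcbz128/cr7 protocol described in strategy 1 above.
// It simply mirrors the sequence used in both copy loops below, and is shown here
// only to make the kernel interface concrete:
//
//      cmpw    cr7,r0,r0       // set cr7 "beq" to request dcbz128 of destination lines
//      ...
//      bne--   cr7,1f          // kernel cleared cr7 (dest uncached): skip the dcbz's
//      dcbz128 0,rd            // establish a 128-byte line without reading memory
//      bne--   cr7,1f          // re-check, in case the first dcbz128 was just emulated
//      dcbz128 c128,rd
// 1: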
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10
#define rx      r14
#define rt      r15

#define c16     r6
#define c32     r7
#define c48     r8
#define c128    r9
#define c256    r11
#define c384    r13

// Offsets within the "red zone" (which is 224 bytes long):
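// (The red zone lies just below the stack pointer, so the GPRs and VRs spilled at
// these negative offsets from r1 need no explicit stack frame.)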

#define rzR13   -8
#define rzR14   -12
#define rzR15   -16
#define rzV20   -32
#define rzV21   -48
#define rzV22   -64
#define rzV23   -80
#define rzV24   -96
#define rzV25   -112
#define rzV26   -128
#define rzV27   -144
#define rzV28   -160
#define rzV29   -176
#define rzV30   -192
#define rzV31   -208


#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl  EXT(bigcopy_970)


// Entry point.  This is a subroutine of bcopy().  When called:
//      r4  = source ptr (aka "rs")
//      r12 = dest ptr (aka "rd")
//      r5  = length (>= 16K bytes) (aka "rc")
//
// We only do "forward" moves, ie non-overlapping or toward 0.
//
// We return with non-volatiles and r3 preserved.

        .align  5
bigcopy_970:
        stw     r13,rzR13(r1)       // spill non-volatile regs we use to redzone
        stw     r14,rzR14(r1)
        stw     r15,rzR15(r1)
        li      r0,rzV20
        neg     rt,rd               // start to cache-line-align destination
        stvx    v20,r1,r0           // we use all 32 VRs
        li      r0,rzV21
        stvx    v21,r1,r0
        li      r0,rzV22
        stvx    v22,r1,r0
        li      r0,rzV23
        stvx    v23,r1,r0
        li      r0,rzV24
        andi.   rt,rt,127           // get #bytes to 128-byte align
        stvx    v24,r1,r0
        li      r0,rzV25
        stvx    v25,r1,r0
        li      r0,rzV26
        sub     rc,rc,rt            // adjust length by #bytes to align destination
        stvx    v26,r1,r0
        li      r0,rzV27
        stvx    v27,r1,r0
        li      r0,rzV28
        mtctr   rt                  // #bytes to align destination
        stvx    v28,r1,r0
        li      r0,rzV29
        stvx    v29,r1,r0
        li      r0,rzV30
        stvx    v30,r1,r0
        li      r0,rzV31
        stvx    v31,r1,r0
        beq     2f                  // dest already 128-byte aligned
        b       1f


// Cache-line-align destination.

        .align  5
1:
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        bdnz    1b


// Is source 16-byte aligned?  Load constant offsets.

2:
        andi.   r0,rs,15            // check source alignment
        mfspr   rv,vrsave           // save caller's bitmask
        li      r0,-1               // we use all 32 VRs
        li      c16,16              // load the constant offsets for x-form ops
        li      c32,32
        li      c48,48
        li      c128,128
        li      c256,256
        li      c384,384
        mtspr   vrsave,r0

// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only if cr7 beq is set.  We check to be sure the dcbz's
// won't zero source bytes before we load them, since we zero before
// loading as this is faster than zeroing after loading and before storing.

        cmpw    cr7,r0,r0           // initialize cr7 beq to use dcbz128
        sub     rt,rs,rd            // get (rs-rd)
        cmplwi  cr1,rt,512          // are we moving down less than 512 bytes?

// Start fetching in source cache lines.

        dcbt    c128,rs             // first line already touched in
        dcbt    c256,rs
        dcbt    c384,rs

        bge++   cr1,3f              // skip if not moving down less than 512 bytes
        cmpw    cr7,c16,c32         // cannot dcbz since it would zero source bytes
3:
        beq     LalignedLoop        // handle aligned sources
        lvsl    v0,0,rs             // get permute vector for left shift
        lvxl    v1,0,rs             // prime the loop
        b       LunalignedLoop      // enter unaligned loop


// Main loop for unaligned operands.  We loop over 384-byte chunks (3 cache lines)
// since we need a few VRs for permuted destination QWs and the permute vector.
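// Each pass enters with v1 holding the first source quadword of the chunk, loads the
// remaining 23 quadwords plus the first quadword of the next chunk, merges adjacent
// quadwords into alignment with vperm, and stores 24 quadwords (384 bytes).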

        .align  5
LunalignedLoop:
        subi    rc,rc,384           // decrement byte count
        addi    rx,rs,384           // get address of next chunk
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd                // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f              // catch it first time through
        dcbz128 c128,rd
        dcbz128 c256,rd
1:
        addi    rt,rs,64
        dcbt    0,rx                // touch in next chunk
        dcbt    c128,rx
        dcbt    c256,rx
        lvxl    v4,c48,rs
        addi    rs,rs,128
        lvxl    v5,0,rt
        cmplwi  rc,384              // another chunk to go?
        lvxl    v6,c16,rt
        lvxl    v7,c32,rt
        lvxl    v8,c48,rt
        addi    rt,rs,64
        vperm   v25,v1,v2,v0
        lvxl    v9,0,rs
        lvxl    v10,c16,rs
        vperm   v26,v2,v3,v0
        lvxl    v11,c32,rs
        lvxl    v12,c48,rs
        vperm   v27,v3,v4,v0
        addi    rs,rs,128
        lvxl    v13,0,rt
        lvxl    v14,c16,rt
        vperm   v28,v4,v5,v0
        lvxl    v15,c32,rt
        lvxl    v16,c48,rt
        vperm   v29,v5,v6,v0
        addi    rt,rs,64
        lvxl    v17,0,rs
        lvxl    v18,c16,rs
        vperm   v30,v6,v7,v0
        lvxl    v19,c32,rs
        lvxl    v20,c48,rs
        vperm   v31,v7,v8,v0
        addi    rs,rs,128
        lvxl    v21,0,rt
        lvxl    v22,c16,rt
        vperm   v2,v8,v9,v0
        lvxl    v23,c32,rt
        lvxl    v24,c48,rt
        vperm   v3,v9,v10,v0
        lvx     v1,0,rs             // get 1st qw of next chunk
        vperm   v4,v10,v11,v0

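// Store the chunk: 24 permuted quadwords (384 bytes) go to the 128-byte-aligned
// destination, with the remaining vperm's interleaved among the stores.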
        addi    rt,rd,64
        stvxl   v25,0,rd
        stvxl   v26,c16,rd
        vperm   v5,v11,v12,v0
        stvxl   v27,c32,rd
        stvxl   v28,c48,rd
        vperm   v6,v12,v13,v0
        addi    rd,rd,128
        stvxl   v29,0,rt
        stvxl   v30,c16,rt
        vperm   v7,v13,v14,v0
        stvxl   v31,c32,rt
        stvxl   v2,c48,rt
        vperm   v8,v14,v15,v0
        addi    rt,rd,64
        stvxl   v3,0,rd
        stvxl   v4,c16,rd
        vperm   v9,v15,v16,v0
        stvxl   v5,c32,rd
        stvxl   v6,c48,rd
        vperm   v10,v16,v17,v0
        addi    rd,rd,128
        stvxl   v7,0,rt
        vperm   v11,v17,v18,v0
        stvxl   v8,c16,rt
        stvxl   v9,c32,rt
        vperm   v12,v18,v19,v0
        stvxl   v10,c48,rt
        addi    rt,rd,64
        vperm   v13,v19,v20,v0
        stvxl   v11,0,rd
        stvxl   v12,c16,rd
        vperm   v14,v20,v21,v0
        stvxl   v13,c32,rd
        vperm   v15,v21,v22,v0
        stvxl   v14,c48,rd
        vperm   v16,v22,v23,v0
        addi    rd,rd,128
        stvxl   v15,0,rt
        vperm   v17,v23,v24,v0
        stvxl   v16,c16,rt
        vperm   v18,v24,v1,v0
        stvxl   v17,c32,rt
        stvxl   v18,c48,rt
        bge++   LunalignedLoop      // loop if another 384 bytes to go

// End of unaligned main loop.  Handle up to 383 leftover bytes.

        srwi.   r0,rc,5             // get count of 32-byte chunks remaining
        beq     Ldone               // none
        rlwinm  rc,rc,0,0x1F        // mask count down to 0..31 leftover bytes
        mtctr   r0
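// (v1 still holds the quadword at 0(rs), carried out of the main loop, so each pass
// only needs to load the next two quadwords to keep the permute window full.)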
1:                                  // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   v8,v1,v2,v0
        vperm   v9,v2,v3,v0
        vor     v1,v3,v3            // v1 <- v3
        stvx    v8,0,rd
        stvx    v9,c16,rd
        addi    rd,rd,32
        bdnz    1b

        b       Ldone


// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 512-byte chunks (4 cache lines.)

        .align  5
LalignedLoop:
        subi    rc,rc,512           // decrement count
        addi    rx,rs,512           // address of next chunk
        lvxl    v1,0,rs
        lvxl    v2,c16,rs
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd                // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f              // catch it first time through
        dcbz128 c128,rd
        dcbz128 c256,rd
        dcbz128 c384,rd
1:
        addi    rt,rs,64
        dcbt    0,rx                // touch in next chunk
        dcbt    c128,rx
        dcbt    c256,rx
        dcbt    c384,rx
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        addi    rs,rs,128
        lvxl    v5,0,rt
        cmplwi  rc,512              // another chunk to go?
        lvxl    v6,c16,rt
        lvxl    v7,c32,rt
        lvxl    v8,c48,rt
        addi    rt,rs,64
        lvxl    v9,0,rs
        lvxl    v10,c16,rs
        lvxl    v11,c32,rs
        lvxl    v12,c48,rs
        addi    rs,rs,128
        lvxl    v13,0,rt
        lvxl    v14,c16,rt
        lvxl    v15,c32,rt
        lvxl    v16,c48,rt
        addi    rt,rs,64
        lvxl    v17,0,rs
        lvxl    v18,c16,rs
        lvxl    v19,c32,rs
        lvxl    v20,c48,rs
        addi    rs,rs,128
        lvxl    v21,0,rt
        lvxl    v22,c16,rt
        lvxl    v23,c32,rt
        lvxl    v24,c48,rt
        addi    rt,rs,64
        lvxl    v25,0,rs
        lvxl    v26,c16,rs
        lvxl    v27,c32,rs
        lvxl    v28,c48,rs
        addi    rs,rs,128
        lvxl    v29,0,rt
        lvxl    v30,c16,rt
        lvxl    v31,c32,rt
        lvxl    v0,c48,rt

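// Store the 32 quadwords (512 bytes) just loaded to the 128-byte-aligned destination.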
        addi    rt,rd,64
        stvxl   v1,0,rd
        stvxl   v2,c16,rd
        stvxl   v3,c32,rd
        stvxl   v4,c48,rd
        addi    rd,rd,128
        stvxl   v5,0,rt
        stvxl   v6,c16,rt
        stvxl   v7,c32,rt
        stvxl   v8,c48,rt
        addi    rt,rd,64
        stvxl   v9,0,rd
        stvxl   v10,c16,rd
        stvxl   v11,c32,rd
        stvxl   v12,c48,rd
        addi    rd,rd,128
        stvxl   v13,0,rt
        stvxl   v14,c16,rt
        stvxl   v15,c32,rt
        stvxl   v16,c48,rt
        addi    rt,rd,64
        stvxl   v17,0,rd
        stvxl   v18,c16,rd
        stvxl   v19,c32,rd
        stvxl   v20,c48,rd
        addi    rd,rd,128
        stvxl   v21,0,rt
        stvxl   v22,c16,rt
        stvxl   v23,c32,rt
        stvxl   v24,c48,rt
        addi    rt,rd,64
        stvxl   v25,0,rd
        stvxl   v26,c16,rd
        stvxl   v27,c32,rd
        stvxl   v28,c48,rd
        addi    rd,rd,128
        stvxl   v29,0,rt
        stvxl   v30,c16,rt
        stvxl   v31,c32,rt
        stvxl   v0,c48,rt
        bge++   LalignedLoop        // loop if another 512 bytes to go

// End of aligned main loop.  Handle up to 511 leftover bytes.

        srwi.   r0,rc,5             // get count of 32-byte chunks remaining
        beq     Ldone               // none
        rlwinm  rc,rc,0,0x1F        // mask count down to 0..31 leftover bytes
        mtctr   r0
1:                                  // loop over 32-byte chunks
        lvx     v1,0,rs
        lvx     v2,c16,rs
        addi    rs,rs,32
        stvx    v1,0,rd
        stvx    v2,c16,rd
        addi    rd,rd,32
        bdnz    1b


// Done, except for 0..31 leftovers at end.  Restore non-volatiles.
//      rs = source ptr
//      rd = dest ptr
//      rc = count (0..31)
//      rv = caller's vrsave

Ldone:
        cmpwi   rc,0                // any leftover bytes?
        lwz     r13,rzR13(r1)       // restore non-volatiles from redzone
        lwz     r14,rzR14(r1)
        lwz     r15,rzR15(r1)
        li      r0,rzV20
        lvx     v20,r1,r0
        li      r0,rzV21
        lvx     v21,r1,r0
        li      r0,rzV22
        lvx     v22,r1,r0
        li      r0,rzV23
        lvx     v23,r1,r0
        li      r0,rzV24
        lvx     v24,r1,r0
        li      r0,rzV25
        lvx     v25,r1,r0
        li      r0,rzV26
        lvx     v26,r1,r0
        li      r0,rzV27
        lvx     v27,r1,r0
        li      r0,rzV28
        lvx     v28,r1,r0
        li      r0,rzV29
        lvx     v29,r1,r0
        li      r0,rzV30
        lvx     v30,r1,r0
        li      r0,rzV31
        lvx     v31,r1,r0
        mtspr   vrsave,rv           // restore caller's bitmask
        beqlr                       // done if no leftover bytes


// Handle 1..31 leftover bytes at end.

        mtctr   rc                  // set up loop count
        b       1f

        .align  5
1:
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        bdnz    1b

        blr


COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,0)      // load on all machines for now