/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 6/11/2003, tuned for the IBM 970. This is for operands at
 * least several pages long. It is called from bcopy()/memcpy()/memmove().
 *
 * We use the following additional strategies not used by the shorter
 * operand paths. Mostly, we try to optimize for memory bandwidth:
 *  1. Use DCBZ128 to avoid reading destination lines. Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached. The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage. Thus we take at most one exception per call,
 *     which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *  3. Touch in one source chunk ahead with DCBT. This is probably the
 *     least important change, and probably only helps restart the
 *     hardware stream at the start of each source page.
 *
 * Register usage. Note the rather delicate way we assign multiple uses
 * to the same register. Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = constant 16 ("c16")
 *   r7  = constant 32 ("c32")
 *   r8  = constant 48 ("c48")
 *   r9  = constant 128 ("c128")
 *   r10 = vrsave ("rv")
 *   r11 = constant 256 ("c256")
 *   r12 = destination ptr ("rd")
 *   r13 = constant 384 ("c384")
 *   r14 = temp ("rx")
 *   r15 = temp ("rt")
 */
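// (DCBZ128 zeroes an entire 128-byte cache line, which is why the prologue
// below byte-copies until the destination is 128-byte aligned before the
// vector loops start; each dcbz128 then covers exactly the bytes about to
// be stored.)
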
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10
#define rx      r14
#define rt      r15

#define c16     r6
#define c32     r7
#define c48     r8
#define c128    r9
#define c256    r11
#define c384    r13

// Offsets within the "red zone" (which is 224 bytes long):

#define rzR13   -8
#define rzR14   -12
#define rzR15   -16
#define rzV20   -32
#define rzV21   -48
#define rzV22   -64
#define rzV23   -80
#define rzV24   -96
#define rzV25   -112
#define rzV26   -128
#define rzV27   -144
#define rzV28   -160
#define rzV29   -176
#define rzV30   -192
#define rzV31   -208
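// (These are negative displacements from the stack pointer in r1: the
// non-volatile GPRs and VRs used below are spilled into the red zone,
// so this routine never has to allocate a stack frame of its own.)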


#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl  EXT(bigcopy_970)


// Entry point. This is a subroutine of bcopy(). When called:
//   r4 = source ptr (aka "rs")
//   r12 = dest ptr (aka "rd")
//   r5 = length (>= 16K bytes) (aka "rc")
//
// We only do "forward" moves, ie non-overlapping or toward 0.
//
// We return with non-volatiles and r3 preserved.

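// Broadly, the code below proceeds in four phases: copy single bytes until
// the destination is 128-byte aligned, run a vector main loop (an aligned or
// an unaligned variant, chosen by source alignment), copy any remaining
// 32-byte chunks, and finish the last 0..31 bytes one byte at a time.
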
        .align  5
bigcopy_970:
        stw     r13,rzR13(r1)       // spill non-volatile regs we use to redzone
        stw     r14,rzR14(r1)
        stw     r15,rzR15(r1)
        li      r0,rzV20
        neg     rt,rd               // start to cache-line-align destination
        stvx    v20,r1,r0           // we use all 32 VRs
        li      r0,rzV21
        stvx    v21,r1,r0
        li      r0,rzV22
        stvx    v22,r1,r0
        li      r0,rzV23
        stvx    v23,r1,r0
        li      r0,rzV24
        andi.   rt,rt,127           // get #bytes to 128-byte align
        stvx    v24,r1,r0
        li      r0,rzV25
        stvx    v25,r1,r0
        li      r0,rzV26
        sub     rc,rc,rt            // adjust length by #bytes to align destination
        stvx    v26,r1,r0
        li      r0,rzV27
        stvx    v27,r1,r0
        li      r0,rzV28
        mtctr   rt                  // #bytes to align destination
        stvx    v28,r1,r0
        li      r0,rzV29
        stvx    v29,r1,r0
        li      r0,rzV30
        stvx    v30,r1,r0
        li      r0,rzV31
        stvx    v31,r1,r0
        beq     2f                  // dest already 128-byte aligned
        b       1f


// Cache-line-align destination.

        .align  5
1:
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        bdnz    1b


// Is source 16-byte aligned? Load constant offsets.

2:
        andi.   r0,rs,15            // check source alignment
        mfspr   rv,vrsave           // save caller's bitmask
        li      r0,-1               // we use all 32 VRs
        li      c16,16              // load the constant offsets for x-form ops
        li      c32,32
        li      c48,48
        li      c128,128
        li      c256,256
        li      c384,384
        mtspr   vrsave,r0

// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only if cr7 beq is set. We check to be sure the dcbz's
// won't zero source bytes before we load them, since we zero before
// loading as this is faster than zeroing after loading and before storing.
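// (cmpw cr7,r0,r0 below always sets the cr7 "eq" bit, arming the dcbz128
// path; cmpw cr7,c16,c32 always clears it, since 16 != 32, disarming it
// when the source lies within 512 bytes above the destination.)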

        cmpw    cr7,r0,r0           // initialize cr7 beq to use dcbz128
        sub     rt,rs,rd            // get (rs-rd)
        cmplwi  cr1,rt,512          // are we moving down less than 512 bytes?

// Start fetching in source cache lines.

        dcbt    c128,rs             // first line already touched in
        dcbt    c256,rs
        dcbt    c384,rs

        bge++   cr1,3f              // skip unless dest is within 512 bytes below source
        cmpw    cr7,c16,c32         // cannot dcbz since it would zero source bytes
3:
        beq     LalignedLoop        // handle aligned sources
        lvsl    v0,0,rs             // get permute vector for left shift
        lvxl    v1,0,rs             // prime the loop
        b       LunalignedLoop      // enter unaligned loop


// Main loop for unaligned operands. We loop over 384-byte chunks (3 cache lines)
// since we need a few VRs for permuted destination QWs and the permute vector.
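//
// The realignment itself is done with the standard lvsl/vperm idiom: lvsl
// builds a permute control vector from the low four bits of the source
// address, and each vperm below merges two consecutive misaligned source
// quadwords into one aligned quadword ready to be stored.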

        .align  5
LunalignedLoop:
        subi    rc,rc,384           // decrement byte count
        addi    rx,rs,384           // get address of next chunk
        lvxl    v2,c16,rs
        lvxl    v3,c32,rs
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd                // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f              // catch it first time through
        dcbz128 c128,rd
        dcbz128 c256,rd
1:
        addi    rt,rs,64
        dcbt    0,rx                // touch in next chunk
        dcbt    c128,rx
        dcbt    c256,rx
        lvxl    v4,c48,rs
        addi    rs,rs,128
        lvxl    v5,0,rt
        cmplwi  rc,384              // another chunk to go?
        lvxl    v6,c16,rt
        lvxl    v7,c32,rt
        lvxl    v8,c48,rt
        addi    rt,rs,64
        vperm   v25,v1,v2,v0
        lvxl    v9,0,rs
        lvxl    v10,c16,rs
        vperm   v26,v2,v3,v0
        lvxl    v11,c32,rs
        lvxl    v12,c48,rs
        vperm   v27,v3,v4,v0
        addi    rs,rs,128
        lvxl    v13,0,rt
        lvxl    v14,c16,rt
        vperm   v28,v4,v5,v0
        lvxl    v15,c32,rt
        lvxl    v16,c48,rt
        vperm   v29,v5,v6,v0
        addi    rt,rs,64
        lvxl    v17,0,rs
        lvxl    v18,c16,rs
        vperm   v30,v6,v7,v0
        lvxl    v19,c32,rs
        lvxl    v20,c48,rs
        vperm   v31,v7,v8,v0
        addi    rs,rs,128
        lvxl    v21,0,rt
        lvxl    v22,c16,rt
        vperm   v2,v8,v9,v0
        lvxl    v23,c32,rt
        lvxl    v24,c48,rt
        vperm   v3,v9,v10,v0
        lvx     v1,0,rs             // get 1st qw of next chunk
        vperm   v4,v10,v11,v0

        addi    rt,rd,64
        stvxl   v25,0,rd
        stvxl   v26,c16,rd
        vperm   v5,v11,v12,v0
        stvxl   v27,c32,rd
        stvxl   v28,c48,rd
        vperm   v6,v12,v13,v0
        addi    rd,rd,128
        stvxl   v29,0,rt
        stvxl   v30,c16,rt
        vperm   v7,v13,v14,v0
        stvxl   v31,c32,rt
        stvxl   v2,c48,rt
        vperm   v8,v14,v15,v0
        addi    rt,rd,64
        stvxl   v3,0,rd
        stvxl   v4,c16,rd
        vperm   v9,v15,v16,v0
        stvxl   v5,c32,rd
        stvxl   v6,c48,rd
        vperm   v10,v16,v17,v0
        addi    rd,rd,128
        stvxl   v7,0,rt
        vperm   v11,v17,v18,v0
        stvxl   v8,c16,rt
        stvxl   v9,c32,rt
        vperm   v12,v18,v19,v0
        stvxl   v10,c48,rt
        addi    rt,rd,64
        vperm   v13,v19,v20,v0
        stvxl   v11,0,rd
        stvxl   v12,c16,rd
        vperm   v14,v20,v21,v0
        stvxl   v13,c32,rd
        vperm   v15,v21,v22,v0
        stvxl   v14,c48,rd
        vperm   v16,v22,v23,v0
        addi    rd,rd,128
        stvxl   v15,0,rt
        vperm   v17,v23,v24,v0
        stvxl   v16,c16,rt
        vperm   v18,v24,v1,v0
        stvxl   v17,c32,rt
        stvxl   v18,c48,rt
        bge++   LunalignedLoop      // loop if another 384 bytes to go

// End of unaligned main loop. Handle up to 383 leftover bytes.
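// Note that v1 still holds the first (not yet stored) quadword of the next
// chunk, so the cleanup loop below simply continues the permute stream.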

        srwi.   r0,rc,5             // get count of 32-byte chunks remaining
        beq     Ldone               // none
        rlwinm  rc,rc,0,0x1F        // mask count down to 0..31 leftover bytes
        mtctr   r0
1:                                  // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   v8,v1,v2,v0
        vperm   v9,v2,v3,v0
        vor     v1,v3,v3            // v1 <- v3
        stvx    v8,0,rd
        stvx    v9,c16,rd
        addi    rd,rd,32
        bdnz    1b

        b       Ldone


// Aligned loop. Destination is 128-byte aligned, and source is 16-byte
// aligned. Loop over 512-byte chunks (4 cache lines.)
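// No permutes are needed on this path, so all 32 VRs can be used to carry
// data, which is what lets each iteration move a full 512 bytes.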

        .align  5
LalignedLoop:
        subi    rc,rc,512           // decrement count
        addi    rx,rs,512           // address of next chunk
        lvxl    v1,0,rs
        lvxl    v2,c16,rs
        bne--   cr7,1f              // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd                // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f              // catch it first time through
        dcbz128 c128,rd
        dcbz128 c256,rd
        dcbz128 c384,rd
1:
        addi    rt,rs,64
        dcbt    0,rx                // touch in next chunk
        dcbt    c128,rx
        dcbt    c256,rx
        dcbt    c384,rx
        lvxl    v3,c32,rs
        lvxl    v4,c48,rs
        addi    rs,rs,128
        lvxl    v5,0,rt
        cmplwi  rc,512              // another chunk to go?
        lvxl    v6,c16,rt
        lvxl    v7,c32,rt
        lvxl    v8,c48,rt
        addi    rt,rs,64
        lvxl    v9,0,rs
        lvxl    v10,c16,rs
        lvxl    v11,c32,rs
        lvxl    v12,c48,rs
        addi    rs,rs,128
        lvxl    v13,0,rt
        lvxl    v14,c16,rt
        lvxl    v15,c32,rt
        lvxl    v16,c48,rt
        addi    rt,rs,64
        lvxl    v17,0,rs
        lvxl    v18,c16,rs
        lvxl    v19,c32,rs
        lvxl    v20,c48,rs
        addi    rs,rs,128
        lvxl    v21,0,rt
        lvxl    v22,c16,rt
        lvxl    v23,c32,rt
        lvxl    v24,c48,rt
        addi    rt,rs,64
        lvxl    v25,0,rs
        lvxl    v26,c16,rs
        lvxl    v27,c32,rs
        lvxl    v28,c48,rs
        addi    rs,rs,128
        lvxl    v29,0,rt
        lvxl    v30,c16,rt
        lvxl    v31,c32,rt
        lvxl    v0,c48,rt

        addi    rt,rd,64
        stvxl   v1,0,rd
        stvxl   v2,c16,rd
        stvxl   v3,c32,rd
        stvxl   v4,c48,rd
        addi    rd,rd,128
        stvxl   v5,0,rt
        stvxl   v6,c16,rt
        stvxl   v7,c32,rt
        stvxl   v8,c48,rt
        addi    rt,rd,64
        stvxl   v9,0,rd
        stvxl   v10,c16,rd
        stvxl   v11,c32,rd
        stvxl   v12,c48,rd
        addi    rd,rd,128
        stvxl   v13,0,rt
        stvxl   v14,c16,rt
        stvxl   v15,c32,rt
        stvxl   v16,c48,rt
        addi    rt,rd,64
        stvxl   v17,0,rd
        stvxl   v18,c16,rd
        stvxl   v19,c32,rd
        stvxl   v20,c48,rd
        addi    rd,rd,128
        stvxl   v21,0,rt
        stvxl   v22,c16,rt
        stvxl   v23,c32,rt
        stvxl   v24,c48,rt
        addi    rt,rd,64
        stvxl   v25,0,rd
        stvxl   v26,c16,rd
        stvxl   v27,c32,rd
        stvxl   v28,c48,rd
        addi    rd,rd,128
        stvxl   v29,0,rt
        stvxl   v30,c16,rt
        stvxl   v31,c32,rt
        stvxl   v0,c48,rt
        bge++   LalignedLoop        // loop if another 512 bytes to go

// End of aligned main loop. Handle up to 511 leftover bytes.

        srwi.   r0,rc,5             // get count of 32-byte chunks remaining
        beq     Ldone               // none
        rlwinm  rc,rc,0,0x1F        // mask count down to 0..31 leftover bytes
        mtctr   r0
1:                                  // loop over 32-byte chunks
        lvx     v1,0,rs
        lvx     v2,c16,rs
        addi    rs,rs,32
        stvx    v1,0,rd
        stvx    v2,c16,rd
        addi    rd,rd,32
        bdnz    1b


// Done, except for 0..31 leftovers at end. Restore non-volatiles.
//   rs = source ptr
//   rd = dest ptr
//   rc = count (0..31)
//   rv = caller's vrsave

Ldone:
        cmpwi   rc,0                // any leftover bytes?
        lwz     r13,rzR13(r1)       // restore non-volatiles from redzone
        lwz     r14,rzR14(r1)
        lwz     r15,rzR15(r1)
        li      r0,rzV20
        lvx     v20,r1,r0
        li      r0,rzV21
        lvx     v21,r1,r0
        li      r0,rzV22
        lvx     v22,r1,r0
        li      r0,rzV23
        lvx     v23,r1,r0
        li      r0,rzV24
        lvx     v24,r1,r0
        li      r0,rzV25
        lvx     v25,r1,r0
        li      r0,rzV26
        lvx     v26,r1,r0
        li      r0,rzV27
        lvx     v27,r1,r0
        li      r0,rzV28
        lvx     v28,r1,r0
        li      r0,rzV29
        lvx     v29,r1,r0
        li      r0,rzV30
        lvx     v30,r1,r0
        li      r0,rzV31
        lvx     v31,r1,r0
        mtspr   vrsave,rv           // restore caller's bitmask
        beqlr                       // done if no leftover bytes


// Handle 1..31 leftover bytes at end.

        mtctr   rc                  // set up loop count
        b       1f

        .align  5
1:
        lbz     r0,0(rs)
        addi    rs,rs,1
        stb     r0,0(rd)
        addi    rd,rd,1
        bdnz    1b

        blr


        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,0)  // load on all machines for now