/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

/* ====================================
 * Very Long Operand BCOPY for Mac OS X
 * ====================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.  This is for operands at
 * least several pages long.  It is called from bcopy()/memcpy()/memmove().
 *
 * We use the following additional strategies not used by the shorter
 * operand paths.  Mostly, we try to optimize for memory bandwidth
 * (a C sketch of the overall approach follows this list):
 *  1. Use DCBZ128 to avoid reading destination lines.  Because this code
 *     resides on the commpage, it can use a private interface with the
 *     kernel to minimize alignment exceptions if the destination is
 *     uncached.  The kernel will clear cr7 whenever it emulates a DCBZ or
 *     DCBZ128 on the commpage.  Thus we take at most one exception per
 *     call, which is amortized across the very long operand.
 *  2. Copy larger chunks per iteration to minimize R/W bus turnaround
 *     and maximize DRAM page locality (opening a new page is expensive.)
 *  3. Touch in one source chunk ahead with DCBT.  This is probably the
 *     least important optimization; it mainly helps restart the hardware
 *     prefetch stream at the start of each source page.
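 *
 * As a rough, illustrative-only C sketch of strategies 1-3 (not part of
 * the build; the function name bigcopy_sketch is hypothetical):
 * __builtin_prefetch stands in for DCBT, memcpy stands in for the vector
 * load/store sequence, and a comment marks where DCBZ128 would establish
 * the destination lines without reading them.
 *
 *      #include <stddef.h>
 *      #include <string.h>
 *
 *      static void bigcopy_sketch(void *dst, const void *src, size_t len)
 *      {
 *          unsigned char *d = dst;
 *          const unsigned char *s = src;
 *          while (len >= 512) {                    // one 4-line chunk per pass
 *              __builtin_prefetch(s + 512, 0, 0);  // touch in next source chunk (DCBT)
 *              // real code: DCBZ128 the four destination lines here, when safe
 *              memcpy(d, s, 512);                  // stand-in for the vector copy
 *              d += 512;  s += 512;  len -= 512;
 *          }
 *          if (len)
 *              memcpy(d, s, len);                  // leftovers
 *      }
 *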
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = constant 16 ("c16")
 *   r7  = constant 32 ("c32")
 *   r8  = constant 48 ("c48")
 *   r9  = constant 128 ("c128")
 *   r11 = constant 256 ("c256")
 *   r12 = destination ptr ("rd")
 *   r13 = constant 384 ("c384")
 */

// Offsets within the "red zone" (which is 224 bytes long):

#include <sys/appleapiopts.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .globl  EXT(bigcopy_970)

// Entry point.  This is a subroutine of bcopy().  When called:
//      r4  = source ptr (aka "rs")
//      r12 = dest ptr (aka "rd")
//      r5  = length (>= 16K bytes) (aka "rc")
//
// We only do "forward" moves, i.e. non-overlapping or moving toward address 0.
//
// We return with non-volatiles and r3 preserved.
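//
// In C terms, the forward-only restriction amounts to the caller having
// verified the predicate sketched below before choosing this path
// (illustrative only; forward_copy_is_safe is a hypothetical name):
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      // A low-to-high copy cannot overwrite unread source bytes unless the
//      // destination starts strictly inside the source buffer.
//      static int forward_copy_is_safe(const void *dst, const void *src, size_t len)
//      {
//          uintptr_t d = (uintptr_t)dst, s = (uintptr_t)src;
//          return d <= s || d >= s + len;
//      }
//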

        stw     r13,rzR13(r1)   // spill non-volatile regs we use to redzone
        neg     rt,rd           // start to cache-line-align destination
        stvx    v20,r1,r0       // we use all 32 VRs
        andi.   rt,rt,127       // get #bytes to 128-byte align
        sub     rc,rc,rt        // adjust length by #bytes to align destination
        mtctr   rt              // #bytes to align destination
        beq     2f              // dest already 128-byte aligned

// Cache-line-align destination.

// Is source 16-byte aligned?  Load constant offsets.

        andi.   r0,rs,15        // check source alignment
        mfspr   rv,vrsave       // save caller's bitmask
        li      r0,-1           // we use all 32 VRs
        li      c16,16          // load the constant offsets for x-form ops

// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
// and we dcbz only if cr7 beq is set.  We check to be sure the dcbz's
// won't zero source bytes before we load them; we zero before loading
// because that is faster than zeroing between the load and the store.

        cmpw    cr7,r0,r0       // initialize cr7 beq to use dcbz128
        sub     rt,rs,rd        // get (rs-rd)
        cmplwi  cr1,rt,512      // are we moving down less than 512 bytes?
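
// In C terms the sub/cmplwi pair above computes the predicate sketched
// below (illustrative only; dcbz_is_safe is a hypothetical name).  The
// unsigned compare means a destination above the source also counts as
// "not moving down by less than 512 bytes".
//
//      #include <stdint.h>
//
//      // dcbz128 on a destination chunk is unsafe only if it could clear
//      // source bytes that have not been loaded yet, i.e. if the destination
//      // lies less than one chunk (512 bytes) below the source.
//      static int dcbz_is_safe(uint32_t rs, uint32_t rd)
//      {
//          return (uint32_t)(rs - rd) >= 512;
//      }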

// Start fetching in source cache lines.

        dcbt    c128,rs         // first line already touched in
        bge++   cr1,3f          // skip if not moving down less than 512 bytes
        cmpw    cr7,c16,c32     // cannot dcbz since it would zero source bytes
        beq     LalignedLoop    // handle aligned sources
        lvsl    v0,0,rs         // get permute vector for left shift
        lvxl    v1,0,rs         // prime the loop
        b       LunalignedLoop  // enter unaligned loop

// Main loop for unaligned operands.  We loop over 384-byte chunks (3 cache
// lines) since we need a few VRs for permuted destination QWs and the
// permute vector.
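//
// At word granularity, the lvsl/vperm technique used here corresponds to the
// usual software-alignment trick sketched below (illustrative only;
// align_copy_word is a hypothetical name).  It assumes big-endian byte order,
// as on the 970, and a misalignment of 1..3 bytes; it performs only aligned
// loads and stores, reading n+1 aligned source words to produce n output words.
//
//      #include <stddef.h>
//      #include <stdint.h>
//
//      static void align_copy_word(uint32_t *dst, const uint32_t *src_aligned,
//                                  unsigned shift, size_t n)
//      {
//          uint32_t prev = src_aligned[0];                 // prime the loop
//          for (size_t i = 0; i < n; i++) {
//              uint32_t next = src_aligned[i + 1];
//              dst[i] = (prev << (8 * shift)) | (next >> (8 * (4 - shift)));
//              prev = next;
//          }
//      }
//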

        subi    rc,rc,384       // decrement byte count
        addi    rx,rs,384       // get address of next chunk
        bne--   cr7,1f          // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd            // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f          // catch it first time through
        dcbt    0,rx            // touch in next chunk
        cmplwi  rc,384          // another chunk to go?
        lvx     v1,0,rs         // get 1st qw of next chunk
        bge++   LunalignedLoop  // loop if another 384 bytes to go

// End of unaligned main loop.  Handle up to 383 leftover bytes.

        srwi.   r0,rc,5         // get count of 32-byte chunks remaining
        rlwinm  rc,rc,0,0x1F    // mask count down to 0..31 leftover bytes
1:                              // loop over 32-byte chunks
        vor     v1,v3,v3        // v1 <- v3
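
// The shift/mask pair above splits the remainder the same way as this C
// sketch (illustrative only; copy_tail is a hypothetical helper): drain the
// 32-byte (two-quadword) pieces, then finish byte by byte.
//
//      #include <stddef.h>
//      #include <string.h>
//
//      static void copy_tail(unsigned char *d, const unsigned char *s, size_t remaining)
//      {
//          size_t chunks32 = remaining >> 5;     // like srwi.  r0,rc,5
//          remaining &= 31;                      // like rlwinm rc,rc,0,0x1F
//          for (; chunks32; chunks32--, d += 32, s += 32)
//              memcpy(d, s, 32);                 // two quadword loads/stores
//          while (remaining--)
//              *d++ = *s++;                      // final 0..31 bytes
//      }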

// Aligned loop.  Destination is 128-byte aligned, and source is 16-byte
// aligned.  Loop over 512-byte chunks (4 cache lines.)
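//
// The count bookkeeping matches the C shape below (illustrative only;
// aligned_chunks is a hypothetical name, and memcpy stands in for the vector
// copy): the chunk size is subtracted at the top of the loop, and the loop
// repeats while at least one more whole chunk remains, so it exits with
// 0..511 bytes left for the leftover code further down.  It assumes at least
// 512 bytes remain on entry, which the preceding code guarantees here.
//
//      #include <stddef.h>
//      #include <string.h>
//
//      static size_t aligned_chunks(unsigned char *d, const unsigned char *s,
//                                   size_t remaining)
//      {
//          do {
//              remaining -= 512;               // subi   rc,rc,512
//              memcpy(d, s, 512);
//              d += 512;  s += 512;
//          } while (remaining >= 512);         // cmplwi rc,512 ; bge++ LalignedLoop
//          return remaining;                   // 0..511 bytes left over
//      }
//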

        subi    rc,rc,512       // decrement count
        addi    rx,rs,512       // address of next chunk
        bne--   cr7,1f          // skip dcbz's if cr7 beq has been turned off by kernel
        dcbz128 0,rd            // (also skip if moving down less than 512 bytes)
        bne--   cr7,1f          // catch it first time through
        dcbt    0,rx            // touch in next chunk
        cmplwi  rc,512          // another chunk to go?
        bge++   LalignedLoop    // loop if another 512 bytes to go

// End of aligned main loop.  Handle up to 511 leftover bytes.

        srwi.   r0,rc,5         // get count of 32-byte chunks remaining
        rlwinm  rc,rc,0,0x1F    // mask count down to 0..31 leftover bytes
1:                              // loop over 32-byte chunks

// Done, except for 0..31 leftovers at end.  Restore non-volatiles.
//      rc = count (0..31)
//      rv = caller's vrsave

        cmpwi   rc,0            // any leftover bytes?
        lwz     r13,rzR13(r1)   // restore non-volatiles from redzone
        mtspr   vrsave,rv       // restore caller's bitmask
        beqlr                   // done if no leftover bytes

// Handle 1..31 leftover bytes at end.

        mtctr   rc              // set up loop count

        COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,0)  // load on all machines for now