2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
4 * @APPLE_LICENSE_HEADER_START@
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
20 * @APPLE_LICENSE_HEADER_END@
22 /* ====================================
23 * Very Long Operand BCOPY for Mac OS X
24 * ====================================
26 * Version of 6/11/2003, tuned for the IBM 970. This is for operands at
27 * least several pages long. It is called from bcopy()/memcpy()/memmove().
29 * We use the following additional strategies not used by the shorter
30 * operand paths. Mostly, we try to optimize for memory bandwidth:
31 * 1. Use DCBZ128 to avoid reading destination lines. Because this code
32 * resides on the commpage, it can use a private interface with the
33 * kernel to minimize alignment exceptions if the destination is
34 * uncached. The kernel will clear cr7 whenever it emulates a DCBZ or
35 * DCBZ128 on the commpage. Thus we take at most one exception per call,
36 * which is amortized across the very long operand.
37 * 2. Copy larger chunks per iteration to minimize R/W bus turnaround
38 * and maximize DRAM page locality (opening a new page is expensive.)
39 * 3. Touch in one source chunk ahead with DCBT. This is probably the
40 * least important change, and probably only helps restart the
41 * hardware stream at the start of each source page.
43 * Register usage. Note the rather delicate way we assign multiple uses
44 * to the same register. Beware.
45 * r0 = temp (NB: cannot use r0 for any constant such as "c16")
46 * r3 = not used, as memcpy and memmove return 1st parameter as a value
47 * r4 = source ptr ("rs")
48 * r5 = count of bytes to move ("rc")
49 * r6 = constant 16 ("c16")
50 * r7 = constant 32 ("c32")
51 * r8 = constant 48 ("c48")
52 * r9 = constant 128 ("c128")
54 * r11 = constant 256 ("c256")
55 * r12 = destination ptr ("rd")
56 * r13 = constant 384 ("c384")
74 // Offsets within the "red zone" (which is 224 bytes long):
93 #include <sys/appleapiopts.h>
95 #include <machine/cpu_capabilities.h>
96 #include <machine/commpage.h>
99 .globl EXT(bigcopy_970)

// NOTE(review): the leading integer on each line below is an extraction
// artifact (the original file's line numbers). Where those numbers are not
// consecutive, interior instructions have been elided from this excerpt, so
// this fragment will NOT assemble as-is — treat it as annotated reference
// only, and consult the full original before changing any instruction.

102 // Entry point. This is a subroutine of bcopy(). When called:
103 // r4 = source ptr (aka "rs")
104 // r12 = dest ptr (aka "rd")
105 // r5 = length (>= 16K bytes) (aka "rc")
107 // We only do "forward" moves, ie non-overlapping or toward 0.
109 // We return with non-volatiles and r3 preserved.
113 stw r13,rzR13(r1) // spill non-volatile regs we use to redzone
117 neg rt,rd // start to cache-line-align destination
118 stvx v20,r1,r0 // we use all 32 VRs
126 andi. rt,rt,127 // get #bytes to 128-byte align
131 sub rc,rc,rt // adjust length by #bytes to align destination
136 mtctr rt // #bytes to align destination
144 beq 2f // dest already 128-byte aligned
148 // Cache-line-align destination.
159 // Is source 16-byte aligned? Load constant offsets.
162 andi. r0,rs,15 // check source alignment
163 mfspr rv,vrsave // save caller's bitmask
164 li r0,-1 // we use all 32 VRs
165 li c16,16 // load the constant offsets for x-form ops
173 // NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage,
174 // and we dcbz only if cr7 beq is set. We check to be sure the dcbz's
175 // won't zero source bytes before we load them, since we zero before
176 // loading as this is faster than zeroing after loading and before storing.
178 cmpw cr7,r0,r0 // initialize cr7 beq to use dcbz128
179 sub rt,rs,rd // get (rs-rd)
180 cmplwi cr1,rt,512 // are we moving down less than 512 bytes?
182 // Start fetching in source cache lines.
184 dcbt c128,rs // first line already touched in
188 bge++ cr1,3f // skip if not moving down less than 512 bytes
189 cmpw cr7,c16,c32 // 16 != 32, so this forces cr7 "beq" off, disabling the dcbz's (they would zero source bytes)
191 beq LalignedLoop // handle aligned sources
192 lvsl v0,0,rs // get permute vector for left shift
193 lvxl v1,0,rs // prime the loop
194 b LunalignedLoop // enter unaligned loop
197 // Main loop for unaligned operands. We loop over 384-byte chunks (3 cache lines)
198 // since we need a few VRs for permuted destination QWs and the permute vector.
202 subi rc,rc,384 // decrement byte count
203 addi rx,rs,384 // get address of next chunk
206 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
207 dcbz128 0,rd // (also skip if moving down less than 512 bytes)
208 bne-- cr7,1f // re-test: catch the kernel clearing cr7 the first time a dcbz128 is emulated
213 dcbt 0,rx // touch in next chunk
219 cmplwi rc,384 // another chunk to go?
252 lvx v1,0,rs // get 1st qw of next chunk
299 bge++ LunalignedLoop // loop if another 384 bytes to go
301 // End of unaligned main loop. Handle up to 384 leftover bytes.
303 srwi. r0,rc,5 // get count of 32-byte chunks remaining
305 rlwinm rc,rc,0,0x1F // mask count down to 0..31 leftover bytes
307 1: // loop over 32-byte chunks
313 vor v1,v3,v3 // v1 <- v3
322 // Aligned loop. Destination is 128-byte aligned, and source is 16-byte
323 // aligned. Loop over 512-byte chunks (4 cache lines.)
327 subi rc,rc,512 // decrement count
328 addi rx,rs,512 // address of next chunk
331 bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel
332 dcbz128 0,rd // (also skip if moving down less than 512 bytes)
333 bne-- cr7,1f // re-test: catch the kernel clearing cr7 the first time a dcbz128 is emulated
339 dcbt 0,rx // touch in next chunk
347 cmplwi rc,512 // another chunk to go?
422 bge++ LalignedLoop // loop if another 512 bytes to go
424 // End of aligned main loop. Handle up to 511 leftover bytes.
426 srwi. r0,rc,5 // get count of 32-byte chunks remaining
428 rlwinm rc,rc,0,0x1F // mask count down to 0..31 leftover bytes
430 1: // loop over 32-byte chunks
440 // Done, except for 0..31 leftovers at end. Restore non-volatiles.
443 // rc = count (0..31)
444 // rv = caller's vrsave
447 cmpwi rc,0 // any leftover bytes?
448 lwz r13,rzR13(r1) // restore non-volatiles from redzone
475 mtspr vrsave,rv // restore caller's bitmask
476 beqlr // done if no leftover bytes (cr0 still set from the cmpwi above)
479 // Handle 1..31 leftover bytes at end.
481 mtctr rc // set up loop count

// Register the routine in the commpage jump table (flags 0,0,0: no special
// CPU-capability requirements — loaded on all machines for now).
495 COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,0) // load on all machines for now