X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/d7e50217d7adf6e52786a38bcaa4cd698cb9a79e..4a3eedf9ecc9bbe3f3a5c6ce5e53ad199d639d32:/osfmk/ppc/commpage/bigcopy_970.s?ds=inline diff --git a/osfmk/ppc/commpage/bigcopy_970.s b/osfmk/ppc/commpage/bigcopy_970.s index fa9e1245a..add093ea3 100644 --- a/osfmk/ppc/commpage/bigcopy_970.s +++ b/osfmk/ppc/commpage/bigcopy_970.s @@ -1,16 +1,19 @@ /* * Copyright (c) 2003 Apple Computer, Inc. All rights reserved. * - * @APPLE_LICENSE_HEADER_START@ - * - * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved. + * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in - * compliance with the License. Please obtain a copy of the License at - * http://www.opensource.apple.com/apsl/ and read it before using this - * file. + * compliance with the License. The rights granted to you under the License + * may not be used to create, or enable the creation or redistribution of, + * unlawful or unlicensed copies of an Apple operating system, or to + * circumvent, violate, or enable the circumvention or violation of, any + * terms of an Apple operating system software license agreement. + * + * Please obtain a copy of the License at + * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER @@ -20,14 +23,15 @@ * Please see the License for the specific language governing rights and * limitations under the License. * - * @APPLE_LICENSE_HEADER_END@ + * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ /* ==================================== * Very Long Operand BCOPY for Mac OS X * ==================================== * - * Version of 6/11/2003, tuned for the IBM 970. This is for operands at - * least several pages long. It is called from bcopy()/memcpy()/memmove(). + * Version of 2/21/2004, tuned for the IBM 970. This is for operands at + * least several pages long. It is called from bcopy()/memcpy()/memmove(), + * and runs both in 32 and 64-bit mode. * * We use the following additional strategies not used by the shorter * operand paths. Mostly, we try to optimize for memory bandwidth: @@ -39,58 +43,40 @@ * which is amortized across the very long operand. * 2. Copy larger chunks per iteration to minimize R/W bus turnaround * and maximize DRAM page locality (opening a new page is expensive.) + * We use 256-byte chunks. * 3. Touch in one source chunk ahead with DCBT. This is probably the * least important change, and probably only helps restart the * hardware stream at the start of each source page. - * - * Register usage. Note the rather delicate way we assign multiple uses - * to the same register. Beware. 
- * r0 = temp (NB: cannot use r0 for any constant such as "c16") - * r3 = not used, as memcpy and memmove return 1st parameter as a value - * r4 = source ptr ("rs") - * r5 = count of bytes to move ("rc") - * r6 = constant 16 ("c16") - * r7 = constant 32 (""c32") - * r8 = constant 48 (""c48") - * r9 = constant 128 (""c128") - * r10 = vrsave ("rv") - * r11 = constant 256 (""c256") - * r12 = destination ptr ("rd") - * r13 = constant 384 (""c384") - * r14 = temp ("rx") - * r15 = temp ("rt") */ -#define rs r4 -#define rd r12 -#define rc r5 -#define rv r10 -#define rx r14 -#define rt r15 - -#define c16 r6 -#define c32 r7 -#define c48 r8 -#define c128 r9 -#define c256 r11 -#define c384 r13 + +#define rs r13 +#define rd r14 +#define rc r15 +#define rx r16 + +#define c16 r3 +#define c32 r4 +#define c48 r5 +#define c64 r6 +#define c80 r7 +#define c96 r8 +#define c112 r9 +#define c256 r10 +#define c384 r11 +#define rv r12 // vrsave // Offsets within the "red zone" (which is 224 bytes long): -#define rzR13 -8 -#define rzR14 -12 -#define rzR15 -16 -#define rzV20 -32 -#define rzV21 -48 -#define rzV22 -64 -#define rzV23 -80 -#define rzV24 -96 -#define rzV25 -112 -#define rzV26 -128 -#define rzV27 -144 -#define rzV28 -160 -#define rzV29 -176 -#define rzV30 -192 -#define rzV31 -208 +#define rzR3 -8 +#define rzR13 -16 +#define rzR14 -24 +#define rzR15 -32 +#define rzR16 -40 + +#define rzV20 -64 +#define rzV21 -80 +#define rzV22 -96 +#define rzV23 -112 #include @@ -99,401 +85,247 @@ #include .text - .globl EXT(bigcopy_970) - +/* + * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary + * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following + * simple transformations: + * - all word compares are changed to doubleword + * - all "srwi[.]" opcodes are changed to "srdi[.]" + * Nothing else is done. For this to work, the following rules must be + * carefully followed: + * - do not use carry or overflow + * - only use record mode if you are sure the results are mode-invariant + * for example, all "andi." and almost all "rlwinm." are fine + * - do not use "slwi", "slw", or "srw" + * An imaginative programmer could break the porting model in other ways, but the above + * are the most likely problem areas. It is perhaps surprising how well in practice + * this simple method works. + */ // Entry point. This is a subroutine of bcopy(). When called: -// r4 = source ptr (aka "rs") -// r12 = dest ptr (aka "rd") -// r5 = length (>= 16K bytes) (aka "rc") +// r0 = return address (also stored in caller's SF) +// r4 = source ptr +// r5 = length (at least several pages) +// r12 = dest ptr // -// We only do "forward" moves, ie non-overlapping or toward 0. -// -// We return with non-volatiles and r3 preserved. +// We only do "forward" moves, ie non-overlapping or toward 0. We return with non-volatiles +// and r3 preserved. .align 5 bigcopy_970: - stw r13,rzR13(r1) // spill non-volatile regs we use to redzone - stw r14,rzR14(r1) - stw r15,rzR15(r1) - li r0,rzV20 - neg rt,rd // start to cache-line-align destination - stvx v20,r1,r0 // we use all 32 VRs - li r0,rzV21 - stvx v21,r1,r0 - li r0,rzV22 - stvx v22,r1,r0 - li r0,rzV23 - stvx v23,r1,r0 - li r0,rzV24 - andi. 
rt,rt,127 // get #bytes to 128-byte align - stvx v24,r1,r0 - li r0,rzV25 - stvx v25,r1,r0 - li r0,rzV26 - sub rc,rc,rt // adjust length by #bytes to align destination - stvx v26,r1,r0 - li r0,rzV27 - stvx v27,r1,r0 - li r0,rzV28 - mtctr rt // #bytes to align destination - stvx v28,r1,r0 - li r0,rzV29 - stvx v29,r1,r0 - li r0,rzV30 - stvx v30,r1,r0 - li r0,rzV31 - stvx v31,r1,r0 - beq 2f // dest already 128-byte aligned - b 1f - + neg r2,r12 // is destination cache-line-aligned? + std r3,rzR3(r1) // save caller's r3, which must be preserved for memcpy() + std r13,rzR13(r1) // spill non-volatile regs we use to redzone + std r14,rzR14(r1) + std r15,rzR15(r1) + andi. r2,r2,0x7F // #bytes to align + std r16,rzR16(r1) + mr rs,r4 // copy parameters into nonvolatile registers + mr rd,r12 + mr rc,r5 + mr rx,r0 // also save return address + beq 1f // skip if already aligned // Cache-line-align destination. - - .align 5 -1: - lbz r0,0(rs) - addi rs,rs,1 - stb r0,0(rd) - addi rd,rd,1 - bdnz 1b + + mr r3,rd // set up dest ptr for memcpy() + mr r5,r2 // number of bytes to copy + add rs,rs,r2 // then bump our parameters past initial copy + add rd,rd,r2 + sub rc,rc,r2 + bla _COMM_PAGE_MEMCPY // 128-byte-align destination -// Is source 16-byte aligned? Load constant offsets. +// Load constant offsets and check whether source is 16-byte aligned. +// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage, +// and we dcbz only if cr7 beq is set. -2: +1: + dcbt 0,rs // touch in 1st line of source andi. r0,rs,15 // check source alignment mfspr rv,vrsave // save caller's bitmask - li r0,-1 // we use all 32 VRs li c16,16 // load the constant offsets for x-form ops li c32,32 + srwi r2,rc,8 // get number of 256-byte chunks to xfer + li r0,-256 // we use 24 VRs (ie, 0-23) li c48,48 - li c128,128 + li c64,64 + li c80,80 + or r0,r0,rv // add our bits to caller's + li c96,96 + mtctr r2 // set up loop count + li c112,112 + cmpd cr7,r2,r2 // initialize cr7_eq to "on", so we dcbz128 + mtspr vrsave,r0 // say we use vr0..vr23 li c256,256 li c384,384 - mtspr vrsave,r0 - -// NB: the kernel clears cr7 if it emulates a dcbz128 on the commpage, -// and we dcbz only if cr7 beq is set. We check to be sure the dcbz's -// won't zero source bytes before we load them, since we zero before -// loading as this is faster than zeroing after loading and before storing. + beq LalignedLoop // handle aligned sources - cmpw cr7,r0,r0 // initialize cr7 beq to use dcbz128 - sub rt,rs,rd // get (rs-rd) - cmplwi cr1,rt,512 // are we moving down less than 512 bytes? -// Start fetching in source cache lines. +// Set up for unaligned loop. - dcbt c128,rs // first line already touched in - dcbt c256,rs - dcbt c384,rs - - bge++ cr1,3f // skip if not moving down less than 512 bytes - cmpw cr7,c16,c32 // cannot dcbz since it would zero source bytes -3: - beq LalignedLoop // handle aligned sources lvsl v0,0,rs // get permute vector for left shift lvxl v1,0,rs // prime the loop + li r0,rzV20 // save non-volatile VRs in redzone + stvx v20,r1,r0 + li r0,rzV21 + stvx v21,r1,r0 + li r0,rzV22 + stvx v22,r1,r0 + li r0,rzV23 + stvx v23,r1,r0 b LunalignedLoop // enter unaligned loop -// Main loop for unaligned operands. We loop over 384-byte chunks (3 cache lines) -// since we need a few VRs for permuted destination QWs and the permute vector. +// Main loop for unaligned operands. We loop over 256-byte chunks (2 cache lines). +// Destination is 128-byte aligned, source is unaligned. 
.align 5 LunalignedLoop: - subi rc,rc,384 // decrement byte count - addi rx,rs,384 // get address of next chunk + dcbt c256,rs // touch in next chunk + dcbt c384,rs + addi r2,rs,128 // point to 2nd 128 bytes of source lvxl v2,c16,rs lvxl v3,c32,rs + lvxl v4,c48,rs + lvxl v5,c64,rs + lvxl v6,c80,rs + lvxl v7,c96,rs + lvxl v8,c112,rs + lvxl v9,0,r2 + addi rs,rs,256 // point to next source chunk + lvxl v10,c16,r2 + lvxl v11,c32,r2 + vperm v17,v1,v2,v0 + lvxl v12,c48,r2 + lvxl v13,c64,r2 + vperm v18,v2,v3,v0 + lvxl v14,c80,r2 + lvxl v15,c96,r2 + vperm v19,v3,v4,v0 + lvxl v16,c112,r2 + lvxl v1,0,rs // peek ahead at first source quad in next chunk + vperm v20,v4,v5,v0 + addi r2,rd,128 // point to 2nd 128 bytes of dest bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel - dcbz128 0,rd // (also skip if moving down less than 512 bytes) - bne-- cr7,1f // catch it first time through - dcbz128 c128,rd - dcbz128 c256,rd + dcbz128 0,rd + dcbz128 0,r2 1: - addi rt,rs,64 - dcbt 0,rx // touch in next chunk - dcbt c128,rx - dcbt c256,rx - lvxl v4,c48,rs - addi rs,rs,128 - lvxl v5,0,rt - cmplwi rc,384 // another chunk to go? - lvxl v6,c16,rt - lvxl v7,c32,rt - lvxl v8,c48,rt - addi rt,rs,64 - vperm v25,v1,v2,v0 - lvxl v9,0,rs - lvxl v10,c16,rs - vperm v26,v2,v3,v0 - lvxl v11,c32,rs - lvxl v12,c48,rs - vperm v27,v3,v4,v0 - addi rs,rs,128 - lvxl v13,0,rt - lvxl v14,c16,rt - vperm v28,v4,v5,v0 - lvxl v15,c32,rt - lvxl v16,c48,rt - vperm v29,v5,v6,v0 - addi rt,rs,64 - lvxl v17,0,rs - lvxl v18,c16,rs - vperm v30,v6,v7,v0 - lvxl v19,c32,rs - lvxl v20,c48,rs - vperm v31,v7,v8,v0 - addi rs,rs,128 - lvxl v21,0,rt - lvxl v22,c16,rt - vperm v2,v8,v9,v0 - lvxl v23,c32,rt - lvxl v24,c48,rt - vperm v3,v9,v10,v0 - lvx v1,0,rs // get 1st qw of next chunk - vperm v4,v10,v11,v0 - - addi rt,rd,64 - stvxl v25,0,rd - stvxl v26,c16,rd - vperm v5,v11,v12,v0 - stvxl v27,c32,rd - stvxl v28,c48,rd - vperm v6,v12,v13,v0 - addi rd,rd,128 - stvxl v29,0,rt - stvxl v30,c16,rt - vperm v7,v13,v14,v0 - stvxl v31,c32,rt - stvxl v2,c48,rt - vperm v8,v14,v15,v0 - addi rt,rd,64 - stvxl v3,0,rd - stvxl v4,c16,rd - vperm v9,v15,v16,v0 - stvxl v5,c32,rd - stvxl v6,c48,rd - vperm v10,v16,v17,v0 - addi rd,rd,128 - stvxl v7,0,rt - vperm v11,v17,v18,v0 - stvxl v8,c16,rt - stvxl v9,c32,rt - vperm v12,v18,v19,v0 - stvxl v10,c48,rt - addi rt,rd,64 - vperm v13,v19,v20,v0 - stvxl v11,0,rd - stvxl v12,c16,rd - vperm v14,v20,v21,v0 - stvxl v13,c32,rd - vperm v15,v21,v22,v0 - stvxl v14,c48,rd - vperm v16,v22,v23,v0 - addi rd,rd,128 - stvxl v15,0,rt - vperm v17,v23,v24,v0 - stvxl v16,c16,rt - vperm v18,v24,v1,v0 - stvxl v17,c32,rt - stvxl v18,c48,rt - bge++ LunalignedLoop // loop if another 384 bytes to go - -// End of unaligned main loop. Handle up to 384 leftover bytes. - - srwi. 
r0,rc,5 // get count of 32-byte chunks remaining - beq Ldone // none - rlwinm rc,rc,0,0x1F // mask count down to 0..31 leftover bytes - mtctr r0 -1: // loop over 32-byte chunks - lvx v2,c16,rs - lvx v3,c32,rs - addi rs,rs,32 - vperm v8,v1,v2,v0 - vperm v9,v2,v3,v0 - vor v1,v3,v3 // v1 <- v3 - stvx v8,0,rd - stvx v9,c16,rd - addi rd,rd,32 - bdnz 1b - - b Ldone + vperm v21,v5,v6,v0 + stvxl v17,0,rd + vperm v22,v6,v7,v0 + stvxl v18,c16,rd + vperm v23,v7,v8,v0 + stvxl v19,c32,rd + vperm v17,v8,v9,v0 + stvxl v20,c48,rd + vperm v18,v9,v10,v0 + stvxl v21,c64,rd + vperm v19,v10,v11,v0 + stvxl v22,c80,rd + vperm v20,v11,v12,v0 + stvxl v23,c96,rd + vperm v21,v12,v13,v0 + stvxl v17,c112,rd + vperm v22,v13,v14,v0 + addi rd,rd,256 // point to next dest chunk + stvxl v18,0,r2 + vperm v23,v14,v15,v0 + stvxl v19,c16,r2 + vperm v17,v15,v16,v0 + stvxl v20,c32,r2 + vperm v18,v16,v1,v0 + stvxl v21,c48,r2 + stvxl v22,c64,r2 + stvxl v23,c80,r2 + stvxl v17,c96,r2 + stvxl v18,c112,r2 + bdnz++ LunalignedLoop // loop if another 256 bytes to go + + li r6,rzV20 // restore non-volatile VRs + li r7,rzV21 + li r8,rzV22 + li r9,rzV23 + lvx v20,r1,r6 + lvx v21,r1,r7 + lvx v22,r1,r8 + lvx v23,r1,r9 + b Ldone // Aligned loop. Destination is 128-byte aligned, and source is 16-byte -// aligned. Loop over 512-byte chunks (4 cache lines.) +// aligned. Loop over 256-byte chunks (2 cache lines.) .align 5 LalignedLoop: - subi rc,rc,512 // decrement count - addi rx,rs,512 // address of next chunk + dcbt c256,rs // touch in next chunk + dcbt c384,rs + addi r2,rs,128 // point to 2nd 128 bytes of source lvxl v1,0,rs lvxl v2,c16,rs - bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel - dcbz128 0,rd // (also skip if moving down less than 512 bytes) - bne-- cr7,1f // catch it first time through - dcbz128 c128,rd - dcbz128 c256,rd - dcbz128 c384,rd -1: - addi rt,rs,64 - dcbt 0,rx // touch in next chunk - dcbt c128,rx - dcbt c256,rx - dcbt c384,rx lvxl v3,c32,rs lvxl v4,c48,rs - addi rs,rs,128 - lvxl v5,0,rt - cmplwi rc,512 // another chunk to go? 
- lvxl v6,c16,rt - lvxl v7,c32,rt - lvxl v8,c48,rt - addi rt,rs,64 - lvxl v9,0,rs - lvxl v10,c16,rs - lvxl v11,c32,rs - lvxl v12,c48,rs - addi rs,rs,128 - lvxl v13,0,rt - lvxl v14,c16,rt - lvxl v15,c32,rt - lvxl v16,c48,rt - addi rt,rs,64 - lvxl v17,0,rs - lvxl v18,c16,rs - lvxl v19,c32,rs - lvxl v20,c48,rs - addi rs,rs,128 - lvxl v21,0,rt - lvxl v22,c16,rt - lvxl v23,c32,rt - lvxl v24,c48,rt - addi rt,rs,64 - lvxl v25,0,rs - lvxl v26,c16,rs - lvxl v27,c32,rs - lvxl v28,c48,rs - addi rs,rs,128 - lvxl v29,0,rt - lvxl v30,c16,rt - lvxl v31,c32,rt - lvxl v0,c48,rt - - addi rt,rd,64 + lvxl v5,c64,rs + lvxl v6,c80,rs + lvxl v7,c96,rs + lvxl v8,c112,rs + lvxl v9,0,r2 + lvxl v10,c16,r2 + lvxl v11,c32,r2 + lvxl v12,c48,r2 + lvxl v13,c64,r2 + lvxl v14,c80,r2 + lvxl v15,c96,r2 + lvxl v16,c112,r2 + addi r2,rd,128 // point to 2nd 128 bytes of dest + bne-- cr7,1f // skip dcbz's if cr7 beq has been turned off by kernel + dcbz128 0,rd + dcbz128 0,r2 +1: + addi rs,rs,256 // point to next source chunk stvxl v1,0,rd stvxl v2,c16,rd stvxl v3,c32,rd stvxl v4,c48,rd - addi rd,rd,128 - stvxl v5,0,rt - stvxl v6,c16,rt - stvxl v7,c32,rt - stvxl v8,c48,rt - addi rt,rd,64 - stvxl v9,0,rd - stvxl v10,c16,rd - stvxl v11,c32,rd - stvxl v12,c48,rd - addi rd,rd,128 - stvxl v13,0,rt - stvxl v14,c16,rt - stvxl v15,c32,rt - stvxl v16,c48,rt - addi rt,rd,64 - stvxl v17,0,rd - stvxl v18,c16,rd - stvxl v19,c32,rd - stvxl v20,c48,rd - addi rd,rd,128 - stvxl v21,0,rt - stvxl v22,c16,rt - stvxl v23,c32,rt - stvxl v24,c48,rt - addi rt,rd,64 - stvxl v25,0,rd - stvxl v26,c16,rd - stvxl v27,c32,rd - stvxl v28,c48,rd - addi rd,rd,128 - stvxl v29,0,rt - stvxl v30,c16,rt - stvxl v31,c32,rt - stvxl v0,c48,rt - bge++ LalignedLoop // loop if another 512 bytes to go - -// End of aligned main loop. Handle up to 511 leftover bytes. - - srwi. r0,rc,5 // get count of 32-byte chunks remaining - beq Ldone // none - rlwinm rc,rc,0,0x1F // mask count down to 0..31 leftover bytes - mtctr r0 -1: // loop over 32-byte chunks - lvx v1,0,rs - lvx v2,c16,rs - addi rs,rs,32 - stvx v1,0,rd - stvx v2,c16,rd - addi rd,rd,32 - bdnz 1b - - -// Done, except for 0..31 leftovers at end. Restore non-volatiles. + stvxl v5,c64,rd + stvxl v6,c80,rd + stvxl v7,c96,rd + stvxl v8,c112,rd + addi rd,rd,256 // point to next dest chunk + stvxl v9,0,r2 + stvxl v10,c16,r2 + stvxl v11,c32,r2 + stvxl v12,c48,r2 + stvxl v13,c64,r2 + stvxl v14,c80,r2 + stvxl v15,c96,r2 + stvxl v16,c112,r2 + bdnz++ LalignedLoop // loop if another 256 bytes to go + + +// Done, except for 0..255 leftover bytes at end. // rs = source ptr // rd = dest ptr -// rc = count (0..31) +// rc = remaining count in low 7 bits // rv = caller's vrsave +// rx = caller's return address Ldone: - cmpwi rc,0 // any leftover bytes? - lwz r13,rzR13(r1) // restore non-volatiles from redzone - lwz r14,rzR14(r1) - lwz r15,rzR15(r1) - li r0,rzV20 - lvx v20,r1,r0 - li r0,rzV21 - lvx v21,r1,r0 - li r0,rzV22 - lvx v22,r1,r0 - li r0,rzV23 - lvx v23,r1,r0 - li r0,rzV24 - lvx v24,r1,r0 - li r0,rzV25 - lvx v25,r1,r0 - li r0,rzV26 - lvx v26,r1,r0 - li r0,rzV27 - lvx v27,r1,r0 - li r0,rzV28 - lvx v28,r1,r0 - li r0,rzV29 - lvx v29,r1,r0 - li r0,rzV30 - lvx v30,r1,r0 - li r0,rzV31 - lvx v31,r1,r0 - mtspr vrsave,rv // restore caller's bitmask - beqlr // done if no leftover bytes - - -// Handle 1..31 leftover bytes at end. - - mtctr rc // set up loop count - b 1f - - .align 5 -1: - lbz r0,0(rs) - addi rs,rs,1 - stb r0,0(rd) - addi rd,rd,1 - bdnz 1b + andi. r5,rc,0xFF // any leftover bytes? 
(0..255) + mtspr vrsave,rv // restore bitmap of live vr's + mr r3,rd + mr r4,rs + bnela _COMM_PAGE_MEMCPY // copy leftover bytes + + mtlr rx // restore return address + ld r3,rzR3(r1) // restore non-volatile GPRs from redzone + ld r13,rzR13(r1) + ld r14,rzR14(r1) + ld r15,rzR15(r1) + ld r16,rzR16(r1) blr - COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,0) // load on all machines for now + COMMPAGE_DESCRIPTOR(bigcopy_970,_COMM_PAGE_BIGCOPY,0,0,kPort32to64+kCommPageBoth)
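
The strategy comments in the new version of this file (128-byte cache-line alignment of the destination, 256-byte chunks per iteration, dcbt touch-ahead on the next source chunk, and 0..255 leftover bytes handed back to the short-operand memcpy) can be summarized in a portable C sketch. The sketch below is hypothetical and for illustration only; it is not the commpage code. The names bigcopy_sketch, CHUNK, and the use of __builtin_prefetch are assumptions, it omits the dcbz128 destination zeroing and all of the vector-register work the real routine relies on, and it assumes len is at least several pages long, as the entry conditions require.

#include <string.h>
#include <stdint.h>
#include <stddef.h>

#define CACHE_LINE  128     /* destination alignment target */
#define CHUNK       256     /* two cache lines per loop iteration */

static void bigcopy_sketch(void *dst, const void *src, size_t len)
{
    uint8_t *d = dst;
    const uint8_t *s = src;

    /* 1. Cache-line-align the destination with a short initial copy
     *    (the assembly calls _COMM_PAGE_MEMCPY for this). */
    size_t head = (CACHE_LINE - ((uintptr_t)d & (CACHE_LINE - 1))) & (CACHE_LINE - 1);
    memcpy(d, s, head);
    d += head; s += head; len -= head;

    /* 2. Main loop: one 256-byte chunk per iteration, touching the next
     *    source chunk ahead.  The real code also zeroes the destination
     *    lines with dcbz128 so the store stream never reads them from DRAM. */
    while (len >= CHUNK) {
#ifdef __GNUC__
        __builtin_prefetch(s + CHUNK);               /* analogue of "dcbt c256,rs" */
        __builtin_prefetch(s + CHUNK + CACHE_LINE);  /* analogue of "dcbt c384,rs" */
#endif
        memcpy(d, s, CHUNK);    /* stands in for the 16-quadword vector loop body */
        d += CHUNK; s += CHUNK; len -= CHUNK;
    }

    /* 3. The 0..255 leftover bytes go back to the ordinary short-operand copy. */
    memcpy(d, s, len);
}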
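The lvsl/vperm technique in LunalignedLoop (load only aligned quadwords from the misaligned source, then permute each adjacent pair by the misalignment so that every load and store stays aligned) also has a simple scalar analogue. The sketch below is hypothetical and purely illustrative: it uses 64-bit words and shifts in place of 16-byte vectors and vperm, assumes a little-endian host with dst and the word count already prepared by the caller, and, like the vector loop's peek-ahead load of v1, it may read a few bytes past the end of the source data.

#include <stdint.h>
#include <stddef.h>

static void merge_copy_sketch(uint64_t *dst, const uint8_t *src, size_t nwords)
{
    size_t off = (uintptr_t)src & 7;                    /* misalignment, like lvsl */
    const uint64_t *as = (const uint64_t *)(src - off); /* aligned source base */

    if (off == 0) {                     /* already aligned: plain word copy */
        for (size_t i = 0; i < nwords; i++)
            dst[i] = as[i];
        return;
    }

    uint64_t prev = as[0];              /* "prime the loop" (lvxl v1,0,rs) */
    for (size_t i = 0; i < nwords; i++) {
        uint64_t next = as[i + 1];      /* next aligned word (may peek past the end) */
        /* Splice the two aligned words by the misalignment: the vperm step. */
        dst[i] = (prev >> (8 * off)) | (next << (8 * (8 - off)));
        prev = next;
    }
}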