X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/9bccf70c0258c7cac2dcb80011b2a964d884c552..55e303ae13a4cf49d70f2294092726f2fffb9ef2:/osfmk/ppc/bzero.s

diff --git a/osfmk/ppc/bzero.s b/osfmk/ppc/bzero.s
index 2faf894a0..82e83de1c 100644
--- a/osfmk/ppc/bzero.s
+++ b/osfmk/ppc/bzero.s
@@ -1,287 +1,307 @@
 /*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2002 Apple Computer, Inc. All rights reserved.
  *
  * @APPLE_LICENSE_HEADER_START@
  *
- * The contents of this file constitute Original Code as defined in and
- * are subject to the Apple Public Source License Version 1.1 (the
- * "License").  You may not use this file except in compliance with the
- * License.  Please obtain a copy of the License at
- * http://www.apple.com/publicsource and read it before using this file.
+ * Copyright (c) 1999-2003 Apple Computer, Inc.  All Rights Reserved.
  *
- * This Original Code and all software distributed under the License are
- * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
+ * This file contains Original Code and/or Modifications of Original Code
+ * as defined in and that are subject to the Apple Public Source License
+ * Version 2.0 (the 'License'). You may not use this file except in
+ * compliance with the License. Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this
+ * file.
+ *
+ * The Original Code and all software distributed under the License are
+ * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
- * License for the specific language governing rights and limitations
- * under the License.
+ * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
+ * Please see the License for the specific language governing rights and
+ * limitations under the License.
  *
  * @APPLE_LICENSE_HEADER_END@
  */
- /*
- * @OSF_FREE_COPYRIGHT@
- */
 
 #include
-#include                                /* For CACHE_LINE_SIZE */
-
-/*
- * void bzero(char *addr, unsigned int length)
- *
- * bzero implementation for PowerPC
- *  - assumes cacheable memory (i.e. uses DCBZ)
- *  - assumes non-pic code
- *
- * returns start address in r3, as per memset (called by memset)
- */
-
-ENTRY(bzero, TAG_NO_FRAME_USED)
-
-        cmpwi   cr0, r4, 0              /* no bytes to zero? */
-        mr      r7, r3
-        mr      r8, r3                  /* use r8 as counter to where we are */
-        beqlr-
-        cmpwi   cr0, r4, CACHE_LINE_SIZE /* clear less than a block? */
-        li      r0, 0                   /* use r0 as source of zeros */
-        blt     .L_bzeroEndWord
-
-/* first, clear bytes up to the next word boundary */
-        addis   r6, 0, HIGH_CADDR(.L_bzeroBeginWord)
-        addi    r6, r6, LOW_ADDR(.L_bzeroBeginWord)
-                                        /* extract byte offset as word offset */
-        rlwinm. r5, r8, 2, 28, 29
-        addi    r8, r8, -1              /* adjust for update */
-        beq     .L_bzeroBeginWord       /* no bytes to zero */
-        subfic  r5, r5, 16              /* compute the number of instructions */
-        sub     r6, r6, r5              /* back from word clear to execute */
-        mtctr   r6
-        bctr
-
-        stbu    r0, 1(r8)
-        stbu    r0, 1(r8)
-        stbu    r0, 1(r8)
-
-/* clear words up to the next block boundary */
-.L_bzeroBeginWord:
-        addis   r6, 0, HIGH_CADDR(.L_bzeroBlock)
-        addi    r6, r6, LOW_ADDR(.L_bzeroBlock)
-        addi    r8, r8, 1
-        rlwinm. r5, r8, 0, 27, 29       /* extract word offset */
-        addi    r8, r8, -4              /* adjust for update */
-        beq     .L_bzeroBlock           /* no words to zero */
-                                        /* compute the number of instructions */
-        subfic  r5, r5, CACHE_LINE_SIZE
-        sub     r6, r6, r5              /* back from word clear to execute */
-        mtctr   r6
-        bctr
-
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-
-        /* clear cache blocks */
-.L_bzeroBlock:
-        addi    r8, r8, 4               /* remove update adjust */
-        sub     r5, r8, r7              /* bytes zeroed */
-        sub     r4, r4, r5
-        srwi.   r5, r4, CACHE_LINE_POW2 /* blocks to zero */
-        beq     .L_bzeroEndWord
-        mtctr   r5
-
-.L_bzeroBlock1:
-        dcbz    0, r8
-        addi    r8, r8, CACHE_LINE_SIZE
-        bdnz    .L_bzeroBlock1
-
-        /* clear remaining words */
-.L_bzeroEndWord:
-        addis   r6, 0, HIGH_CADDR(.L_bzeroEndByte)
-        addi    r6, r6, LOW_ADDR(.L_bzeroEndByte)
-        rlwinm. r5, r4, 0, 27, 29       /* extract word offset */
-        addi    r8, r8, -4              /* adjust for update */
-        beq     .L_bzeroEndByte         /* no words to zero */
-        sub     r6, r6, r5              /* back from word clear to execute */
-        mtctr   r6
-        bctr
-
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-
-        /* clear remaining bytes */
-.L_bzeroEndByte:
-        addis   r6, 0, HIGH_CADDR(.L_bzeroEnd)
-        addi    r6, r6, LOW_ADDR(.L_bzeroEnd)
-                                        /* extract byte offset as word offset */
-        rlwinm. r5, r4, 2, 28, 29
-        addi    r8, r8, 3               /* adjust for update */
-        beqlr
-        sub     r6, r6, r5              /* back from word clear to execute */
-        mtctr   r6
-        bctr
-
-        stbu    r0, 1(r8)
-        stbu    r0, 1(r8)
-        stbu    r0, 1(r8)
-
-.L_bzeroEnd:
-        blr
-
-/*
- * void *memset(void *from, int c, vm_size_t nbytes)
- *
- * almost everywhere in the kernel
- * this appears to be called with argument c==0. We optimise for those
- * cases and call bzero if we can.
- *
- */
-
-ENTRY(memset, TAG_NO_FRAME_USED)
-
-        mr.     ARG3, ARG1
-        mr      ARG1, ARG2
-        /* optimised case - do a bzero */
-        beq+    EXT(bzero)
-
-        /* If count is zero, return straight away */
-        cmpi    cr0, ARG1, 0
-        beqlr-
-
-        /* Now, ARG0 = addr, ARG1=len, ARG3=value */
-
-        subi    ARG2, ARG0, 1           /* use ARG2 as our counter */
-
-0:
-        subi    ARG1, ARG1, 1
-        cmpi    cr0, ARG1, 0
-        stbu    ARG3, 1(ARG2)
-        bne+    0b
-
-        /* Return original address in ARG0 */
-
-        blr
-
-/*
- * void bzero_nc(char *addr, unsigned int length)
- *
- * bzero implementation for PowerPC
- *  - assumes non-pic code
- *
- * returns start address in r3, as per memset (called by memset)
- */
-
-ENTRY(bzero_nc, TAG_NO_FRAME_USED)
-
-        cmpwi   cr0, r4, 0              /* no bytes to zero? */
-        mr      r7, r3
-        mr      r8, r3                  /* use r8 as counter to where we are */
-        beqlr-
-        cmpwi   cr0, r4, CACHE_LINE_SIZE /* clear less than a block? */
-        li      r0, 0                   /* use r0 as source of zeros */
-        blt     .L_bzeroNCEndWord
-
-/* first, clear bytes up to the next word boundary */
-        addis   r6, 0, HIGH_CADDR(.L_bzeroNCBeginWord)
-        addi    r6, r6, LOW_ADDR(.L_bzeroNCBeginWord)
-                                        /* extract byte offset as word offset */
-        rlwinm. r5, r8, 2, 28, 29
-        addi    r8, r8, -1              /* adjust for update */
-        beq     .L_bzeroNCBeginWord     /* no bytes to zero */
-        subfic  r5, r5, 16              /* compute the number of instructions */
-        sub     r6, r6, r5              /* back from word clear to execute */
-        mtctr   r6
-        bctr
-
-        stbu    r0, 1(r8)
-        stbu    r0, 1(r8)
-        stbu    r0, 1(r8)
-
-/* clear words up to the next block boundary */
-.L_bzeroNCBeginWord:
-        addis   r6, 0, HIGH_CADDR(.L_bzeroNCBlock)
-        addi    r6, r6, LOW_ADDR(.L_bzeroNCBlock)
-        addi    r8, r8, 1
-        rlwinm. r5, r8, 0, 27, 29       /* extract word offset */
-        addi    r8, r8, -4              /* adjust for update */
-        beq     .L_bzeroNCBlock         /* no words to zero */
-                                        /* compute the number of instructions */
-        subfic  r5, r5, CACHE_LINE_SIZE
-        sub     r6, r6, r5              /* back from word clear to execute */
-        mtctr   r6
-        bctr
-
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-
-        /* clear cache blocks */
-.L_bzeroNCBlock:
-        addi    r8, r8, 4               /* remove update adjust */
-        sub     r5, r8, r7              /* bytes zeroed */
-        sub     r4, r4, r5
-        srwi.   r5, r4, CACHE_LINE_POW2 /* blocks to zero */
-        beq     .L_bzeroNCEndWord
-        mtctr   r5
-
-.L_bzeroNCBlock1:
-        stw     r0, 0(r8)
-        stw     r0, 4(r8)
-        stw     r0, 8(r8)
-        stw     r0, 12(r8)
-        stw     r0, 16(r8)
-        stw     r0, 20(r8)
-        stw     r0, 24(r8)
-        stw     r0, 28(r8)
-        addi    r8, r8, CACHE_LINE_SIZE
-        bdnz    .L_bzeroNCBlock1
-
-        /* clear remaining words */
-.L_bzeroNCEndWord:
-        addis   r6, 0, HIGH_CADDR(.L_bzeroNCEndByte)
-        addi    r6, r6, LOW_ADDR(.L_bzeroNCEndByte)
-        rlwinm. r5, r4, 0, 27, 29       /* extract word offset */
-        addi    r8, r8, -4              /* adjust for update */
-        beq     .L_bzeroNCEndByte       /* no words to zero */
-        sub     r6, r6, r5              /* back from word clear to execute */
-        mtctr   r6
-        bctr
-
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-        stwu    r0, 4(r8)
-
-        /* clear remaining bytes */
-.L_bzeroNCEndByte:
-        addis   r6, 0, HIGH_CADDR(.L_bzeroNCEnd)
-        addi    r6, r6, LOW_ADDR(.L_bzeroNCEnd)
-                                        /* extract byte offset as word offset */
-        rlwinm. r5, r4, 2, 28, 29
-        addi    r8, r8, 3               /* adjust for update */
-        beqlr
-        sub     r6, r6, r5              /* back from word clear to execute */
-        mtctr   r6
-        bctr
-
-        stbu    r0, 1(r8)
-        stbu    r0, 1(r8)
-        stbu    r0, 1(r8)
+#include
+#include
+
+        .text
+        .align  2
+        .globl  _memset
+        .globl  _bzero
+        .globl  _bzero_nc
+        .globl  _bzero_phys
+
+
+// ***********************
+// * B Z E R O _ P H Y S *
+// ***********************
+//
+//      void bzero_phys(addr64_t phys_addr, uint32_t length);
+//
+// Takes a phys addr in (r3,r4), and length in r5.  We leave cache on.
+
+        .align  5
+LEXT(bzero_phys)
+        mflr    r12                     // save return address
+        rlwinm  r3,r3,0,1,0             // coallesce long-long in (r3,r4) into reg64_t in r3
+        rlwimi  r3,r4,0,0,31
+        mr      r4,r5                   // put length where bzero() expects it
+        bl      EXT(ml_set_physical_get_ffs) // turn DR off, SF on, features in cr6, old MSR in r11
+        bl      EXT(bzero)              // use normal bzero() routine
+        mtlr    r12                     // restore return
+        b       EXT(ml_restore)         // restore MSR, turning DR on and SF off
+
+
+// *******************
+// * B Z E R O _ N C *
+// *******************
+//
+//      void bzero_nc(char *addr, unsigned int length);
+//
+// For use with uncached memory.  Doesn't seem to be used at all, so probably not
+// performance critical.  NB: we must avoid unaligned stores, because some
+// machines (eg, 970) take alignment exceptions on _any_ unaligned op to uncached
+// memory.  Of course, we must also avoid dcbz.
+
+LEXT(bzero_nc)
+        cmplwi  cr1,r4,20               // too short to bother with 16-byte loops?
+        cmplwi  cr7,r4,0                // check for (len==0)
+        li      r6,0                    // get a 0
+        bge     cr1,bznc1               // skip if length >=20
+        mtctr   r4                      // set up byte loop
+        beqlr-- cr7                     // done if len=0
+
+// Short operands, loop over bytes.
+
+bznc0:
+        stb     r6,0(r3)
+        addi    r3,r3,1
+        bdnz    bznc0
+        blr
+
+// Handle operands long enough to do doubleword stores; we must doubleword
+// align, to avoid alignment exceptions.
+
+bznc1:
+        neg     r7,r3                   // start to compute #bytes to align
+        mfsprg  r10,2                   // get feature flags
+        andi.   r0,r7,7                 // get #bytes to doubleword align
+        mr      r5,r3                   // make copy of operand ptr as bcopy expects
+        mtcrf   0x02,r10                // put pf64Bitb etc in cr6
+        beq     bzero_tail              // already doubleword aligned
+        sub     r4,r4,r0                // adjust count
+        mtctr   r0                      // set up loop
+bznc2:                                  // zero bytes until doubleword aligned
+        stb     r6,0(r5)
+        addi    r5,r5,1
+        bdnz    bznc2
+        b       bzero_tail              // join bzero, now that r5 is aligned
+
+
+// *************     ***************
+// * B Z E R O * and * M E M S E T *
+// *************     ***************
+//
+//      void * memset(void *b, int c, size_t len);
+//      void bzero(void *b, size_t len);
+//
+// These routines support G3, G4, and the 970, and run in both 32 and
+// 64-bit mode.  Lengths (size_t) are always 32 bits.
+//
+// Register use:
+//      r0 = temp
+//      r2 = temp
+//      r3 = original ptr, not changed since memset returns it
+//      r4 = count of bytes to set
+//      r5 = working operand ptr ("rp")
+//      r6 = value to store (usually 0)
+//   r7-r9 = temps
+//     r10 = feature flags
+//     r11 = old MSR (if bzero_phys)
+//     r12 = return address (if bzero_phys)
+//     cr6 = feature flags (pf64Bit, pf128Byte, and pf32Byte)
+
+        .align  5
+LEXT(memset)                            // void * memset(void *b, int c, size_t len);
+        andi.   r6,r4,0xFF              // copy value to working register, test for 0
+        mr      r4,r5                   // move length to working register
+        bne--   memset1                 // skip if nonzero
+LEXT(bzero)                             // void bzero(void *b, size_t len);
+        dcbtst  0,r3                    // touch in 1st cache block
+        mfsprg  r10,2                   // get features
+        li      r6,0                    // get a 0
+        neg     r7,r3                   // start to compute #bytes to align
+        andi.   r0,r10,pf128Byte+pf32Byte // get cache line size
+        mtcrf   0x02,r10                // put pf128Byte etc in cr6
+        cmplw   r4,r0                   // operand length >= cache line size?
+        mr      r5,r3                   // make copy of operand ptr (can't change r3)
+        blt     bzero_tail              // too short for dcbz (or dcbz128)
+        rlwinm  r0,r7,0,0x1F            // get #bytes to 32-byte align
+        rlwinm  r9,r7,0,0x7F            // get #bytes to 128-byte align
+        bt++    pf128Byteb,bzero_128    // skip if 128-byte processor
+
+// Operand length >=32 and cache line size is 32.
+//      r0 = #bytes to 32-byte align
+//      r4 = length
+//      r5 = ptr to operand
+//      r6 = 0
+
+        sub     r2,r4,r0                // adjust length
+        cmpwi   cr1,r0,0                // already 32-byte aligned?
+        srwi.   r8,r2,5                 // get #32-byte chunks
+        beq     bzero_tail              // not long enough to dcbz
+        mtctr   r8                      // set up loop count
+        rlwinm  r4,r2,0,27,31           // mask down to leftover byte count
+        beq     cr1,bz_dcbz32           // skip if already 32-byte aligned
+
+// 32-byte align.  We just store 32 0s, rather than test and use conditional
+// branches.  This is usually faster, because there are no mispredicts.
+
+        stw     r6,0(r5)                // zero next 32 bytes
+        stw     r6,4(r5)
+        stw     r6,8(r5)
+        stw     r6,12(r5)
+        stw     r6,16(r5)
+        stw     r6,20(r5)
+        stw     r6,24(r5)
+        stw     r6,28(r5)
+        add     r5,r5,r0                // now r5 is 32-byte aligned
+        b       bz_dcbz32
+
+// Loop doing 32-byte version of DCBZ instruction.
+
+        .align  4                       // align the inner loop
+bz_dcbz32:
+        dcbz    0,r5                    // zero another 32 bytes
+        addi    r5,r5,32
+        bdnz    bz_dcbz32
+
+// Store trailing bytes.  This routine is used both by bzero and memset.
+//      r4 = #bytes to store (may be large if memset)
+//      r5 = address
+//      r6 = value to store (in all 8 bytes)
+//     cr6 = pf64Bit etc flags
+
+bzero_tail:
+        srwi.   r0,r4,4                 // get #(16-byte-chunks)
+        mtcrf   0x01,r4                 // remaining byte count to cr7
+        beq     bzt3                    // no 16-byte chunks
+        mtctr   r0                      // set up loop count
+        bt++    pf64Bitb,bzt2           // skip if 64-bit processor
+        b       bzt1
+        .align  5
+bzt1:                                   // loop over 16-byte chunks on 32-bit processor
+        stw     r6,0(r5)
+        stw     r6,4(r5)
+        stw     r6,8(r5)
+        stw     r6,12(r5)
+        addi    r5,r5,16
+        bdnz    bzt1
+        b       bzt3
+        .align  5
+bzt2:                                   // loop over 16-byte chunks on 64-bit processor
+        std     r6,0(r5)
+        std     r6,8(r5)
+        addi    r5,r5,16
+        bdnz    bzt2
+        bf      28,bzt4                 // 8-byte chunk?
+        std     r6,0(r5)
+        addi    r5,r5,8
+        b       bzt4
+bzt3:
+        bf      28,bzt4                 // 8-byte chunk?
+        stw     r6,0(r5)
+        stw     r6,4(r5)
+        addi    r5,r5,8
+bzt4:
+        bf      29,bzt5                 // word?
+        stw     r6,0(r5)
+        addi    r5,r5,4
+bzt5:
+        bf      30,bzt6                 // halfword?
+        sth     r6,0(r5)
+        addi    r5,r5,2
+bzt6:
+        bflr    31                      // byte?
+        stb     r6,0(r5)
+        blr
+
+// Operand length is >=128 and cache line size is 128. We assume that
+// because the linesize is 128 bytes, this is a 64-bit processor.
+//      r4 = length
+//      r5 = ptr to operand
+//      r6 = 0
+//      r7 = neg(r5)
+//      r9 = #bytes to 128-byte align
+
+        .align  5
+bzero_128:
+        sub     r2,r4,r9                // r2 <- length remaining after cache-line aligning
+        rlwinm  r0,r7,0,0xF             // r0 <- #bytes to 16-byte align
+        srwi.   r8,r2,7                 // r8 <- number of cache lines to 0
+        std     r6,0(r5)                // always store 16 bytes to 16-byte align...
+        std     r6,8(r5)                // ...even if too short for dcbz128
+        add     r5,r5,r0                // 16-byte align ptr
+        sub     r4,r4,r0                // adjust count
+        beq     bzero_tail              // r8==0, not long enough to dcbz128
+        sub.    r7,r9,r0                // get #bytes remaining to 128-byte align
+        rlwinm  r4,r2,0,0x7F            // r4 <- length remaining after dcbz128'ing
+        mtctr   r8                      // set up dcbz128 loop
+        beq     bz_dcbz128              // already 128-byte aligned
+        b       bz_align                // enter loop over 16-byte chunks
+
+// 128-byte align by looping over 16-byte chunks.
+
+        .align  5
+bz_align:                               // loop over 16-byte chunks
+        subic.  r7,r7,16                // more to go?
+        std     r6,0(r5)
+        std     r6,8(r5)
+        addi    r5,r5,16
+        bgt     bz_align
+
+        b       bz_dcbz128              // enter dcbz128 loop
+
+// Loop over 128-byte cache lines.
+//      r4 = length remaining after cache lines (0..127)
+//      r5 = ptr (128-byte aligned)
+//      r6 = 0
+//     ctr = count of cache lines to 0
+
+        .align  5
+bz_dcbz128:
+        dcbz128 0,r5                    // zero a 128-byte cache line
+        addi    r5,r5,128
+        bdnz    bz_dcbz128
+
+        b       bzero_tail              // handle leftovers
+
+
+// Handle memset() for nonzero values.  This case is relatively infrequent;
+// the large majority of memset() calls are for 0.
+//      r3 = ptr
+//      r4 = count
+//      r6 = value in lower byte (nonzero)
+
+memset1:
+        cmplwi  r4,16                   // too short to bother aligning?
+        rlwimi  r6,r6,8,16,23           // replicate value to low 2 bytes
+        mr      r5,r3                   // make working copy of operand ptr
+        rlwimi  r6,r6,16,0,15           // value now in all 4 bytes
+        blt     bzero_tail              // length<16, we won't be using "std"
+        mfsprg  r10,2                   // get feature flags
+        neg     r7,r5                   // start to compute #bytes to align
+        rlwinm  r6,r6,0,1,0             // value now in all 8 bytes (if 64-bit)
+        andi.   r0,r7,7                 // r6 <- #bytes to doubleword align
+        stw     r6,0(r5)                // store 8 bytes to avoid a loop
+        stw     r6,4(r5)
+        mtcrf   0x02,r10                // get pf64Bit flag etc in cr6
+        sub     r4,r4,r0                // adjust count
+        add     r5,r5,r0                // doubleword align ptr
+        b       bzero_tail
+
+
 
-.L_bzeroNCEnd:
-        blr
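
For readers less comfortable with PowerPC assembly, here is a rough C sketch of the control flow the new bzero()/memset() above implement: replicate the fill byte, zero bytes up to a cache-line boundary, zero whole lines, then finish a small tail. It is an illustration only, not part of the commit; memset_sketch and LINE are invented names, and the plain word-store loop stands in for the dcbz/dcbz128 cache-line zeroing.

#include <stddef.h>
#include <stdint.h>

enum { LINE = 32 };                       /* stand-in for the 32- or 128-byte cache line */

void *memset_sketch(void *b, int c, size_t len)
{
    unsigned char *p = (unsigned char *)b;
    uint32_t v = (unsigned char)c;
    v |= v << 8;                          /* replicate value into the low 2 bytes (rlwimi) */
    v |= v << 16;                         /* ...and into all 4 bytes                       */

    if (v == 0 && len >= LINE) {          /* bzero fast path: zero whole cache lines       */
        while (((uintptr_t)p & (LINE - 1)) != 0) {   /* byte stores up to a line boundary  */
            *p++ = 0;
            len--;
        }
        while (len >= LINE) {             /* the dcbz / dcbz128 loop in the assembly       */
            for (int i = 0; i < LINE; i += 4)
                *(uint32_t *)(p + i) = 0;
            p += LINE;
            len -= LINE;
        }
    }

    /* bzero_tail: leftover aligned words, then bytes */
    while (len >= 4 && ((uintptr_t)p & 3) == 0) {
        *(uint32_t *)p = v;
        p += 4;
        len -= 4;
    }
    while (len--)
        *p++ = (unsigned char)c;
    return b;
}

The assembly additionally selects the 32- vs. 128-byte line size from the per-CPU feature flags it moves into cr6, and unrolls the tail by testing the low bits of the remaining count in cr7 instead of looping; the sketch keeps only the overall align/line/tail structure.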