/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .align  2


/* *********************
 * * M E M S E T _ G 3 *
 * *********************
 *
 * This is a subroutine called by Libc memset and _memset_pattern for large nonzero
 * operands (zero operands are funneled into bzero.)  This version is for
 * 32-bit processors with a 32-byte cache line and no Altivec.
 *
 * Registers at entry:
 *      r4 = count of bytes to store (must be >= 32)
 *      r8 = ptr to the 1st byte to store (16-byte aligned)
 *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
 * When we return:
 *      r3 = not changed, since memset returns it
 *      r4 = bytes remaining to store (will be <32)
 *      r7 = not changed
 *      r8 = ptr to next byte to store (still 16-byte aligned)
 *      r12 = not changed (holds return value for memset)
 */

        .align  4
memset_g3:
        andi.   r0,r8,16            // cache line aligned?
        lfd     f0,0(r9)            // pick up the pattern in two FPRs
        lfd     f1,8(r9)
        beq     1f                  // skip if already aligned

        // cache line align

        stfd    f0,0(r8)            // no, store another 16 bytes to align
        stfd    f1,8(r8)
        subi    r4,r4,16            // skip past the 16 bytes we just stored
        addi    r8,r8,16

// Loop over cache lines.  This code uses a private protocol with the kernel:
// when the kernel emulates an alignment exception on a DCBZ that occurs in the
// commpage, it zeroes CR7.  We use this to detect the case where we are operating on
// uncached memory, and do not use DCBZ again in this code.  We assume that either
// all the operand is cacheable or none of it is, so we only check the first DCBZ.
1:
        srwi.   r0,r4,6             // get count of 64-byte chunks
        cmpw    cr7,r0,r0           // set cr7_eq (kernel turns off on alignment exception)
        rlwinm  r4,r4,0,0x3F        // mask down to residual count (0..63)
        beq     Lleftover           // no chunks
        dcbz    0,r8                // zero first cache line (clearing cr7 if alignment exception)
        mtctr   r0
        li      r6,32               // get an offset for DCBZ
        beq+    cr7,LDcbzEnter      // enter DCBZ loop (we didn't get an alignment exception)

// Loop over 64-byte chunks without DCBZ.
LNoDcbz:
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        stfd    f0,16(r8)
        stfd    f1,24(r8)
        stfd    f0,32(r8)
        stfd    f1,40(r8)
        stfd    f0,48(r8)
        stfd    f1,56(r8)
        addi    r8,r8,64
        bdnz    LNoDcbz

        b       Lleftover

// Loop over 64-byte chunks using DCBZ.
LDcbz:
        dcbz    0,r8
LDcbzEnter:
        dcbz    r6,r8
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        stfd    f0,16(r8)
        stfd    f1,24(r8)
        stfd    f0,32(r8)
        stfd    f1,40(r8)
        stfd    f0,48(r8)
        stfd    f1,56(r8)
        addi    r8,r8,64
        bdnz    LDcbz
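// Note: each 64-byte chunk spans two 32-byte cache lines, so the DCBZ loop zeroes
// both the line at (r8) and the line at 32(r8) (offset held in r6) before storing
// the pattern.  The probing DCBZ above already zeroed the first line of the first
// chunk, which is why the loop is entered at LDcbzEnter on the first iteration.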

// Handle leftovers (0..63 bytes)
Lleftover:
        srwi.   r0,r4,4             // get count of 16-byte chunks
        rlwinm  r4,r4,0,0xF         // mask down to residuals
        beqlr                       // no 16-byte chunks so done
        mtctr   r0
2:
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        addi    r8,r8,16
        bdnz    2b
        blr

        COMMPAGE_DESCRIPTOR(memset_g3,_COMM_PAGE_MEMSET_PATTERN,kCache32,kHasAltivec, \
                kCommPage32)
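
// The COMMPAGE_DESCRIPTOR above publishes this routine at _COMM_PAGE_MEMSET_PATTERN
// in the 32-bit commpage (kCommPage32).  kCache32 names the capability the routine
// requires (32-byte cache lines) and kHasAltivec the capability that must be absent,
// matching the "no Altivec" restriction in the header comment.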