/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .align  2

/* *********************
 * * M E M S E T _ G 3 *
 * *********************
 *
 * This is a subroutine called by Libc memset and _memset_pattern for large nonzero
 * operands (zero operands are funneled into bzero.)  This version is for
 * 32-bit processors with a 32-byte cache line and no Altivec.
 *
 * Registers at entry:
 *      r4 = count of bytes to store (must be >= 32)
 *      r8 = ptr to the 1st byte to store (16-byte aligned)
 *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
 * When we return:
 *      r3 = not changed, since memset returns it
 *      r4 = bytes remaining to store (will be <32)
 *      r7 = not changed
 *      r8 = ptr to next byte to store (still 16-byte aligned)
 *      r12 = not changed (holds return value for memset)
 */
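
// A minimal C sketch of the contract documented above, assuming the caller passes a
// 16-byte pattern and a 16-byte aligned destination.  The names memset_g3_model,
// dst, pattern and count are hypothetical; the real routine keeps the pattern in
// f0/f1, cache-line aligns, and unrolls into 64-byte chunks, but the stores it
// performs are equivalent to the loop below.
//
//      #include <stddef.h>
//      #include <string.h>
//
//      // Store the 16-byte pattern until fewer than 16 bytes remain and report
//      // the leftover count for the caller to finish (the documented contract
//      // only promises that fewer than 32 bytes are left in r4).
//      static size_t memset_g3_model(unsigned char *dst,
//                                    const unsigned char pattern[16],
//                                    size_t count)       // >= 32 on entry
//      {
//          while (count >= 16) {
//              memcpy(dst, pattern, 16);                 // two 8-byte stfd stores
//              dst   += 16;
//              count -= 16;
//          }
//          return count;                                 // 0..15 bytes left (r4)
//      }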

        .align  4
memset_g3:
        andi.   r0,r8,16            // cache line aligned?
        lfd     f0,0(r9)            // pick up the pattern in two FPRs
        lfd     f1,8(r9)
        beq     1f                  // skip if already aligned

// cache line align

        stfd    f0,0(r8)            // no, store another 16 bytes to align
        stfd    f1,8(r8)
        subi    r4,r4,16            // skip past the 16 bytes we just stored
        addi    r8,r8,16

// Loop over cache lines.  This code uses a private protocol with the kernel:
// when the kernel emulates an alignment exception on a DCBZ that occurs in the
// commpage, it zeroes CR7.  We use this to detect the case where we are operating on
// uncached memory, and do not use DCBZ again in this code.  We assume that either
// all the operand is cacheable or none of it is, so we only check the first DCBZ.
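//
// A hedged C sketch of the probe-and-dispatch control flow that follows; the names
// dcbz_probe(), dcbz_loop(), store_loop() and bulk_chunks() are hypothetical
// stand-ins for the "beq+ cr7" test and the LDcbz/LNoDcbz loops below, not kernel
// or Libc interfaces.
//
//      #include <stddef.h>
//
//      extern int  dcbz_probe(unsigned char *line);            // 0 => kernel cleared cr7
//      extern void dcbz_loop(unsigned char *dst, size_t n);    // LDcbz path
//      extern void store_loop(unsigned char *dst, size_t n);   // LNoDcbz path
//
//      static unsigned char *bulk_chunks(unsigned char *dst, size_t *count)
//      {
//          size_t chunks = *count >> 6;        // srwi   r0,r4,6      (64-byte chunks)
//          *count &= 0x3F;                     // rlwinm r4,r4,0,0x3F (0..63 left over)
//          if (chunks != 0) {
//              if (dcbz_probe(dst))            // first DCBZ not emulated: cacheable
//                  dcbz_loop(dst, chunks);     // LDcbz: dcbz offsets 0 and 32, 8 stfd
//              else
//                  store_loop(dst, chunks);    // LNoDcbz: 8 stfd only, no dcbz
//              dst += chunks << 6;
//          }
//          return dst;                         // Lleftover handles the remainder
//      }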
1:
        srwi.   r0,r4,6             // get count of 64-byte chunks
        cmpw    cr7,r0,r0           // set cr7_eq (kernel turns off on alignment exception)
        rlwinm  r4,r4,0,0x3F        // mask down to residual count (0..63)
        beq     Lleftover           // no chunks
        dcbz    0,r8                // zero first cache line (clearing cr7 if alignment exception)
        mtctr   r0
        li      r6,32               // get an offset for DCBZ
        beq+    cr7,LDcbzEnter      // enter DCBZ loop (we didn't get an alignment exception)

// Loop over 64-byte chunks without DCBZ.
LNoDcbz:
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        stfd    f0,16(r8)
        stfd    f1,24(r8)
        stfd    f0,32(r8)
        stfd    f1,40(r8)
        stfd    f0,48(r8)
        stfd    f1,56(r8)
        addi    r8,r8,64
        bdnz    LNoDcbz

        b       Lleftover

// Loop over 64-byte chunks using DCBZ.
LDcbz:
        dcbz    0,r8
LDcbzEnter:
        dcbz    r6,r8
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        stfd    f0,16(r8)
        stfd    f1,24(r8)
        stfd    f0,32(r8)
        stfd    f1,40(r8)
        stfd    f0,48(r8)
        stfd    f1,56(r8)
        addi    r8,r8,64
        bdnz    LDcbz

// Handle leftovers (0..63 bytes)
Lleftover:
        srwi.   r0,r4,4             // get count of 16-byte chunks
        rlwinm  r4,r4,0,0xF         // mask down to residuals
        beqlr                       // no 16-byte chunks so done
        mtctr   r0
2:
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        addi    r8,r8,16
        bdnz    2b

        blr
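
// Worked example of the count decomposition above: with r4 = 200 at label 1:,
// srwi gives 200>>6 = 3 chunks of 64 bytes (192 bytes stored by the chunk loop)
// and rlwinm leaves 200 & 0x3F = 8; Lleftover then finds 8>>4 = 0 sixteen-byte
// chunks, so we return with r4 = 8 bytes for the caller to store.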

        COMMPAGE_DESCRIPTOR(memset_g3,_COMM_PAGE_MEMSET_PATTERN,kCache32,kHasAltivec, \
                kCommPage32)
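
// Descriptor note: this registers the routine in the 32-bit commpage at
// _COMM_PAGE_MEMSET_PATTERN.  Consistent with the header comment, kCache32 appears
// to name a capability the processor must have (32-byte cache lines) and
// kHasAltivec one it must not have, so this version is selected only for
// non-Altivec (G3-class) processors.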