/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License"). You may not use this file except in compliance with the
 * License. Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#define ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .align  2

/* *********************
 * * M E M S E T _ G 3 *
 * *********************
 *
 * This is a subroutine called by Libc memset and _memset_pattern for large nonzero
 * operands (zero operands are funneled into bzero.)  This version is for
 * 32-bit processors with a 32-byte cache line and no Altivec.
 *
 * Registers at entry:
 *      r4 = count of bytes to store (must be >= 32)
 *      r8 = ptr to the 1st byte to store (16-byte aligned)
 *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
 * When we return:
 *      r3 = not changed, since memset returns it
 *      r4 = bytes remaining to store (will be <32)
 *      r7 = not changed
 *      r8 = ptr to next byte to store (still 16-byte aligned)
 *     r12 = not changed (holds return value for memset)
 */
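
// Worked example of the contract above (illustrative only): entering with
// r4 = 200 and r8 16-byte aligned, the routine stores 192 bytes of the pattern
// (whether or not r8 also starts on a 32-byte cache line boundary) and returns
// with r4 = 8 and r8 advanced by 192; the caller stores the trailing bytes.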

        .align  4
memset_g3:
        andi.   r0,r8,16            // cache line aligned?
        lfd     f0,0(r9)            // pick up the pattern in two FPRs
        lfd     f1,8(r9)
        beq     1f                  // skip if already aligned

// cache line align

        stfd    f0,0(r8)            // no, store another 16 bytes to align
        stfd    f1,8(r8)
        subi    r4,r4,16            // skip past the 16 bytes we just stored
        addi    r8,r8,16

// Loop over cache lines.  This code uses a private protocol with the kernel:
// when the kernel emulates an alignment exception on a DCBZ that occurs in the
// commpage, it zeroes CR7.  We use this to detect the case where we are operating on
// uncached memory, and do not use DCBZ again in this code.  We assume that either
// all the operand is cacheable or none of it is, so we only check the first DCBZ.
1:
        srwi.   r0,r4,6             // get count of 64-byte chunks
        cmpw    cr7,r0,r0           // set cr7_eq (kernel turns off on alignment exception)
        rlwinm  r4,r4,0,0x3F        // mask down to residual count (0..63)
        beq     Lleftover           // no chunks
        dcbz    0,r8                // zero first cache line (clearing cr7 if alignment exception)
        mtctr   r0
        li      r6,32               // get an offset for DCBZ
        beq+    cr7,LDcbzEnter      // enter DCBZ loop (we didn't get an alignment exception)
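
// Example of the chunk arithmetic above (illustrative only): arriving at 1: with
// r4 = 140, srwi. yields r0 = 2 (two 64-byte chunks) and rlwinm leaves r4 = 12;
// two passes through one of the loops below store 128 bytes, and the 12-byte
// residue is handed to Lleftover, which returns it to the caller since it is <16.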

// Loop over 64-byte chunks without DCBZ.
LNoDcbz:
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        stfd    f0,16(r8)
        stfd    f1,24(r8)
        stfd    f0,32(r8)
        stfd    f1,40(r8)
        stfd    f0,48(r8)
        stfd    f1,56(r8)
        addi    r8,r8,64
        bdnz    LNoDcbz

        b       Lleftover

// Loop over 64-byte chunks using DCBZ.
LDcbz:
        dcbz    0,r8
LDcbzEnter:
        dcbz    r6,r8
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        stfd    f0,16(r8)
        stfd    f1,24(r8)
        stfd    f0,32(r8)
        stfd    f1,40(r8)
        stfd    f0,48(r8)
        stfd    f1,56(r8)
        addi    r8,r8,64
        bdnz    LDcbz

// Handle leftovers (0..63 bytes)
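// For example (illustrative only): with r4 = 56 remaining here, srwi. gives three
// 16-byte stores in the loop below and rlwinm leaves r4 = 8 for the caller to finish.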
Lleftover:
        srwi.   r0,r4,4             // get count of 16-byte chunks
        rlwinm  r4,r4,0,0xF         // mask down to residuals
        beqlr                       // no 16-byte chunks so done
        mtctr   r0
2:
        stfd    f0,0(r8)
        stfd    f1,8(r8)
        addi    r8,r8,16
        bdnz    2b

        blr

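// The descriptor below publishes this routine in the 32-bit commpage at
// _COMM_PAGE_MEMSET_PATTERN; per the header comment it should be chosen only on
// processors with a 32-byte cache line (kCache32) and without Altivec (kHasAltivec,
// presumably the "can't have" capability argument).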
        COMMPAGE_DESCRIPTOR(memset_g3,_COMM_PAGE_MEMSET_PATTERN,kCache32,kHasAltivec, \
                kCommPage32)