git.saurik.com Git - apple/xnu.git/blob - osfmk/ppc/commpage/memset_g4.s
xnu-792.tar.gz
[apple/xnu.git] / osfmk / ppc / commpage / memset_g4.s
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23 #define ASSEMBLER
24 #include <sys/appleapiopts.h>
25 #include <ppc/asm.h>
26 #include <machine/cpu_capabilities.h>
27 #include <machine/commpage.h>
28
29 .text
30 .align 2
31
32
33 /* *********************
34 * * M E M S E T _ G 4 *
35 * *********************
36 *
37 * This is a subroutine called by Libc memset and memset_pattern for large nonzero
38 * operands (zero operands are funneled into bzero.) This version is for
39 * 32-bit processors with a 32-byte cache line and Altivec.
40 *
41 * Registers at entry:
42 * r4 = count of bytes to store (must be >= 32)
43 * r8 = ptr to the 1st byte to store (16-byte aligned)
44 * r9 = ptr to 16-byte pattern to store (16-byte aligned)
45 * When we return:
46 * r3 = not changed, since memset returns it
47 * r4 = bytes remaining to store (will be <32)
48 * r7 = not changed
49 * r8 = ptr to next byte to store (still 16-byte aligned)
50 * r12 = not changed (holds return value for memset)
51 */
52
53 #define kBig (3*64) // big enough to warrant using dcba (NB: must be >= 3*64)
54
55 .align 4
56 memset_g4:
57 cmplwi cr1,r4,kBig // big enough to warrant using dcbz?
58 mfspr r2,vrsave // we'll be using VRs
59 oris r0,r2,0x8000 // we use vr0
60 andi. r5,r8,0x10 // is ptr 32-byte aligned?
61 mtspr vrsave,r0
62 li r5,16 // get offsets for "stvx"
63 lvx v0,0,r9 // load the pattern into v0
64 li r6,32
65 blt cr1,LShort // not big enough to bother with dcba
66 li r9,48
67
68 // cache line align
69
70 beq 2f // already aligned
71 stvx v0,0,r8 // store another 16 bytes to align
72 addi r8,r8,16
73 subi r4,r4,16
74
75 // Set up for inner loop.
76 2:
77 srwi r0,r4,6 // get count of 64-byte chunks (>=2)
78 dcba 0,r8 // pre-allocate first cache line (possibly nop'd)
79 rlwinm r4,r4,0,0x3F // mask down to residual count (0..63)
80 subic r0,r0,1 // loop 1-too-few times
81 li r10,64 // get offsets to DCBA one chunk ahead
82 li r11,64+32
83 mtctr r0
84 dcba r6,r8 // zero 2nd cache line (possibly nop'd)
85 b 3f // enter DCBA loop
86
87 // Loop over 64-byte chunks. We DCBA one chunk ahead, which is a little faster.
88 // Note that some G4s do not benefit from the DCBAs. We nop them in that case.
89
90 .align 4
91 3:
92 dcba r10,r8 // zero one 64-byte chunk ahead (possibly nop'd)
93 dcba r11,r8
94 stvx v0,0,r8
95 stvx v0,r5,r8
96 stvx v0,r6,r8
97 stvx v0,r9,r8
98 addi r8,r8,64
99 bdnz+ 3b
100
101 // Last chunk, which we've already DCBAd.
102
103 stvx v0,0,r8
104 stvx v0,r5,r8
105 stvx v0,r6,r8
106 stvx v0,r9,r8
107 addi r8,r8,64
108
109 // loop over 32-byte chunks at end
110 LShort:
111 srwi. r0,r4,5 // get count of 32-byte chunks
112 rlwinm r4,r4,0,0x1F // mask down to residual count (0..31)
113 beq 7f // no chunks so done
114 mtctr r0
115 6:
116 stvx v0,0,r8
117 stvx v0,r5,r8
118 addi r8,r8,32
119 bdnz 6b
120 7:
121 mtspr vrsave,r2 // restore caller's vrsave
122 blr
123
124
125 COMMPAGE_DESCRIPTOR(memset_g4,_COMM_PAGE_MEMSET_PATTERN,kCache32+kHasAltivec,0, \
126 kCommPageDCBA+kCommPage32)