osfmk/ppc/commpage/memset_g4.s

   1 /*
   2  * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 #include <sys/appleapiopts.h>
  30 #include <ppc/asm.h>
  31 #include <machine/cpu_capabilities.h>
  32 #include <machine/commpage.h>
  33
  34         .text                                   // the routine below is assembled into the text section
  35         .align  2                               // NOTE(review): presumably a power-of-two .align (2^2 = 4 bytes) -- confirm for this assembler
  36
  37
  38 /* *********************
  39  * * M E M S E T _ G 4 *
  40  * *********************
  41  *
  42  * This is a subroutine called by Libc memset and memset_pattern for large nonzero
  43  * operands (zero operands are funneled into bzero.)  This version is for
  44  * 32-bit processors with a 32-byte cache line and Altivec.
  45  *
  46  * Registers at entry:
  47  *              r4 = count of bytes to store (must be >= 32)
  48  *      r8 = ptr to the 1st byte to store (16-byte aligned)
  49  *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
  50  * When we return:
  51  *              r3 = not changed, since memset returns it
  52  *      r4 = bytes remaining to store (will be <32)
  53  *      r7 = not changed
  54  *      r8 = ptr to next byte to store (still 16-byte aligned)
  55  *     r12 = not changed (holds return value for memset)
  56  */
  57
  58 #define kBig    (3*64)                  // big enough to warrant using dcba (NB: must be >= 3*64)
  59
  60         .align  4
  61 memset_g4:                              // scratch: r0,r2,r5,r6,ctr,cr0,cr1,v0 (plus r9-r11 on the kBig path)
  62         cmplwi  cr1,r4,kBig             // big enough to warrant using dcba? (answer kept in cr1)
  63         mfspr   r2,vrsave               // we'll be using VRs; r2 = caller's vrsave, restored at exit
  64         oris    r0,r2,0x8000            // we use v0, so set its bit in the new vrsave value
  65         andi.   r5,r8,0x10              // is ptr 32-byte aligned? (sets cr0 for the beq below; r5 is reloaded next)
  66         mtspr   vrsave,r0               // install the vrsave value with v0 marked in use
  67         li      r5,16                   // get offsets for "stvx"
  68         lvx     v0,0,r9                 // load the pattern into v0
  69         li      r6,32                   // offset 32: 3rd stvx of a chunk, and 2nd cache line for dcba
  70         blt     cr1,LShort              // not big enough to bother with dcba
  71         li      r9,48                   // pattern is already in v0, so reuse r9 as the 4th stvx offset
  72
  73         // cache line align
  74
  75         beq     2f                      // already aligned (cr0 still holds the andi. result)
  76         stvx    v0,0,r8                 // store another 16 bytes to align
  77         addi    r8,r8,16                // step the ptr past the aligning store
  78         subi    r4,r4,16                // ...and deduct those bytes from the count
  79
  80         // Set up for inner loop.
  81 2:
  82         srwi    r0,r4,6                 // get count of 64-byte chunks (>=2)
  83         dcba    0,r8                    // pre-allocate first cache line (possibly nop'd)
  84         rlwinm  r4,r4,0,0x3F            // mask down to residual count (0..63)
  85         subic   r0,r0,1                 // loop 1-too-few times (the last chunk is stored after the loop)
  86         li      r10,64                  // get offsets to DCBA one chunk ahead
  87         li      r11,64+32               // ...and to the 2nd cache line of that chunk
  88         mtctr   r0                      // ctr = number of chunks stored inside the loop
  89         dcba    r6,r8                   // pre-allocate 2nd cache line (possibly nop'd)
  90         b       3f                      // enter DCBA loop
  91
  92         // Loop over 64-byte chunks.  We DCBA one chunk ahead, which is a little faster.
  93         // Note that some G4s do not benefit from the DCBAs.  We nop them in that case.
  94
  95         .align  4
  96 3:
  97         dcba    r10,r8                  // pre-allocate one 64-byte chunk ahead (possibly nop'd)
  98         dcba    r11,r8                  // (2nd cache line of that chunk)
  99         stvx    v0,0,r8                 // store the current 64-byte chunk as four 16-byte vectors
 100         stvx    v0,r5,r8
 101         stvx    v0,r6,r8
 102         stvx    v0,r9,r8
 103         addi    r8,r8,64                // advance ptr to the next chunk
 104         bdnz+   3b                      // decrement ctr and loop while it is nonzero
 105
 106         // Last chunk, which we've already DCBAd.
 107
 108         stvx    v0,0,r8
 109         stvx    v0,r5,r8
 110         stvx    v0,r6,r8
 111         stvx    v0,r9,r8
 112         addi    r8,r8,64
 113
 114         // loop over 32-byte chunks at end (also the whole job when count < kBig)
 115 LShort:
 116         srwi.   r0,r4,5                 // get count of 32-byte chunks
 117         rlwinm  r4,r4,0,0x1F            // mask down to residual count (0..31)
 118         beq     7f                      // no chunks so done
 119         mtctr   r0                      // ctr = number of 32-byte chunks
 120 6:
 121         stvx    v0,0,r8                 // store 32 bytes as two 16-byte vectors
 122         stvx    v0,r5,r8
 123         addi    r8,r8,32
 124         bdnz    6b                      // decrement ctr and loop while it is nonzero
 125 7:
 126         mtspr   vrsave,r2               // restore caller's vrsave
 127         blr                             // return with r4 = residual count, r8 = next byte to store
 128
 129
 128
 129
 130         COMMPAGE_DESCRIPTOR(memset_g4,_COMM_PAGE_MEMSET_PATTERN,kCache32+kHasAltivec,0, \
 131                                 kCommPageDCBA+kCommPage32)    // NOTE(review): flags appear to mirror the routine's stated needs (32-byte cache line, Altivec, nop-able dcba's, 32-bit); confirm argument meanings in machine/commpage.h