osfmk/ppc/commpage/memset_g4.s

   1 /*
   2  * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
   3  *
   4  * @APPLE_LICENSE_OSREFERENCE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License.  The rights granted to you under the
  10  * License may not be used to create, or enable the creation or
  11  * redistribution of, unlawful or unlicensed copies of an Apple operating
  12  * system, or to circumvent, violate, or enable the circumvention or
  13  * violation of, any terms of an Apple operating system software license
  14  * agreement.
  15  *
  16  * Please obtain a copy of the License at
  17  * http://www.opensource.apple.com/apsl/ and read it before using this
  18  * file.
  19  *
  20  * The Original Code and all software distributed under the License are
  21  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  22  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  23  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  24  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  25  * Please see the License for the specific language governing rights and
  26  * limitations under the License.
  27  *
  28  * @APPLE_LICENSE_OSREFERENCE_HEADER_END@
  29  */
  30
  31 #define ASSEMBLER
  32 #include <sys/appleapiopts.h>
  33 #include <ppc/asm.h>
  34 #include <machine/cpu_capabilities.h>
  35 #include <machine/commpage.h>
  36
   37         .text                           // the routine below is assembled into the text section
   38         .align  2                       // initial alignment; memset_g4 is preceded by its own ".align 4"
  39
  40
  41 /* *********************
  42  * * M E M S E T _ G 4 *
  43  * *********************
  44  *
  45  * This is a subroutine called by Libc memset and memset_pattern for large nonzero
  46  * operands (zero operands are funneled into bzero.)  This version is for
  47  * 32-bit processors with a 32-byte cache line and Altivec.
  48  *
  49  * Registers at entry:
   50  *      r4 = count of bytes to store (must be >= 32)
   51  *      r8 = ptr to the 1st byte to store (16-byte aligned)
   52  *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
   53  * When we return:
   54  *      r3 = not changed, since memset returns it
   55  *      r4 = bytes remaining to store (will be <32)
   56  *      r7 = not changed
   57  *      r8 = ptr to next byte to store (still 16-byte aligned)
   58  *     r12 = not changed (holds return value for memset)
  59  */
  60
   61 #define kBig    (3*64)                  // counts >= kBig take the dcba path (NB: must be >= 3*64)
   62         // Modifies: r0, r2, r5, r6, r9-r11, ctr, cr0, cr1, xer[ca], v0 (vrsave is saved in r2 and restored on exit)
   63         .align  4
   64 memset_g4:
   65         cmplwi  cr1,r4,kBig             // big enough to warrant using dcba? (kept in cr1 for the blt below)
   66         mfspr   r2,vrsave               // we'll be using VRs, so save caller's vrsave in r2
   67         oris    r0,r2,0x8000            // set the high-order vrsave bit: we use vr0
   68         andi.   r5,r8,0x10              // is ptr 32-byte aligned? (only cr0 matters; r5 is overwritten below)
   69         mtspr   vrsave,r0               // install the updated vrsave
   70         li      r5,16                   // get offsets for "stvx": r5=16, r6=32 (and r9=48 on the big path)
   71         lvx     v0,0,r9                 // load the pattern into v0
   72         li      r6,32
   73         blt     cr1,LShort              // not big enough to bother with dcba
   74         li      r9,48                   // pattern already loaded, so r9 is reused as the 4th stvx offset
   75
   76         // Cache line align.  r8 is 16-byte aligned at entry, so at most one 16-byte store is
   77         // needed to reach a 32-byte boundary.  cr0 still holds the result of the andi. above.
   78         beq     2f                      // already aligned
   79         stvx    v0,0,r8                 // store 16 bytes to reach a 32-byte boundary
   80         addi    r8,r8,16                // advance ptr past the bytes just stored
   81         subi    r4,r4,16                // and deduct them from the count
   82
   83         // Set up for inner loop.  r4 >= kBig-16 here, so there are at least two 64-byte chunks and ctr >= 1.
   84 2:
   85         srwi    r0,r4,6                 // get count of 64-byte chunks (>=2)
   86         dcba    0,r8                    // pre-allocate first cache line (possibly nop'd)
   87         rlwinm  r4,r4,0,0x3F            // mask down to residual count (0..63)
   88         subic   r0,r0,1                 // loop 1-too-few times; the final chunk is stored after the loop
   89         li      r10,64                  // get offsets to DCBA one chunk ahead: r10=64, r11=96
   90         li      r11,64+32
   91         mtctr   r0                      // ctr = chunk count - 1
   92         dcba    r6,r8                   // pre-allocate 2nd cache line (possibly nop'd)
   93         b       3f                      // enter DCBA loop
   94
   95         // Loop over 64-byte chunks.  We DCBA one chunk ahead, which is a little faster.
   96         // Note that some G4s do not benefit from the DCBAs.  We nop them in that case.
   97
   98         .align  4
   99 3:
  100         dcba    r10,r8                  // allocate both cache lines of the next 64-byte chunk (possibly nop'd)
  101         dcba    r11,r8
  102         stvx    v0,0,r8                 // store 64 bytes of pattern at r8+0, +16, +32, +48
  103         stvx    v0,r5,r8
  104         stvx    v0,r6,r8
  105         stvx    v0,r9,r8
  106         addi    r8,r8,64                // advance to the next chunk
  107         bdnz+   3b                      // loop while ctr != 0 ("+" hints the branch as taken)
  108
  109         // Last chunk, which we've already DCBAd (the loop above deliberately ran one time too few).
  110
  111         stvx    v0,0,r8
  112         stvx    v0,r5,r8
  113         stvx    v0,r6,r8
  114         stvx    v0,r9,r8
  115         addi    r8,r8,64
  116
  117         // loop over 32-byte chunks at end.  Entered directly when count < kBig, or by fall-through with the 0..63 byte residual.
  118 LShort:
  119         srwi.   r0,r4,5                 // get count of 32-byte chunks (sets cr0 for the beq below)
  120         rlwinm  r4,r4,0,0x1F            // mask down to residual count (0..31); this is the r4 handed back to the caller
  121         beq     7f                      // no chunks so done
  122         mtctr   r0
  123 6:
  124         stvx    v0,0,r8                 // store 32 bytes of pattern at r8+0 and r8+16
  125         stvx    v0,r5,r8
  126         addi    r8,r8,32
  127         bdnz    6b
  128 7:
  129         mtspr   vrsave,r2               // restore caller's vrsave
  130         blr                             // return: r4 = bytes remaining, r8 = next byte to store
  131         // Descriptor for memset_g4 at _COMM_PAGE_MEMSET_PATTERN.  Argument roles are presumably (routine, commpage address,
  132         // required capabilities, excluded capabilities, special flags) -- TODO confirm against the COMMPAGE_DESCRIPTOR definition.
  133         COMMPAGE_DESCRIPTOR(memset_g4,_COMM_PAGE_MEMSET_PATTERN,kCache32+kHasAltivec,0, \
  134                                 kCommPageDCBA+kCommPage32)