/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .align  2
/* WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * adhered to:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        (for example, all "andi." and almost all "rlwinm." are fine)
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
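
// Illustrative example of the port's effect (a sketch based on the two rules
// above, not the kernel's actual rewriting code).  The 32-bit forms as
// written in this file,
//      cmplwi  cr1,r4,kBig         // word compare
//      srwi    r0,r4,7             // word shift right
// would appear in the 64-bit commpage as
//      cmpldi  cr1,r4,kBig         // doubleword compare
//      srdi    r0,r4,7             // doubleword shift right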
/* *********************
 * * M E M S E T _ G 5 *
 * *********************
 *
 * This is a subroutine called by Libc memset and memset_pattern for large nonzero
 * operands (zero operands are funneled into bzero.)  This version is for
 * 64-bit processors with a 128-byte cache line and Altivec.
 *
 * Registers at entry:
 *      r4 = count of bytes to store (must be >= 32)
 *      r8 = ptr to the 1st byte to store (16-byte aligned)
 *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
 * When we return:
 *      r3 = not changed, since memset returns it
 *      r4 = bytes remaining to store (will be <32)
 *      r8 = ptr to next byte to store (still 16-byte aligned)
 *      r12 = not changed (holds return value for memset)
 */
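
// Illustrative walk-through of the contract above (hypothetical numbers, not
// taken from Libc's actual dispatch code): for a 1000-byte memset_pattern16
// on a 128-byte-aligned buffer, we would be entered with r4 = 1000, r8 = the
// buffer, and r9 = the 16-byte pattern.  We would store 7 full cache lines
// (896 bytes) plus three 32-byte chunks (96 bytes), and return with r4 = 8
// and r8 advanced by 992; per the r4 < 32 contract, the caller stores the
// final 8 bytes.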
#define kBig    (3*128)             // big enough to warrant using dcbz (NB: must be >= 3*128)
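
// Why kBig must be at least 3*128: the cache-line-alignment loop below can
// consume up to 112 bytes (seven 16-byte stores), so an entry count of at
// least 384 guarantees that at least 384-112 = 272 bytes reach the dcbz
// code, and "srwi r0,r4,7" therefore yields at least 2 cache lines -- one
// for the initial dcbzl and at least one for the look-ahead loop.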
        .align  5
memset_g5:
        cmplwi  cr1,r4,kBig         // big enough to warrant using dcbz?
        neg     r10,r8              // start to align ptr
        mfspr   r2,vrsave           // we'll be using VRs
        andi.   r10,r10,0x70        // get #bytes to cache line align
        oris    r0,r2,0x8000        // we use vr0
        mtspr   vrsave,r0
        li      r5,16               // get offsets for "stvx"
        lvx     v0,0,r9             // load the pattern into v0
        li      r6,32
        blt     cr1,LShort          // not big enough to bother with dcbz
        li      r7,48

        // cache line align

        beq     2f                  // already aligned
1:
        subic.  r10,r10,16          // more to go?
        stvx    v0,0,r8             // store 16 pattern bytes
        addi    r8,r8,16
        subi    r4,r4,16            // adjust count
        bne     1b
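
// Example of the alignment math above (illustrative values): if r8 enters as
// 0x...1050, then neg/andi. computes r10 = (-0x1050) & 0x70 = 0x30, so the
// loop issues three 16-byte stores, leaving r8 = 0x...1080, which is
// 128-byte aligned.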
        // Loop over cache lines.  This code uses a private protocol with the kernel:
        // when the kernel emulates an alignment exception on a DCBZ that occurs in the
        // commpage, it zeroes CR7.  We use this to detect the case where we are operating on
        // uncached memory, and do not use DCBZ again in this code.  We assume that either
        // all the operand is cacheable or none of it is, so we only check the first DCBZ.
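
        // A sketch of the kernel side of this protocol (illustrative pseudocode
        // paraphrasing the behavior described above, not the kernel's actual handler):
        //      on alignment exception:
        //          if (faulting opcode is DCBZ && faulting PC lies in the commpage)
        //              emulate the DCBZ with ordinary stores
        //              clear cr7_eq in the saved state before resuming
        // Because we set cr7_eq just below with "cmpw cr7,r3,r3", finding it
        // clear after the first dcbzl tells us the operand is uncached.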
2:
        cmpw    cr7,r3,r3           // set cr7_eq (kernel will clear if DCBZ faults)
        dcbzl   0,r8                // zero first cache line (clearing cr7 if alignment exception)
        srwi    r0,r4,7             // get #cache lines (>=2)
        rlwinm  r4,r4,0,0x7F        // mask down to residual count (0..127)
        bne--   cr7,LNoDcbz         // exit if we took alignment exception on the first DCBZ
        subic   r0,r0,1             // loop 1-too-few times
        li      r11,128             // set DCBZ look-ahead
        mtctr   r0
        b       3f                  // use loop that DCBZs
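
        // (Why "1-too-few": the loop at 3: DCBZs one line ahead of the line it
        // stores, so iterating the full count would zero a 128-byte line beyond
        // the operand, destroying data we do not own.  After r0-1 iterations the
        // last line has already been zeroed by the look-ahead dcbzl, and the
        // non-DCBZ loop at 4: fills it in.)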
        // Loop over cache lines.  We DCBZ one line ahead, which is a little faster.

        .align  5
3:
        dcbzl   r11,r8              // zero one line ahead
        addi    r10,r8,64           // get base for the second group of stores
        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r7,r8
        addi    r8,r8,128           // advance to next cache line
        stvx    v0,0,r10
        stvx    v0,r5,r10
        stvx    v0,r6,r10
        stvx    v0,r7,r10
        bdnz++  3b

        li      r0,1                // we've already DCBZ'd the last line
LNoDcbz:                            // r0: loop count
        mtctr   r0
        // Loop which does not DCBZ.  Normally this is only used for the last cache line,
        // because we've already zeroed it.
4:
        addi    r10,r8,64           // get base for the second group of stores
        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r7,r8
        addi    r8,r8,128           // advance to next cache line
        stvx    v0,0,r10
        stvx    v0,r5,r10
        stvx    v0,r6,r10
        stvx    v0,r7,r10
        bdnz--  4b                  // optimize for the cacheable case
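
        // (The "--" suffix is a static branch-prediction hint: predict the bdnz
        // not taken.  In the cacheable case this loop runs exactly once, for the
        // final cache line, so the fall-through prediction is usually right.)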
        // loop over 32-byte chunks
LShort:
        srwi.   r0,r4,5             // get count of 32-byte chunks
        rlwinm  r4,r4,0,0x1F        // mask down to residual count (0..31)
        beq     7f                  // no chunks so done
        mtctr   r0
5:
        stvx    v0,0,r8             // store a 32-byte chunk
        stvx    v0,r5,r8
        addi    r8,r8,32
        bdnz    5b
7:
        mtspr   vrsave,r2           // restore caller's vrsave
        blr
        COMMPAGE_DESCRIPTOR(memset_g5,_COMM_PAGE_MEMSET_PATTERN,kCache128+k64Bit+kHasAltivec,0, \
                                kCommPageBoth+kPort32to64)
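
// Field meanings of the descriptor above (our reading of the
// COMMPAGE_DESCRIPTOR macro in machine/commpage.h): routine label, commpage
// address, CPU capability bits that must be set (128-byte cache line, 64-bit,
// Altivec), capability bits that must be clear (none), and flags -- install in
// both the 32- and 64-bit commpages, applying the 32-to-64-bit port described
// at the top of this file.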