/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>
        .text
        .align  2

/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
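/*
 * Illustration only (a hedged example built from the rules above, not part of
 * the original source): under the 64-bit port, a word compare such as
 *      cmplwi  cr1,r4,kBig
 * below executes as the doubleword compare
 *      cmpldi  cr1,r4,kBig
 * and a shift such as
 *      srwi    r0,r4,7
 * executes as
 *      srdi    r0,r4,7
 * No other rewriting is performed.
 */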
/* *********************
 * * M E M S E T _ G 5 *
 * *********************
 *
 * This is a subroutine called by Libc memset and memset_pattern for large nonzero
 * operands (zero operands are funneled into bzero.)  This version is for
 * 64-bit processors with a 128-byte cache line and Altivec.
 *
 * Registers at entry:
 *      r4 = count of bytes to store (must be >= 32)
 *      r8 = ptr to the 1st byte to store (16-byte aligned)
 *      r9 = ptr to 16-byte pattern to store (16-byte aligned)
 * When we return:
 *      r3 = not changed, since memset returns it
 *      r4 = bytes remaining to store (will be <32)
 *      r8 = ptr to next byte to store (still 16-byte aligned)
 *      r12 = not changed (holds return value for memset)
 */
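/*
 * A minimal caller-side sketch (assumed, not taken from Libc source): the
 * caller 16-byte aligns the destination in r8, enters with r4 >= 32 and the
 * 16-byte pattern at r9, then on return finishes the remaining r4 < 32 bytes
 * with scalar stores and returns the original buffer pointer saved in r3/r12.
 */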
#define kBig    (3*128)                 // big enough to warrant using dcbz (NB: must be >= 3*128)
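// (A hedged note, not in the original source: the 3*128 floor presumably
// guarantees at least two full cache lines remain after the up-to-112-byte
// alignment prologue, so the dcbz loop below, which runs "1-too-few times"
// and zeroes one line ahead, always has a line to look ahead into.)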
        .align  5                       // align inner loops
memset_g5:                              // void memset_g5(word *b, word *p, unsigned int len)
        cmplwi  cr1,r4,kBig             // big enough to warrant using dcbz?
        neg     r10,r8                  // start to align ptr
        mfspr   r2,vrsave               // we'll be using VRs
        andi.   r10,r10,0x70            // get #bytes to cache line align
        oris    r0,r2,0x8000            // we use vr0
        mtspr   vrsave,r0
        li      r5,16                   // get offsets for "stvx"
        lvx     v0,0,r9                 // load the pattern into v0
        li      r6,32
        blt     cr1,LShort              // not big enough to bother with dcbz
        li      r7,48

        // cache line align

        beq     2f                      // already aligned
1:
        subic.  r10,r10,16              // more to go?
        stvx    v0,0,r8
        addi    r8,r8,16
        subi    r4,r4,16
        bne     1b
        // Loop over cache lines.  This code uses a private protocol with the kernel:
        // when the kernel emulates an alignment exception on a DCBZ that occurs in the
        // commpage, it zeroes CR7.  We use this to detect the case where we are operating on
        // uncached memory, and do not use DCBZ again in this code.  We assume that either
        // all the operand is cacheable or none of it is, so we only check the first DCBZ.
2:
        cmpw    cr7,r3,r3               // set cr7_eq (kernel will clear if DCBZ faults)
        dcbzl   0,r8                    // zero first cache line (clearing cr7 if alignment exception)
        srwi    r0,r4,7                 // get #cache lines (>=2)
        rlwinm  r4,r4,0,0x7F            // mask down to residual count (0..127)
        bne--   cr7,LNoDcbz             // exit if we took alignment exception on the first DCBZ
        subic   r0,r0,1                 // loop 1-too-few times
        li      r11,128                 // set DCBZ look-ahead
        mtctr   r0
        b       3f                      // use loop that DCBZs
        // Loop over cache lines.  We DCBZ one line ahead, which is a little faster.

        .align  5
3:
        dcbzl   r11,r8                  // zero one line ahead
        addi    r10,r8,64
        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r7,r8
        addi    r8,r8,128
        stvx    v0,0,r10
        stvx    v0,r5,r10
        stvx    v0,r6,r10
        stvx    v0,r7,r10
        bdnz++  3b

        li      r0,1                    // we've already DCBZ'd the last line
LNoDcbz:                                // r0: loop count
        mtctr   r0
        // Loop which does not DCBZ.  Normally this is only used for the last cache line,
        // because we've already zeroed it.

4:
        addi    r10,r8,64
        stvx    v0,0,r8
        stvx    v0,r5,r8
        stvx    v0,r6,r8
        stvx    v0,r7,r8
        addi    r8,r8,128
        stvx    v0,0,r10
        stvx    v0,r5,r10
        stvx    v0,r6,r10
        stvx    v0,r7,r10
        bdnz--  4b                      // optimize for the cacheable case
        // loop over 32-byte chunks
LShort:
        srwi.   r0,r4,5                 // get count of 32-byte chunks
        rlwinm  r4,r4,0,0x1F            // mask down to residual count (0..31)
        beq     7f                      // no chunks so done
        mtctr   r0
6:
        stvx    v0,0,r8
        stvx    v0,r5,r8
        addi    r8,r8,32
        bdnz    6b
7:
        mtspr   vrsave,r2               // restore caller's vrsave
        blr
        COMMPAGE_DESCRIPTOR(memset_g5,_COMM_PAGE_MEMSET_PATTERN,kCache128+k64Bit+kHasAltivec,0, \
                                kCommPageBoth+kPort32to64)
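// (A hedged reading, not part of the original source: the capability bits
// kCache128+k64Bit+kHasAltivec restrict this commpage routine to processors
// with 128-byte cache lines, 64-bit support, and Altivec, while kPort32to64
// asks the kernel to apply the 32-to-64-bit opcode rewriting described in
// the WARNING comment above.)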