]> git.saurik.com Git - apple/xnu.git/blob - osfmk/ppc/commpage/bzero_128.s
xnu-792.25.20.tar.gz
[apple/xnu.git] / osfmk / ppc / commpage / bzero_128.s
1 /*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * The contents of this file constitute Original Code as defined in and
7 * are subject to the Apple Public Source License Version 1.1 (the
8 * "License"). You may not use this file except in compliance with the
9 * License. Please obtain a copy of the License at
10 * http://www.apple.com/publicsource and read it before using this file.
11 *
12 * This Original Code and all software distributed under the License are
13 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
17 * License for the specific language governing rights and limitations
18 * under the License.
19 *
20 * @APPLE_LICENSE_HEADER_END@
21 */
22
23 #define ASSEMBLER
24 #include <sys/appleapiopts.h>
25 #include <ppc/asm.h>
26 #include <machine/cpu_capabilities.h>
27 #include <machine/commpage.h>
28
29 .text
30 .align 2
31 /*
32 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
33 * to 64-bit mode for use in the 64-bit commpage. This "port" consists of the following
34 * simple transformations:
35 * - all word compares are changed to doubleword
36 * - all "srwi[.]" opcodes are changed to "srdi[.]"
37 * Nothing else is done. For this to work, the following rules must be
38 * carefully followed:
39 * - do not use carry or overflow
40 * - only use record mode if you are sure the results are mode-invariant
41 * for example, all "andi." and almost all "rlwinm." are fine
42 * - do not use "slwi", "slw", or "srw"
43 * An imaginative programmer could break the porting model in other ways, but the above
44 * are the most likely problem areas. It is perhaps surprising how well in practice
45 * this simple method works.
46 */
47
48 // **********************
49 // * B Z E R O _ 1 2 8 *
50 // **********************
51 //
52 // For 64-bit processors with a 128-byte cache line.
53 //
54 // Register use:
55 // r0 = zero
56 // r3 = original ptr, not changed since memset returns it
57 // r4 = count of bytes to set
58 // r9 = working operand ptr
59 // WARNING: We do not touch r2 and r10-r12, which some callers depend on.
60
61 .align 5
62 bzero_128: // void bzero(void *b, size_t len);
63 cmplwi cr7,r4,128 // too short for DCBZ128?
64 li r0,0 // get a 0
65 neg r5,r3 // start to compute #bytes to align (low 7 bits masked below)
66 mr r9,r3 // make copy of operand ptr (can't change r3)
67 blt cr7,Ltail // length < 128, too short for DCBZ
68
69 // At least 128 bytes long, so compute alignment and #cache blocks.
70
71 andi. r5,r5,0x7F // r5 <- #bytes to 128-byte align
72 sub r4,r4,r5 // adjust length
73 srwi r8,r4,7 // r8 <- 128-byte chunks
74 rlwinm r4,r4,0,0x7F // mask length down to remaining bytes
75 mtctr r8 // set up loop count
76 beq Ldcbz // skip if already aligned (r8>=1 here, since length>=128)
77
78 // 128-byte align by storing 1..64-byte pieces, selected by the bits of r5 (now in cr6/cr7).
79
80 mtcrf 0x01,r5 // start to move #bytes to align to cr6 and cr7
81 cmpwi cr1,r8,0 // any 128-byte cache lines to 0?
82 mtcrf 0x02,r5
83
84 bf 31,1f // byte?
85 stb r0,0(r9)
86 addi r9,r9,1
87 1:
88 bf 30,2f // halfword?
89 sth r0,0(r9)
90 addi r9,r9,2
91 2:
92 bf 29,3f // word?
93 stw r0,0(r9)
94 addi r9,r9,4
95 3:
96 bf 28,4f // doubleword?
97 std r0,0(r9)
98 addi r9,r9,8
99 4:
100 bf 27,5f // quadword?
101 std r0,0(r9)
102 std r0,8(r9)
103 addi r9,r9,16
104 5:
105 bf 26,6f // 32-byte chunk?
106 std r0,0(r9)
107 std r0,8(r9)
108 std r0,16(r9)
109 std r0,24(r9)
110 addi r9,r9,32
111 6:
112 bf 25,7f // 64-byte chunk?
113 std r0,0(r9)
114 std r0,8(r9)
115 std r0,16(r9)
116 std r0,24(r9)
117 std r0,32(r9)
118 std r0,40(r9)
119 std r0,48(r9)
120 std r0,56(r9)
121 addi r9,r9,64
122 7:
123 beq cr1,Ltail // no chunks to dcbz128
124
125 // Loop doing 128-byte version of DCBZ instruction.
126 // NB: if the memory is cache-inhibited, the kernel will clear cr7
127 // when it emulates the alignment exception. Eventually, we may want
128 // to check for this case.
129
130 Ldcbz:
131 dcbz128 0,r9 // zero an entire 128-byte cache line
132 addi r9,r9,128
133 bdnz Ldcbz
134
135 // Store trailing bytes (0..127 remain at this point).
136 // r0 = 0
137 // r4 = count
138 // r9 = ptr
139
140 Ltail:
141 srwi. r5,r4,4 // r5 <- 16-byte chunks to 0
142 mtcrf 0x01,r4 // remaining byte count to cr7
143 mtctr r5
144 beq 2f // skip if no 16-byte chunks
145 1: // loop over 16-byte chunks
146 std r0,0(r9)
147 std r0,8(r9)
148 addi r9,r9,16
149 bdnz 1b
150 2:
151 bf 28,4f // 8-byte chunk?
152 std r0,0(r9)
153 addi r9,r9,8
154 4:
155 bf 29,5f // word?
156 stw r0,0(r9)
157 addi r9,r9,4
158 5:
159 bf 30,6f // halfword?
160 sth r0,0(r9)
161 addi r9,r9,2
162 6:
163 bflr 31 // byte?
164 stb r0,0(r9)
165 blr
166
// Commpage registration: install this routine at _COMM_PAGE_BZERO on processors
// with 128-byte cache lines that are 64-bit capable (kCache128+k64Bit required,
// nothing prohibited).  kPort32to64 requests the kernel's 32->64-bit porting
// pass described at the top of this file.  NOTE(review): kCommPageMTCRF
// presumably flags the mtcrf usage above and kCommPageBoth installation in both
// 32- and 64-bit commpages -- verify against machine/commpage.h.
167 COMMPAGE_DESCRIPTOR(bzero_128,_COMM_PAGE_BZERO,kCache128+k64Bit,0, \
168 kCommPageMTCRF+kCommPageBoth+kPort32to64)