]> git.saurik.com Git - apple/xnu.git/blame - osfmk/ppc/commpage/memset_g4.s
xnu-792.6.56.tar.gz
[apple/xnu.git] / osfmk / ppc / commpage / memset_g4.s
CommitLineData
91447636
A
1/*
2 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
ff6e181a
A
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
91447636 12 *
ff6e181a
A
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
91447636
A
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
ff6e181a
A
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
91447636
A
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24#define ASSEMBLER
25#include <sys/appleapiopts.h>
26#include <ppc/asm.h>
27#include <machine/cpu_capabilities.h>
28#include <machine/commpage.h>
29
30 .text
31 .align 2
32
33
34/* *********************
35 * * M E M S E T _ G 4 *
36 * *********************
37 *
38 * This is a subroutine called by Libc memset and memset_pattern for large nonzero
39 * operands (zero operands are funneled into bzero). This version is for
40 * 32-bit processors with a 32-byte cache line and Altivec.
41 *
42 * Registers at entry:
43 * r4 = count of bytes to store (must be >= 32)
44 * r8 = ptr to the 1st byte to store (16-byte aligned)
45 * r9 = ptr to 16-byte pattern to store (16-byte aligned)
46 * When we return:
47 * r3 = not changed, since memset returns it
48 * r4 = bytes remaining to store (will be <32)
49 * r7 = not changed
50 * r8 = ptr to next byte to store (still 16-byte aligned)
51 * r12 = not changed (holds return value for memset)
52 */
53
54#define kBig (3*64) // big enough to warrant using dcba (NB: must be >= 3*64)
// Why kBig has a lower bound: the long path may first store 16 bytes to reach
// 32-byte alignment and must still have at least two 64-byte chunks left. The
// inner loop runs (chunks - 1) times and its body executes before CTR is
// tested by bdnz, so a CTR of 0 on entry would wrap instead of terminating.
55
56 .align 4
// memset_g4: fill r4 bytes at r8 with the 16-byte pattern at r9, in 64-byte
// and then 32-byte chunks, leaving a residual of <32 bytes for the caller.
// Registers written below: r0 (scratch / loop counts), r2 (caller's vrsave),
// r4, r5, r6, r8, r9, r10, r11, v0, cr0, cr1, CTR and XER[CA] (via subic).
// On the long path r9 is reused as the constant 48, so the pattern pointer
// is lost; on the short path r9 is left as passed in.
57memset_g4:
58 cmplwi cr1,r4,kBig // big enough to warrant using dcba? (result kept in cr1)
59 mfspr r2,vrsave // we'll be using VRs; r2 holds caller's vrsave until exit
60 oris r0,r2,0x8000 // we use vr0 (set the top bit of the vrsave mask)
61 andi. r5,r8,0x10 // is ptr 32-byte aligned? (only cr0 is used; r5 is overwritten below)
62 mtspr vrsave,r0
63 li r5,16 // get offsets for "stvx"
64 lvx v0,0,r9 // load the pattern into v0
65 li r6,32
66 blt cr1,LShort // not big enough to bother with dcba
67 li r9,48 // r9 no longer points at the pattern; it is now an stvx offset
68
69 // cache line align
70
71 beq 2f // already aligned (tests cr0 from the andi. above; nothing in between writes cr0)
72 stvx v0,0,r8 // store another 16 bytes to align
73 addi r8,r8,16
74 subi r4,r4,16
75
76 // Set up for inner loop.
772:
78 srwi r0,r4,6 // get count of 64-byte chunks (>=2)
79 dcba 0,r8 // pre-allocate first cache line (possibly nop'd)
80 rlwinm r4,r4,0,0x3F // mask down to residual count (0..63)
81 subic r0,r0,1 // loop 1-too-few times (subic, not subi: addi would read r0 as literal 0)
82 li r10,64 // get offsets to DCBA one chunk ahead
83 li r11,64+32 // second cache line of the next chunk
84 mtctr r0 // CTR = chunks - 1 (>= 1)
85 dcba r6,r8 // pre-allocate 2nd cache line (possibly nop'd)
86 b 3f // enter DCBA loop
87
88 // Loop over 64-byte chunks. We DCBA one chunk ahead, which is a little faster.
89 // Note that some G4s do not benefit from the DCBAs. We nop them in that case.
90
91 .align 4
923:
93 dcba r10,r8 // allocate one 64-byte chunk ahead (possibly nop'd)
94 dcba r11,r8
95 stvx v0,0,r8 // store the current chunk: offsets 0, 16, 32, 48
96 stvx v0,r5,r8
97 stvx v0,r6,r8
98 stvx v0,r9,r8
99 addi r8,r8,64
100 bdnz+ 3b
101
102 // Last chunk, which we've already DCBAd.
// (The final loop iteration allocated it, so no dcba reaches past the last
// full 64-byte chunk.)
103
104 stvx v0,0,r8
105 stvx v0,r5,r8
106 stvx v0,r6,r8
107 stvx v0,r9,r8
108 addi r8,r8,64
109
110 // loop over 32-byte chunks at end
// Reached directly from entry for counts below kBig, or by fall-through with
// the 0..63 residual from the long path. Only r5 (16) is needed as an offset
// here, so it does not matter that r9 was never set to 48 on the short path.
111LShort:
112 srwi. r0,r4,5 // get count of 32-byte chunks
113 rlwinm r4,r4,0,0x1F // mask down to residual count (0..31)
114 beq 7f // no chunks so done
115 mtctr r0
1166:
117 stvx v0,0,r8
118 stvx v0,r5,r8
119 addi r8,r8,32
120 bdnz 6b
1217:
122 mtspr vrsave,r2 // restore caller's vrsave
123 blr
124
125
// Commpage descriptor for memset_g4. The macro is not defined in this file
// (machine/commpage.h is included above). NOTE(review): the arguments appear
// to be: routine label, commpage address slot, required capability bits
// (32-byte cache line + Altivec, matching the header comment), disallowed
// bits (none), and flags. Presumably kCommPageDCBA is what allows the dcba
// instructions to be nop'd, as the "possibly nop'd" comments suggest -
// confirm against the macro definition.
126 COMMPAGE_DESCRIPTOR(memset_g4,_COMM_PAGE_MEMSET_PATTERN,kCache32+kHasAltivec,0, \
127 kCommPageDCBA+kCommPage32)