]> git.saurik.com Git - apple/libc.git/blame - arm/string/bzero_CortexA8.s
Libc-825.40.1.tar.gz
[apple/libc.git] / arm / string / bzero_CortexA8.s
CommitLineData
7b00c0c4
A
1/*
2 * Copyright (c) 2009 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
ad3c9f2a
A
23
24#include <arm/arch.h>
25#if defined _ARM_ARCH_7 && !defined VARIANT_DYLD
26
27/**********************************************************************
28 * Cortex-A8 implementation *
29 **********************************************************************/
30
31// Cortex-A8 implementations of memset( ) and bzero( ). Main loop is 64-byte
32// NEON stores, unless the buffer length is > 1k. Beyond that point, there is
33// little to no speed advantage with NEON (and a slight regression in some
34// measured cases), so we switch to the GPRs.
35//
36// The crossover point should be reevaluated for future architectures.
37//
38// -- Stephen Canon, August 2009
39
40.text
41.syntax unified
42.code 16                              // Thumb-2: mixed 16/32-bit encodings (unified syntax)
43
44// void bzero(void * destination,
45// size_t length);
46//
47// zeros out a buffer length bytes long, beginning at the address destination.
//
// Both the underscore-prefixed alias and the plain symbol are exported; the
// routine has no body of its own — it rearranges arguments to match the
// memset(dest, value, length) register convention and falls through.
48.thumb_func ___bzero$VARIANT$CortexA8
49.globl ___bzero$VARIANT$CortexA8
50.thumb_func _bzero$VARIANT$CortexA8
51.globl _bzero$VARIANT$CortexA8
52.align 2
53___bzero$VARIANT$CortexA8:
54_bzero$VARIANT$CortexA8:
55 mov r2, r1 // match the API to memset(dest, 0, length): length moves to r2
56 eor r1, r1 // value = 0 in r1; fall through into memset below
57
58// void *memset(void * destination,
59// int value, size_t n);
60//
61// writes value converted to an unsigned char to n successive bytes, beginning
62// at destination.
63
64// Notes on register usage:
65//
66// Throughout this function, registers have nearly constant usage; the pattern
67// is:
68//
69// r0 holds the original destination pointer, unmodified. This value
70// must be returned by the routine, so it is easiest to just leave it
71// in place.
72// r1 holds the value that is being copied into the buffer, in some stage
73// of splattedness. The low byte is guaranteed to always have the value
74// but the higher bytes may or may not contain copies of it.
75// r2 holds the length minus some offset, where the offset is always the
76// number of bytes that the current loop stores per iteration.
77// r3-r6,r8,r10,r11 are used with stmia, and will only ever contain splatted
78// copies of the value to be stored.
79// ip holds a pointer to the lowest byte in the array that has not yet been
80// set to hold value.
81// q0 and q1 hold splatted copies of the value in the vector path, and are
82// otherwise unused.
83
// void *memset(void *dest, int value, size_t n) — Cortex-A8 tuned.
// Strategy: byte-store to 8-byte alignment, NEON doubleword stores to 64-byte
// alignment, then a 64-bytes-per-iteration NEON loop; buffers >= 1 KB divert
// to the STMIA (GPR) loop at L_useSTMIA. r0 is preserved untouched so it can
// be returned; ip tracks the next unwritten byte; r2 tracks length minus the
// current loop's store size (so "blt/bge" test against 0, not the length).
84.thumb_func _memset$VARIANT$CortexA8
85.globl _memset$VARIANT$CortexA8
86.align 2
87_memset$VARIANT$CortexA8:
88 mov ip, r0 // copy destination pointer.
89 subs r2, #0x8 // if length - 8 is negative (i.e. length
90 and r1, #0xff // is less than 8), jump to cleanup path.
// (the AND above also truncates value to an unsigned char, per memset's contract)
91 blt L_scalarCleanup //
92
93 tst ip, #0x7 // if the destination is doubleword
94 beq L_vectorCopy // aligned, jump to fast path.
95
960: strb r1, [ip], #1 // store one byte at a time until
97 sub r2, #1 // destination pointer is 8 byte aligned.
98 tst ip, #7 //
99 bne 0b //
100
101 cmp r2, #0x0 // if length - 8 is negative,
102 blt L_scalarCleanup // jump to the cleanup code
103
104L_vectorCopy:
105 vdup.8 q0, r1 // splat the byte to be stored across
106 subs r2, #0x38 // q0 and q1, and check if length - 64
107 vmov q1, q0 // is negative; if so, jump to the
108 blt L_vectorCleanup // cleanup code.
109
110 tst ip, #0x38 // if the destination is cacheline
111 beq L_cachelineAligned // aligned, jump to the fast path.
112
1130: vst1.64 {d0}, [ip, :64]! // store one double word at a time until
114 sub r2, #8 // the destination is 64-byte aligned
115 tst ip, #0x38 //
116 bne 0b
117
118 cmp r2, #0x0 // if length - 64 is negative,
119 blt L_vectorCleanup // jump to the cleanup code
120
121L_cachelineAligned:
122 cmp r2, #0x3c0 // if length >= 1024 (r2 = length-64, 0x3c0 = 960)
123 bge L_useSTMIA // we use stmia instead
124
125.align 4 // main loop: 64 bytes per iteration
1260: vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes
127 subs r2, #0x40 // decrement length by 64
128 vst1.64 {q0,q1}, [ip, :256]! // store 32 bytes
129 bge 0b // if length - 64 >= 0, continue
130
131L_vectorCleanup:
132 adds r2, #0x38 // if (length - 8) < 0, goto scalar cleanup
133 blt L_scalarCleanup //
134
1350: subs r2, #8 // store one double word at a time until
136 vst1.64 {d0}, [ip, :64]! // (length - 8) < 0.
137 bge 0b
138
139L_scalarCleanup:
140 adds r2, #8 // restore length
141 beq 1f // early out if zero.
142
1430: strb r1, [ip], #1 // store one byte at a time until length
144 subs r2, #1 // is zero.
145 bne 0b //
1461: bx lr // return (r0 still holds the original dest pointer).
147
148// STMIA loop for large buffers
149//
150// For stores larger than 1024 bytes, we use STMIA because we can't get enough
151// of a speedup from NEON to offset the higher power draw of the NEON unit.
152//
153// This crossover should be reevaluated on future architectures.
154//
155// We avoid using r7 and r9 even though it's not strictly necessary.
// (r7 is the Thumb frame pointer and r9 is reserved on some Apple platforms.)
//
// On entry: r1 = value byte, r2 = length - 64, ip = 64-byte-aligned cursor.
// Exits by branching back to L_vectorCleanup with r2 = remainder - 64.
156
157L_useSTMIA:
158 push {r4,r5,r6,r8,r10,r11}       // callee-saved GPRs used as store sources
159 orr r1, r1, r1, lsl #8           // splat byte into both halves of low 16 bits
160 orr r1, r1, r1, lsl #16          // ... then across all 4 bytes of r1
161 mov r3, r1                       // replicate the pattern into seven more
162 mov r4, r1                       // registers so each STMIA writes 32 bytes
163 mov r5, r1
164 mov r6, r1
165 mov r8, r1
166 mov r10, r1
167 mov r11, r1
168.align 4
1690: stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11}  // 32 bytes
170 subs r2, #0x40                   // 64 bytes per iteration, like the NEON loop
171 stmia ip!, {r1,r3,r4,r5,r6,r8,r10,r11}  // 32 bytes
172 bge 0b
173 pop {r4,r5,r6,r8,r10,r11}        // restore callee-saved registers
174 b L_vectorCleanup                // handle the < 64-byte tail
175
176#endif // defined _ARM_ARCH_7 && !defined VARIANT_DYLD
177