[apple/xnu.git] / osfmk / arm64 / strnlen.s

/*
 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 * This file implements the following function for the arm64 architecture:
 *
 *  size_t strnlen(const char *string, size_t maxlen);
 *
 * The strnlen function returns either strlen(string) or maxlen, whichever
 * is amller, without reading beyond the first maxlen characters of string.
 */

#include <arm64/asm.h>

.globl _strlen
.globl _strnlen

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

.macro EstablishFrame
	ARM64_STACK_PROLOG
	stp       fp, lr, [sp, #-16]!
	mov       fp,      sp
.endm

.macro ClearFrameAndReturn
	ldp       fp, lr, [sp], #16
	ARM64_STACK_EPILOG
.endm

/*****************************************************************************
 *  Constants                                                                *
 *****************************************************************************/

.text
.align 5
L_masks:
.quad 0x0706050403020100, 0x0f0e0d0c0b0a0908
.quad 0x0000000000000000, 0x0000000000000000

/*****************************************************************************
 *  strnlen entrypoint                                                       *
 *****************************************************************************/

_strnlen:
//	If n == 0, return NULL without loading any data from s.  If n is so large
//	that it exceeds the size of any buffer that can be allocted, jump into a
//	simpler implementation that omits all length checks.  This is both faster
//	and lets us avoid some messy edgecases in the mainline.
	tst       x1,      x1
	b.mi      _strlen
	b.eq      L_maxlenIsZero
	EstablishFrame
//	Load the 16-byte aligned vector containing the start of the string.
	and       x2,      x0, #-16
	ldr       q0,     [x2]
//	Load a vector {0,1,2, ... ,15} for use in finding the index of the NUL
//	byte once we identify one.  We don't use this vector until the very end
//	of the routine; it simply falls out naturally to load it now.
	adr       x3,          L_masks
	ldr       q2,     [x3],#16
//	The aligned vector that we loaded to q0 contains the start of the string,
//	but if the string was not originally aligned, it also contains bytes
//	which preceed the start of the string, and which may cause false positives
//	when we search for the terminating NUL.  We generate a mask to OR into the
//	vector using an unaligned load to prevent this.  The mask has non-zero
//	values only in those bytes which correspond to bytes preceeding the start
//	of the string in the aligned vector load.
	and       x4,      x0, #0xf
	sub       x3,      x3, x4
	ldr       q1,     [x3]
	orr.16b   v0,      v0, v1
//	Adjust maxlen to account for bytes which preceed the start of the string,
//	and jump into the main scanning loop.
	add       x1,      x1, x4
	b         1f

//	Main loop.  Identical to strlen, except that we also need to check that we
//	don't read more than maxlen bytes.  To that end, we decrement maxlen by 16
//	on each iteration, and exit the loop if the result is zero or negative.
.align 4
0:	ldr       q0,     [x2, #16]!
1:  uminv.16b b1,      v0
	fmov      w3,      s1
	cbz       w3,      L_foundNUL
	subs      x1,      x1, #16
	b.hi      0b

//	We exhausted maxlen bytes without finding a terminating NUL character, so
//  we need to return maxlen.
	sub       x0,      x2, x0
	add       x1,      x1, #16
	add       x0,      x0, x1
	ClearFrameAndReturn

L_maxlenIsZero:
	mov       x0,      #0
	ret                         // No stack frame, so don't clear it.

L_foundNUL:
//	Compute the index of the NUL byte, and check if it occurs before maxlen
//	bytes into the vector.  If not, return maxlen.  Otherwise, return the
//	length of the string.
	eor.16b   v1,      v1, v1
	cmhi.16b  v0,      v0, v1
	orr.16b   v0,      v0, v2
	uminv.16b b1,      v0
	fmov      w3,      s1      // index of NUL byte in vector
	sub       x0,      x2, x0  // index of vector in string
	cmp       x1,      x3      // if NUL occurs before maxlen bytes
	csel      x1,      x1, x3, cc // return strlen, else maxlen
	add       x0,      x0, x1
	ClearFrameAndReturn

/*****************************************************************************
 *  strlen entrypoint                                                        *
 *****************************************************************************/

.align 4
_strlen:
	EstablishFrame
//	Load the 16-byte aligned vector containing the start of the string.
	and       x1,      x0, #-16
	ldr       q0,     [x1]
//	Load a vector {0,1,2, ... ,15} for use in finding the index of the NUL
//	byte once we identify one.  We don't use this vector until the very end
//	of the routine; it simply falls out naturally to load it now.
	adr       x3,          L_masks
	ldr       q2,     [x3],#16
//	The aligned vector that we loaded to q0 contains the start of the string,
//	but if the string was not originally aligned, it also contains bytes
//	which preceed the start of the string, and which may cause false positives
//	when we search for the terminating NUL.  We generate a mask to OR into the
//	vector using an unaligned load to prevent this.  The mask has non-zero
//	values only in those bytes which correspond to bytes preceeding the start
//	of the string in the aligned vector load.
	and       x2,      x0, #0xf
	sub       x3,      x3, x2
	ldr       q1,     [x3]
	orr.16b   v0,      v0, v1
	b         1f

//	Main loop.  On each iteration we do the following:
//
//		q0 <-- next 16 aligned bytes of string
//		b1 <-- unsigned minimum byte in q0
//      if (b1 != 0) continue
//
//	Thus, we continue the loop until the 16 bytes we load contain a zero byte.
.align 4
0:	ldr       q0,     [x1, #16]!
1:	uminv.16b b1,      v0
	fmov      w2,      s1 // umov.b would be more natural, but requries 2 µops.
	cbnz      w2,      0b

//	A zero byte has been found.  The following registers contain values that
//	we need to compute the string's length:
//
//		x0		pointer to start of string
//		x1		pointer to vector containing terminating NUL byte
//		v0		vector containing terminating NUL byte
//		v2      {0, 1, 2, ... , 15}
//
//	We compute the index of the terminating NUL byte in the string (which is
//	precisely the length of the string) as follows:
//
//		vec <-- mask(v0 != 0) | v2
//		index <-- x1 - x0 + unsignedMinimum(vec)
	eor.16b   v1,      v1, v1
	cmhi.16b  v0,      v0, v1
	orr.16b   v0,      v0, v2
	uminv.16b b1,      v0
	fmov      w2,      s1
	sub       x0,      x1, x0
	add       x0,      x0, x2
	ClearFrameAndReturn
Commit	Line	Data
5ba3f43e A	1	/*
	2	* Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*
	28	* This file implements the following function for the arm64 architecture:
	29	*
	30	* size_t strnlen(const char *string, size_t maxlen);
	31	*
	32	* The strnlen function returns either strlen(string) or maxlen, whichever
	33	* is amller, without reading beyond the first maxlen characters of string.
	34	*/
	35
d9a64523 A	36	#include <arm64/asm.h>
d9a64523 A	37
5ba3f43e A	38	.globl _strlen
	39	.globl _strnlen
	40
	41	/*****************************************************************************
	42	* Macros *
	43	*****************************************************************************/
	44
	45	.macro EstablishFrame
d9a64523	46	ARM64_STACK_PROLOG
5ba3f43e A	47	stp fp, lr, [sp, #-16]!
	48	mov fp, sp
	49	.endm
	50
	51	.macro ClearFrameAndReturn
	52	ldp fp, lr, [sp], #16
d9a64523	53	ARM64_STACK_EPILOG
5ba3f43e A	54	.endm
	55
	56	/*****************************************************************************
	57	* Constants *
	58	*****************************************************************************/
	59
	60	.text
	61	.align 5
	62	L_masks:
	63	.quad 0x0706050403020100, 0x0f0e0d0c0b0a0908
	64	.quad 0x0000000000000000, 0x0000000000000000
	65
	66	/*****************************************************************************
	67	* strnlen entrypoint *
	68	*****************************************************************************/
	69
	70	_strnlen:
	71	// If n == 0, return NULL without loading any data from s. If n is so large
	72	// that it exceeds the size of any buffer that can be allocted, jump into a
	73	// simpler implementation that omits all length checks. This is both faster
	74	// and lets us avoid some messy edgecases in the mainline.
	75	tst x1, x1
	76	b.mi _strlen
	77	b.eq L_maxlenIsZero
	78	EstablishFrame
	79	// Load the 16-byte aligned vector containing the start of the string.
	80	and x2, x0, #-16
	81	ldr q0, [x2]
	82	// Load a vector {0,1,2, ... ,15} for use in finding the index of the NUL
	83	// byte once we identify one. We don't use this vector until the very end
	84	// of the routine; it simply falls out naturally to load it now.
	85	adr x3, L_masks
	86	ldr q2, [x3],#16
	87	// The aligned vector that we loaded to q0 contains the start of the string,
	88	// but if the string was not originally aligned, it also contains bytes
	89	// which preceed the start of the string, and which may cause false positives
	90	// when we search for the terminating NUL. We generate a mask to OR into the
	91	// vector using an unaligned load to prevent this. The mask has non-zero
	92	// values only in those bytes which correspond to bytes preceeding the start
	93	// of the string in the aligned vector load.
	94	and x4, x0, #0xf
	95	sub x3, x3, x4
	96	ldr q1, [x3]
	97	orr.16b v0, v0, v1
	98	// Adjust maxlen to account for bytes which preceed the start of the string,
	99	// and jump into the main scanning loop.
	100	add x1, x1, x4
	101	b 1f
	102
	103	// Main loop. Identical to strlen, except that we also need to check that we
	104	// don't read more than maxlen bytes. To that end, we decrement maxlen by 16
	105	// on each iteration, and exit the loop if the result is zero or negative.
	106	.align 4
	107	0: ldr q0, [x2, #16]!
	108	1: uminv.16b b1, v0
	109	fmov w3, s1
	110	cbz w3, L_foundNUL
	111	subs x1, x1, #16
	112	b.hi 0b
	113
	114	// We exhausted maxlen bytes without finding a terminating NUL character, so
	115	// we need to return maxlen.
	116	sub x0, x2, x0
	117	add x1, x1, #16
118	add x0, x0, x1
119	ClearFrameAndReturn
120
121	L_maxlenIsZero:
d9a64523	122	mov x0, #0
5ba3f43e A	123	ret // No stack frame, so don't clear it.
	124
	125	L_foundNUL:
	126	// Compute the index of the NUL byte, and check if it occurs before maxlen
	127	// bytes into the vector. If not, return maxlen. Otherwise, return the
	128	// length of the string.
	129	eor.16b v1, v1, v1
	130	cmhi.16b v0, v0, v1
	131	orr.16b v0, v0, v2
	132	uminv.16b b1, v0
	133	fmov w3, s1 // index of NUL byte in vector
	134	sub x0, x2, x0 // index of vector in string
	135	cmp x1, x3 // if NUL occurs before maxlen bytes
	136	csel x1, x1, x3, cc // return strlen, else maxlen
	137	add x0, x0, x1
	138	ClearFrameAndReturn
	139
	140	/*****************************************************************************
	141	* strlen entrypoint *
	142	*****************************************************************************/
	143
	144	.align 4
	145	_strlen:
	146	EstablishFrame
	147	// Load the 16-byte aligned vector containing the start of the string.
	148	and x1, x0, #-16
	149	ldr q0, [x1]
	150	// Load a vector {0,1,2, ... ,15} for use in finding the index of the NUL
	151	// byte once we identify one. We don't use this vector until the very end
	152	// of the routine; it simply falls out naturally to load it now.
	153	adr x3, L_masks
	154	ldr q2, [x3],#16
	155	// The aligned vector that we loaded to q0 contains the start of the string,
	156	// but if the string was not originally aligned, it also contains bytes
	157	// which preceed the start of the string, and which may cause false positives
	158	// when we search for the terminating NUL. We generate a mask to OR into the
	159	// vector using an unaligned load to prevent this. The mask has non-zero
	160	// values only in those bytes which correspond to bytes preceeding the start
	161	// of the string in the aligned vector load.
	162	and x2, x0, #0xf
	163	sub x3, x3, x2
	164	ldr q1, [x3]
	165	orr.16b v0, v0, v1
	166	b 1f
	167
	168	// Main loop. On each iteration we do the following:
	169	//
	170	// q0 <-- next 16 aligned bytes of string
	171	// b1 <-- unsigned minimum byte in q0
	172	// if (b1 != 0) continue
	173	//
	174	// Thus, we continue the loop until the 16 bytes we load contain a zero byte.
	175	.align 4
	176	0: ldr q0, [x1, #16]!
	177	1: uminv.16b b1, v0
	178	fmov w2, s1 // umov.b would be more natural, but requries 2 µops.
	179	cbnz w2, 0b
	180
	181	// A zero byte has been found. The following registers contain values that
	182	// we need to compute the string's length:
	183	//
	184	// x0 pointer to start of string
	185	// x1 pointer to vector containing terminating NUL byte
	186	// v0 vector containing terminating NUL byte
187	// v2 {0, 1, 2, ... , 15}
188	//
189	// We compute the index of the terminating NUL byte in the string (which is
190	// precisely the length of the string) as follows:
191	//
192	// vec <-- mask(v0 != 0) \| v2
193	// index <-- x1 - x0 + unsignedMinimum(vec)
194	eor.16b v1, v1, v1
195	cmhi.16b v0, v0, v1
196	orr.16b v0, v0, v2
197	uminv.16b b1, v0
198	fmov w2, s1
199	sub x0, x1, x0
200	add x0, x0, x2
201	ClearFrameAndReturn