[apple/xnu.git] / osfmk / arm64 / bcopy.s

/*
 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 *  This file implements the following functions for the arm64 architecture.
 *
 *  void bcopy(const void * source,
 *             void * destination,
 *             size_t length);
 *
 *  void *memmove(void * destination,
 *                const void * source,
 *                size_t n);
 *
 *  void *memcpy(void * restrict destination,
 *               const void * restrict source,
 *               size_t n);
 *
 * All copy n successive bytes from source to destination.  Memmove and memcpy
 * return destination, whereas bcopy has no return value.  Copying takes place
 * as if it were through a temporary buffer -- after return destination
 * contains exactly the bytes from source, even if the buffers overlap (this is
 * not required of memcpy by the C standard; its behavior is undefined if the
 * buffers overlap, but we are holding ourselves to the historical behavior of
 * this function on MacOS).
 */

#include "asm.h"

.globl _bcopy
.globl _ovbcopy
.globl _memcpy
.globl _memmove

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

#define kSmallCopy 64

/*****************************************************************************
 *  Entrypoints                                                              *
 *****************************************************************************/

.text
.align 5
_bcopy:
_ovbcopy:
//	void bcopy(const void *source, void *destination, size_t length)
//
//	bcopy and ovbcopy share this entry point.  On entry x0 = source,
//	x1 = destination and x2 = length.  memcpy/memmove take the two pointers in
//	the opposite order, so translate bcopy into memcpy by swapping the first
//	and second arguments (x3 is the scratch register) and fall through into
//	_memcpy/_memmove below.  x2 is passed along unchanged.
	mov     x3,      x0
	mov     x0,      x1
	mov     x1,      x3

.align 4
_memcpy:
_memmove:
//	memcpy and memmove share one implementation with memmove semantics.
//	On entry:
//
//	  x0     destination pointer
//	  x1     source pointer
//	  x2     number of bytes to copy
//
//	No instruction after this point writes x0, so it still holds the
//	destination pointer when the function returns.
//
//	Copying in ascending address order is preferred.  The one situation in
//	which that is not safe is when the start of the destination buffer lies
//	inside the source buffer: ascending stores would then overwrite source
//	bytes that have not been read yet, so the copy has to run in descending
//	address order instead.  That situation is detected with the test
//
//	    destination - source < length    (unsigned compare)
//
//	When source is above destination the subtraction wraps around.  The
//	wrapped value can only be below length if the buffers do not overlap at
//	all, so such false positives merely pick the descending copy in a case
//	where either direction gives the right answer.
	ARM64_STACK_PROLOG
	PUSH_FRAME
	sub     x3,      x0, x1  // x3 = destination - source; L_reverse tests it
	cmp     x3,      x2
	b.lo    L_reverse        // unsigned lower: descending copy required
	cmp     x2,      #(kSmallCopy)
	mov     x3,      x0      // forward paths advance x3 and leave x0 alone
	b.lo    L_forwardSmallCopy
//	length >= kSmallCopy: fall through into the forward large copy.

/*****************************************************************************
 *  Forward large copy                                                       *
 *****************************************************************************/

//	Reached by fall-through from the entry point when length >= kSmallCopy,
//	with x0 = x3 = dst, x1 = src and x2 = length.
//
//	Load the first 32 bytes from src, and compute the number of bytes to the
//	first 32-byte aligned location in dst.  Even though we are going to copy
//	32 bytes, only those preceding that 32-byte location "count" towards
//	reducing the length of the buffer or advancing the pointers.  We will need
//	to issue the first load from the advanced src pointer BEFORE the store to
//	the unmodified dst pointer.
//
//	(dst + 32) & -32 is always strictly above dst, so the distance computed
//	in x5 is between 1 and 32.
	add     x3,      x3, #32
	and     x3,      x3, #-32 // aligned dst
	ldp     x12,x13,[x1]
	ldp     x14,x15,[x1, #16]
	sub     x5,      x3, x0   // bytes between original dst and aligned dst
	add     x1,      x1, x5   // update src pointer

//	At this point, data in the following registers is in flight:
//
//		x0    original dst pointer
//		x1    location in src buffer corresponding to the aligned dst (x3).
//		x2    length of the whole buffer.  The "update length" subtract
//		      below turns it into the length from the aligned location in
//		      dst to the end of the buffer, which is guaranteed to be
//		      >= (64 - 32).
//		x3    aligned location in dst buffer.
//		x5    number of bytes between x0 and x3 (1 to 32).
//		x12:x15 first 32 bytes of src buffer.
//
//	We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x0 (the
//	original, unaligned dst).  The store *may* overlap the first 32 bytes of
//	the load, so in order to get correct memmove semantics, the first 32 byte
//	load must occur before the store.
//
//	After loading these 32 bytes, we advance x1, and decrement the length by
//	64.  If 64 or fewer bytes remained from the aligned location to the end
//	of the buffer, then we jump directly to the cleanup path.
	ldp     x8, x9, [x1]
	ldp     x10,x11,[x1, #16]
	add     x1,      x1, #32
	sub     x2,      x2, x5   // update length
	stp     x12,x13,[x0]      // initial unaligned store
	stp     x14,x15,[x0, #16] // initial unaligned store
	subs    x2,      x2, #64
	b.ls    L_forwardCleanup

L_forwardCopyLoop:
//	Main copy loop:
//
//		1. store the 32 bytes loaded in the previous loop iteration
//		2. advance the destination pointer
//		3. load the next 32 bytes
//		4. advance the source pointer
//		5. subtract 32 from the length
//
//	Throughout the loop x2 + 32 is the number of bytes that still have to be
//	loaded from x1.  The loop is terminated when 32 or fewer bytes remain to
//	be loaded.  Those trailing 1-32 bytes will be copied in the loop cleanup.
	stnp    x8, x9, [x3]
	stnp    x10,x11,[x3, #16]
	add     x3,      x3, #32
	ldnp    x8, x9, [x1]
	ldnp    x10,x11,[x1, #16]
	add     x1,      x1, #32
	subs    x2,      x2, #32
	b.hi    L_forwardCopyLoop

L_forwardCleanup:
//	There are 32 bytes in x8-x11 that were loaded in the previous loop
//	iteration, which need to be stored to [x3,x3+32).  In addition, between
//	0 and 32 more bytes need to be copied from x1 to x3 + 32.  The exact
//	number of bytes to copy is x2 + 32 (x2 is in the range [-32, 0] here).
//	Instead of using smaller conditional copies, we simply copy the last 32
//	bytes of the buffer with one unaligned copy from x1+x2 to 32+x3+x2.
//	This copy may overlap with the first store, so the loads must come before
//	the store of the data from the previous loop iteration.
	add     x1,      x1, x2
	ldp     x12,x13,[x1]
	ldp     x14,x15,[x1, #16]
	stp     x8, x9, [x3]
	stp     x10,x11,[x3, #16]
	add     x3,      x3, x2
	stp     x12,x13,[x3, #32]
	stp     x14,x15,[x3, #48]
	POP_FRAME
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  forward small copy                                                       *
 *****************************************************************************/

//	Ascending copy used by the entry point for lengths below kSmallCopy:
//	whole 8-byte words first, then whatever is left (0-7 bytes) one byte at
//	a time.  Register state on entry to L_forwardSmallCopy:
//
//	  x0     pointer to first byte of destination (never modified)
//	  x1     pointer to first byte of source
//	  x2     length of buffers
//	  x3     pointer to first byte of destination (working copy)
//
//	x6 carries the data being copied.  Both cursors use post-index
//	addressing: copy the data, then advance the pointers.
L_forwardSmallCopy:
	subs    x2,      x2, #8
	b.lo    L_forwardSmallCopyTail  // fewer than 8 bytes in total
L_forwardSmallCopyWord:
	ldr     x6,     [x1],#8
	str     x6,     [x3],#8
	subs    x2,      x2, #8
	b.hs    L_forwardSmallCopyWord  // at least 8 more bytes remain
L_forwardSmallCopyTail:
	adds    x2,      x2, #8         // undo the last subtract: x2 = 0...7
	b.eq    L_forwardSmallCopyDone
L_forwardSmallCopyByte:
	ldrb    w6,     [x1],#1
	strb    w6,     [x3],#1
	subs    x2,      x2, #1
	b.ne    L_forwardSmallCopyByte
L_forwardSmallCopyDone:
	POP_FRAME
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  Reverse copy engines                                                     *
 *****************************************************************************/

//	The reverse copy engines mirror the forward copy engines step for step;
//	the only difference is that they work from the end of the buffers towards
//	the start.  They therefore carry fewer comments than the forward code,
//	and the comments concentrate on the places where the mirrored version
//	might be surprising.
//
//	The one important thing to keep in mind is that (almost without fail) the
//	source cursor x1 and the destination cursor (x4 on entry, x3 once the
//	large copy has aligned it) point to ONE BYTE BEYOND the "right-hand edge"
//	of the part of the buffer that is still to be copied.  Both are moved to
//	that position here in the L_reverse jump island.  As a consequence, where
//	the forward code follows a "copy data, then advance pointers" scheme, the
//	reverse code addresses data below the cursor and moves the pointers down.

L_reverse:
//	x3 still holds destination - source from the entry point.  Zero means
//	dst == src, in which case there is nothing to copy: return early.
	cbz     x3,      L_return
//	Move both cursors to the ends of their buffers, then pick the reverse
//	copy engine that matches the length.
	add     x1,      x1, x2  // x1 = one byte beyond the end of source
	add     x4,      x0, x2  // x4 = one byte beyond the end of destination
	cmp     x2,      #(kSmallCopy)
	b.lo    L_reverseSmallCopy
//	length >= kSmallCopy: fall through into the reverse large copy.

/*****************************************************************************
 *  Reverse large copy                                                       *
 *****************************************************************************/

//	Reached by fall-through from L_reverse when length >= kSmallCopy.
//	On entry:
//
//	  x0     pointer to first byte of destination
//	  x1     one byte beyond the end of the source buffer
//	  x2     length of buffers
//	  x4     one byte beyond the end of the destination buffer
//
//	Load the last 32 bytes of src, then compute x3, the 32-byte aligned
//	location in dst below which the main loop stores.  In the forward copy,
//	we used dst+32 & -32 to find an aligned location in the dest buffer.
//	Here we use (end of dst)-1 & -32 instead, because we are going backwards.
//	x5 is the number of bytes between that aligned location and the end of
//	dst (1 to 32); the src cursor and the length are reduced by that amount.
	ldp     x12,x13,[x1, #-16]
	ldp     x14,x15,[x1, #-32]
	sub     x3,      x4, #1
	and     x3,      x3, #-32
	sub     x5,      x4, x3
	sub     x1,      x1, x5
	sub     x2,      x2, x5
//	As in the forward copy, the next 32 bytes of src are loaded BEFORE the
//	unaligned store of the last 32 bytes of dst, because that store may
//	overlap the bytes being loaded.  The length is then decremented by 64; if
//	64 or fewer bytes remained below the aligned location, go straight to
//	the cleanup path.
	ldp     x8, x9, [x1, #-16]
	ldp     x10,x11,[x1, #-32]
	stp     x12,x13,[x4, #-16]
	stp     x14,x15,[x4, #-32]
	sub     x1,      x1, #32
	subs    x2,      x2, #64
	b.ls    L_reverseCleanup

L_reverseCopyLoop:
//	Main copy loop, the mirror image of L_forwardCopyLoop:
//
//		1. store the 32 bytes loaded in the previous iteration below x3
//		2. move the destination pointer down by 32
//		3. load the next 32 bytes from below x1
//		4. move the source pointer down by 32
//		5. subtract 32 from the length
	stnp    x8, x9, [x3, #-16]
	stnp    x10,x11,[x3, #-32]
	sub     x3,      x3, #32
	ldnp    x8, x9, [x1, #-16]
	ldnp    x10,x11,[x1, #-32]
	sub     x1,      x1, #32
	subs    x2,      x2, #32
	b.hi    L_reverseCopyLoop

L_reverseCleanup:
//	x8-x11 hold 32 bytes that belong at [x3-32,x3).  Below that, x2 + 32
//	bytes are still uncopied; rather than copying exactly that many, we copy
//	the first 32 bytes of the buffer with one unaligned copy.  x1 - x2 is the
//	address 32 bytes past the start of src, so the two loads fetch the first
//	32 source bytes.  These loads must come before the store of x8-x11,
//	which may overlap them.
//
//	In the forward copy, we need to compute the address of the final stores,
//	but here we already have a pointer to the start of the buffer (x0).
	sub     x1,      x1, x2
	ldp     x12,x13,[x1, #-16]
	ldp     x14,x15,[x1, #-32]
	stp     x8, x9, [x3, #-16]
	stp     x10,x11,[x3, #-32]
	stp     x12,x13,[x0, #16]
	stp     x14,x15,[x0]
	POP_FRAME
	ARM64_STACK_EPILOG

/*****************************************************************************
 *  reverse small copy                                                       *
 *****************************************************************************/

//	Descending copy used by L_reverse for lengths below kSmallCopy: whole
//	8-byte words first, then whatever is left (0-7 bytes) one byte at a time.
//	Register state on entry to L_reverseSmallCopy:
//
//	  x0     pointer to first byte of destination (never modified)
//	  x1     one byte beyond the end of the source buffer
//	  x2     length of buffers
//	  x4     one byte beyond the end of the destination buffer
//
//	x6 carries the data being copied.  Both cursors use pre-index addressing
//	with writeback: move the pointer down, then copy the data.
L_reverseSmallCopy:
	subs    x2,      x2, #8
	b.lo    L_reverseSmallCopyTail  // fewer than 8 bytes in total
L_reverseSmallCopyWord:
	ldr     x6,     [x1,#-8]!
	str     x6,     [x4,#-8]!
	subs    x2,      x2, #8
	b.hs    L_reverseSmallCopyWord  // at least 8 more bytes remain
L_reverseSmallCopyTail:
	adds    x2,      x2, #8         // undo the last subtract: x2 = 0...7
	b.eq    L_return
L_reverseSmallCopyByte:
	ldrb    w6,     [x1,#-1]!
	strb    w6,     [x4,#-1]!
	subs    x2,      x2, #1
	b.ne    L_reverseSmallCopyByte
//	All bytes copied: fall through into the return path below.

//	Return path for the reverse small copy and for the dst == src early out
//	taken in L_reverse.
L_return:
	POP_FRAME
	ARM64_STACK_EPILOG
Commit	Line	Data
5ba3f43e A	1	/*
	2	* Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*
	28	* This file implements the following functions for the arm64 architecture.
	29	*
	30	* void bcopy(const void * source,
	31	* void * destination,
	32	* size_t length);
	33	*
	34	* void *memmove(void * destination,
	35	* const void * source,
	36	* size_t n);
	37	*
	38	* void *memcpy(void * restrict destination,
	39	* const void * restrict source,
	40	* size_t n);
	41	*
	42	* All copy n successive bytes from source to destination. Memmove and memcpy
	43	* return destination, whereas bcopy has no return value. Copying takes place
	44	* as if it were through a temporary buffer -- after return destination
	45	* contains exactly the bytes from source, even if the buffers overlap (this is
	46	* not required of memcpy by the C standard; its behavior is undefined if the
	47	* buffers overlap, but we are holding ourselves to the historical behavior of
	48	* this function on MacOS).
	49	*/
	50
	51	#include "asm.h"
	52
	53	.globl _bcopy
	54	.globl _ovbcopy
	55	.globl _memcpy
	56	.globl _memmove
	57
	58	/*****************************************************************************
	59	* Macros *
	60	*****************************************************************************/
	61
	62	#define kSmallCopy 64
	63
	64	/*****************************************************************************
65	* Entrypoints *
66	*****************************************************************************/
67
68	.text
69	.align 5
70	_bcopy:
71	_ovbcopy:
72	// Translate bcopy into memcpy by swapping the first and second arguments.
73	mov x3, x0
74	mov x0, x1
75	mov x1, x3
76
77	.align 4
78	_memcpy:
79	_memmove:
80	// Our preference is to copy the data in ascending address order, but if the
81	// buffers overlap such that the beginning of the destination buffer aliases
82	// the end of the source buffer, we need to copy in descending address order
83	// instead to preserve the memmove semantics. We detect this case with the
84	// test:
85	//
86	// destination - source < length (unsigned compare)
87	//
88	// If the address of the source buffer is higher than the address of the
89	// destination buffer, this arithmetic can overflow, but the overflowed value
90	// can only be smaller than length if the buffers do not overlap, so we don't
91	// need to worry about false positives due to the overflow (they happen, but
92	// only in cases where copying in either order is correct).
d9a64523	93	ARM64_STACK_PROLOG
5ba3f43e A	94	PUSH_FRAME
	95	sub x3, x0, x1
	96	cmp x3, x2
	97	b.cc L_reverse
	98	mov x3, x0 // copy destination pointer
	99	cmp x2, #(kSmallCopy)
	100	b.cc L_forwardSmallCopy
	101
	102	/*****************************************************************************
	103	* Forward large copy *
	104	*****************************************************************************/
	105
	106	// Load the first 32 bytes from src, and compute the number of bytes to the
	107	// first 32-byte aligned location in dst. Even though we are going to copy
	108	// 32 bytes, only those preceding that 32-byte location "count" towards
	109	// reducing the length of the buffer or advancing the pointers. We will need
	110	// to issue the first load from the advanced src pointer BEFORE the store to
	111	// the unmodified dst pointer.
	112	add x3, x3, #32
	113	and x3, x3, #-32 // aligned dst
	114	ldp x12,x13,[x1]
	115	ldp x14,x15,[x1, #16]
	116	sub x5, x3, x0 // bytes between original dst and aligned dst
	117	add x1, x1, x5 // update src pointer
	118
	119	// At this point, data in the following registers is in flight:
	120	//
	121	// x0 original dst pointer
	122	// x1 corresponding location in src buffer.
	123	// x2 length from aligned location in dst to end of buffer. This is
	124	// guaranteed to be >= (64 - 32).
	125	// x3 aligned location in dst buffer.
	126	// x12:x15 first 32 bytes of src buffer.
	127	//
	128	// We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x3. The
	129	// store may overlap the first 32 bytes of the load, so in order to get
	130	// correct memmove semantics, the first 32 byte load must occur before the
	131	// store.
	132	//
	133	// After loading these 32 bytes, we advance x1, and decrement the length by
	134	// 64. If the remaining length of the buffer was less than 64, then we jump
	135	// directly to the cleanup path.
	136	ldp x8, x9, [x1]
	137	ldp x10,x11,[x1, #16]
	138	add x1, x1, #32
	139	sub x2, x2, x5 // update length
	140	stp x12,x13,[x0] // initial unaligned store
	141	stp x14,x15,[x0, #16] // initial unaligned store
	142	subs x2, x2, #64
	143	b.ls L_forwardCleanup
	144
	145	L_forwardCopyLoop:
	146	// Main copy loop:
	147	//
	148	// 1. store the 32 bytes loaded in the previous loop iteration
	149	// 2. advance the destination pointer
	150	// 3. load the next 32 bytes
	151	// 4. advance the source pointer
	152	// 5. subtract 32 from the length
	153	//
	154	// The loop is terminated when 32 or fewer bytes remain to be loaded. Those
	155	// trailing 1-32 bytes will be copied in the loop cleanup.
	156	stnp x8, x9, [x3]
	157	stnp x10,x11,[x3, #16]
158	add x3, x3, #32
159	ldnp x8, x9, [x1]
160	ldnp x10,x11,[x1, #16]
161	add x1, x1, #32
162	subs x2, x2, #32
163	b.hi L_forwardCopyLoop
164
165	L_forwardCleanup:
166	// There are 32 bytes in x8-x11 that were loaded in the previous loop
167	// iteration, which need to be stored to [x3,x3+32). In addition, between
168	// 0 and 32 more bytes need to be copied from x1 to x3 + 32. The exact
169	// number of bytes to copy is x2 + 32. Instead of using smaller conditional
	170	// copies, we simply copy 32 unaligned bytes from x1+x2 to 32+x3+x2.
171	// This copy may overlap with the first store, so the loads must come before
172	// the store of the data from the previous loop iteration.
173	add x1, x1, x2
174	ldp x12,x13,[x1]
175	ldp x14,x15,[x1, #16]
176	stp x8, x9, [x3]
177	stp x10,x11,[x3, #16]
178	add x3, x3, x2
179	stp x12,x13,[x3, #32]
180	stp x14,x15,[x3, #48]
181	POP_FRAME
d9a64523	182	ARM64_STACK_EPILOG
5ba3f43e A	183
	184	/*****************************************************************************
	185	* forward small copy *
	186	*****************************************************************************/
	187
	188	// Copy one quadword at a time until less than 8 bytes remain to be copied.
	189	// At the point of entry to L_forwardSmallCopy, the "calling convention"
	190	// is as follows:
	191	//
	192	// x0 pointer to first byte of destination
	193	// x1 pointer to first byte of source
	194	// x2 length of buffers
	195	// x3 pointer to first byte of destination
	196	0: ldr x6, [x1],#8
	197	str x6, [x3],#8
	198	L_forwardSmallCopy:
	199	subs x2, x2, #8
	200	b.cs 0b
	201	adds x2, x2, #8
	202	b.eq 2f
	203	1: ldrb w6, [x1],#1
	204	strb w6, [x3],#1
	205	subs x2, x2, #1
	206	b.ne 1b
	207	2: POP_FRAME
d9a64523	208	ARM64_STACK_EPILOG
5ba3f43e A	209
	210	/*****************************************************************************
	211	* Reverse copy engines *
	212	*****************************************************************************/
	213
	214	// The reverse copy engines are identical in every way to the forward copy
	215	// engines, except in that they do everything backwards. For this reason, they
	216	// are somewhat more sparsely commented than the forward copy loops. I have
	217	// tried to only comment things that might be somewhat surprising in how they
	218	// differ from the forward implementation.
	219	//
	220	// The one important thing to note is that (almost without fail), x1 and x3
	221	// will point to ONE BYTE BEYOND the "right-hand edge" of the active buffer
	222	// throughout these copy loops. They are initially advanced to that position
	223	// in the L_reverse jump island. Because of this, whereas the forward copy
	224	// loops generally follow a "copy data, then advance pointers" scheme, in the
	225	// reverse copy loops, we advance the pointers, then copy the data.
	226
	227	L_reverse:
	228	// As a minor optimization, we early out if dst == src.
	229	cbz x3, L_return
	230	// advance both pointers to the ends of their respective buffers before
	231	// jumping into the appropriate reverse copy loop.
	232	add x4, x0, x2
	233	add x1, x1, x2
	234	cmp x2, #(kSmallCopy)
	235	b.cc L_reverseSmallCopy
	236
	237	/*****************************************************************************
	238	* Reverse large copy *
	239	*****************************************************************************/
	240
	241	ldp x12,x13,[x1, #-16]
	242	ldp x14,x15,[x1, #-32]
	243	sub x3, x4, #1 // In the forward copy, we used dst+32 & -32
	244	and x3, x3, #-32 // to find an aligned location in the dest
	245	sub x5, x4, x3 // buffer. Here we use dst-1 & -32 instead,
	246	sub x1, x1, x5 // because we are going backwards.
	247	sub x2, x2, x5
	248	ldp x8, x9, [x1, #-16]
	249	ldp x10,x11,[x1, #-32]
	250	stp x12,x13,[x4, #-16]
	251	stp x14,x15,[x4, #-32]
	252	sub x1, x1, #32
	253	subs x2, x2, #64
	254	b.ls L_reverseCleanup
	255
	256	L_reverseCopyLoop:
	257	stnp x8, x9, [x3, #-16]
	258	stnp x10,x11,[x3, #-32]
	259	sub x3, x3, #32
	260	ldnp x8, x9, [x1, #-16]
	261	ldnp x10,x11,[x1, #-32]
	262	sub x1, x1, #32
	263	subs x2, x2, #32
	264	b.hi L_reverseCopyLoop
	265
	266	L_reverseCleanup:
	267	sub x1, x1, x2
	268	ldp x12,x13,[x1, #-16]
	269	ldp x14,x15,[x1, #-32]
	270	stp x8, x9, [x3, #-16]
	271	stp x10,x11,[x3, #-32]
	272	stp x12,x13,[x0, #16] // In the forward copy, we need to compute the
273	stp x14,x15,[x0] // address of these stores, but here we already
274	POP_FRAME // have a pointer to the start of the buffer.
d9a64523	275	ARM64_STACK_EPILOG
5ba3f43e A	276
	277	/*****************************************************************************
	278	* reverse small copy *
	279	*****************************************************************************/
	280
	281	0: ldr x6, [x1,#-8]!
	282	str x6, [x4,#-8]!
	283	L_reverseSmallCopy:
	284	subs x2, x2, #8
	285	b.cs 0b
	286	adds x2, x2, #8
	287	b.eq 2f
	288	1: ldrb w6, [x1,#-1]!
	289	strb w6, [x4,#-1]!
	290	subs x2, x2, #1
	291	b.ne 1b
	292	2: POP_FRAME
d9a64523 A	293	ARM64_STACK_EPILOG
d9a64523 A	294
5ba3f43e A	295
	296	L_return:
	297	POP_FRAME
d9a64523	298	ARM64_STACK_EPILOG