/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

/* The common path for nonzero memset and the memset_pattern routines,
 * tuned for Pentium-M class processors with SSE2 and 64-byte cache lines.
 * This is the 64-bit version.  It is used by the following functions:
 *
 *      void *memset(void *b, int c, size_t len);               // when c!=0
 *      void memset_pattern4(void *b, const void *c4, size_t len);
 *      void memset_pattern8(void *b, const void *c8, size_t len);
 *      void memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Note bzero() and memset() of 0 are handled separately.
 */
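
// For example (an illustrative sketch, not shown in the notes above), the
// memset_pattern4 entry point tiles a 4-byte value across a caller's buffer:
//
//      uint32_t pixel = 0xFF00FF00;            // hypothetical 4-byte pattern
//      memset_pattern4(buf, &pixel, len);      // buf, len: caller's buffer and byte count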

#define kShort          63
#define kVeryLong       (1024*1024)
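
// Operands of kShort (63) bytes or fewer are handled entirely by the
// store-by-16/8/4/1 path at LShort below; operands of kVeryLong (1 MB) or
// more are long enough to justify bypassing the caches with non-temporal stores.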

// Initial entry from Libc with parameters passed in registers.  Although we
// correctly handle misaligned ptrs and short operands, they are inefficient.
// Therefore our caller should filter out short operands and exploit local
// knowledge (ie, original pattern length) to align the ptr if possible.
// When called, we expect:
//      %rdi = ptr to memory to set (not necessarily aligned)
//      %rdx = length (may be short or even 0)
//      %xmm0 = the pattern to store
// Return conditions:
//      %rax, %rdi, %rsi, %rcx, and %rdx all trashed
//      we preserve %r8, %r9, %r10, and %r11
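//
// For example (a sketch only, not part of this commpage routine), a caller
// implementing memset(b, c, len) with c != 0 might broadcast the fill byte
// into %xmm0 before branching here:
//
//      movzbl  %sil,%eax                // zero-extend the fill byte
//      imull   $0x01010101,%eax,%eax    // replicate it into all 4 byte lanes
//      movd    %eax,%xmm0
//      pshufd  $0,%xmm0,%xmm0           // broadcast the dword across %xmm0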

        .text
        .align  5, 0x90
        .code64
Lmemset_pattern_sse2_64:
        cmpq    $(kShort),%rdx          // long enough to bother aligning?
        ja      LNotShort               // yes
        jmp     LShort                  // no

// Here for short operands or the end of long ones.
//      %rdx = length (<= kShort)
//      %rdi = ptr (may not be aligned)
//      %xmm0 = pattern

LUnalignedStore16:
        movdqu  %xmm0,(%rdi)            // stuff in another 16 bytes
        subl    $16,%edx
        addq    $16,%rdi
LShort:
        cmpl    $16,%edx                // room for another vector?
        jge     LUnalignedStore16       // yes
LLessThan16:                            // here at end of copy with < 16 bytes remaining
        test    $8,%dl                  // 8-byte store required?
        jz      2f                      // no
        movq    %xmm0,(%rdi)            // pack in 8 low bytes
        psrldq  $8,%xmm0                // then shift vector down 8 bytes
        addq    $8,%rdi
2:
        test    $4,%dl                  // 4-byte store required?
        jz      3f                      // no
        movd    %xmm0,(%rdi)            // pack in 4 low bytes
        psrldq  $4,%xmm0                // then shift vector down 4 bytes
        addq    $4,%rdi
3:
        andl    $3,%edx                 // more to go?
        jz      5f                      // no
        movd    %xmm0,%eax              // move remainders out into %eax
4:                                      // loop on up to three bytes
        movb    %al,(%rdi)              // pack in next byte
        shrl    $8,%eax                 // shift next byte into position
        incq    %rdi
        dec     %edx
        jnz     4b
5:      ret

// Long enough to justify aligning ptr.  Note that we have to rotate the
// pattern to account for any alignment.  We do this by doing two unaligned
// stores, and then an aligned load from the middle of the two stores.
// This will stall on store forwarding alignment mismatch, and the unaligned
// stores can be pretty slow too, but the alternatives aren't any better.
// Fortunately, in most cases our caller has already aligned the ptr.
//      %rdx = length (> kShort)
//      %rdi = ptr (may not be aligned)
//      %xmm0 = pattern
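//
// For example (an illustrative walk-through, not from the original notes):
// with the 4-byte pattern "ABCD" and a ptr one byte past a 16-byte boundary,
// the two unaligned stores lay down "ABCDABCD..." starting at the ptr, and
// the aligned movdqa below reloads 16 bytes at the next 16-byte boundary,
// picking up the pattern rotated to "DABCDABCDABCDABC", which is exactly the
// form the aligned stores in the by-64 loop must use to continue the tiling.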

LNotShort:
        movl    %edi,%ecx               // copy low bits of dest ptr
        negl    %ecx
        andl    $15,%ecx                // mask down to #bytes to 16-byte align
        jz      LAligned                // skip if already aligned
        movdqu  %xmm0,(%rdi)            // store 16 unaligned bytes
        movdqu  %xmm0,16(%rdi)          // and 16 more, to be sure we have an aligned chunk
        addq    %rcx,%rdi               // now point to the aligned chunk
        subq    %rcx,%rdx               // adjust remaining count
        movdqa  (%rdi),%xmm0            // get the rotated pattern (probably stalling)
        addq    $16,%rdi                // skip past the aligned chunk
        subq    $16,%rdx

// Set up for 64-byte loops.
//      %rdx = length remaining
//      %rdi = ptr (aligned)
//      %xmm0 = rotated pattern

LAligned:
        movq    %rdx,%rcx               // copy length remaining
        andl    $63,%edx                // mask down to residual length (0..63)
        andq    $-64,%rcx               // %rcx <- #bytes we will store in by-64 loop
        jz      LNoMoreChunks           // no 64-byte chunks
        addq    %rcx,%rdi               // increment ptr by length to move
        cmpq    $(kVeryLong),%rcx       // long enough to justify non-temporal stores?
        jge     LVeryLong               // yes
        negq    %rcx                    // negate length to move
        jmp     1f

// Loop over 64-byte chunks, storing into cache.
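// The loop below runs %rcx from minus the chunked byte count up to zero,
// addressing with (%rdi,%rcx); the single addq both advances the index and
// sets the flags tested by jne, so no separate compare is needed.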

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movdqa  %xmm0,(%rdi,%rcx)
        movdqa  %xmm0,16(%rdi,%rcx)
        movdqa  %xmm0,32(%rdi,%rcx)
        movdqa  %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        jmp     LNoMoreChunks

// Very long operands: use non-temporal stores to bypass cache.
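// (Rationale, in brief: for fills of kVeryLong bytes or more the data is
// unlikely to be reused soon, so movntdq writes around the caches instead of
// displacing their contents; the sfence afterward ensures the weakly ordered
// stores become globally visible before any later stores.)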

LVeryLong:
        negq    %rcx                    // negate length to move
        jmp     1f

        .align  4,0x90                  // keep inner loops 16-byte aligned
1:
        movntdq %xmm0,(%rdi,%rcx)
        movntdq %xmm0,16(%rdi,%rcx)
        movntdq %xmm0,32(%rdi,%rcx)
        movntdq %xmm0,48(%rdi,%rcx)
        addq    $64,%rcx
        jne     1b

        sfence                          // required by non-temporal stores
        jmp     LNoMoreChunks

// Handle leftovers: loop by 16.
//      %edx = length remaining (<64)
//      %rdi = ptr (aligned)
//      %xmm0 = rotated pattern

LLoopBy16:
        movdqa  %xmm0,(%rdi)            // pack in 16 more bytes
        subl    $16,%edx                // decrement count
        addq    $16,%rdi                // increment ptr
LNoMoreChunks:
        cmpl    $16,%edx                // more to go?
        jge     LLoopBy16               // yes
        jmp     LLessThan16             // handle up to 15 remaining bytes

        COMMPAGE_DESCRIPTOR(memset_pattern_sse2_64,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0)