git.saurik.com Git - apple/xnu.git/blame_incremental - osfmk/i386/commpage/memset_pattern

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2005 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
	11	*
	12	* This Original Code and all software distributed under the License are
	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
	19	*
	20	* @APPLE_LICENSE_HEADER_END@
	21	*/
	22
	23	#include <machine/cpu_capabilities.h>
	24	#include <machine/commpage.h>
	25
	26	/* The common path for nonzero memset and the memset_pattern routines,
	27	* tuned for Pentium-M class processors with SSE3 and 64-byte cache lines.
	28	* This is used by the following functions:
	29	*
	30	* void *memset(void *b, int c, size_t len); // when c!=0
	31	* void memset_pattern4(void *b, const void *c4, size_t len);
	32	* void memset_pattern8(void *b, const void *c8, size_t len);
	33	* void memset_pattern16(void *b, const void *c16, size_t len);
	34	*
	35	* Note bzero() and memset() of 0 are handled separately.
	36	*/
	37
	38	#define kShort 63
	39	#define kVeryLong (1024*1024)
	40
	41	// Initial entry from Libc with parameters passed in registers. Although we
	42	// correctly handle misaligned ptrs and short operands, they are inefficient.
	43	// Therefore our caller should filter out short operands and exploit local
	44	// knowledge (ie, original pattern length) to align the ptr if possible.
	45	// When called, we expect:
	46	// %edi = ptr to memory to set (not necessarily aligned)
	47	// %edx = length (may be short or even 0)
	48	// %xmm0 = the pattern to store
	49	// Return conditions:
	50	// %eax, %edi, %esi, %ecx, %edx, and %xmm0 all trashed
	51
	52	        .text
	53	        .align  5, 0x90                 // entry on a 32-byte boundary, 0x90 fill
	54	Lmemset_pattern_sse3:
	55	        cmpl    $(kShort), %edx         // is the length above the short-operand cutoff?
	56	        ja      LNotShort               // yes (unsigned): take the aligning path
	57	        jmp     LShort                  // no: just store 16 bytes at a time
	58
	59	// Short path; LLessThan16 is also the tail of the long path.
	60	//      %edx  = bytes still to store
	61	//      %edi  = destination ptr (no alignment assumed)
	62	//      %xmm0 = pattern to store starting at %edi
	63
	64	LUnalignedStore16:
	65	        movdqu  %xmm0, (%edi)           // unaligned store of one full 16-byte vector
	66	        addl    $16, %edi               // step past the bytes just written
	67	        subl    $16, %edx               // and account for them
	68	LShort:
	69	        cmpl    $16, %edx               // at least one whole vector left?
	70	        jge     LUnalignedStore16       // yes, go store it
	71	LLessThan16:                            // fewer than 16 bytes left; low bits of %dl select the pieces
	72	        test    $8, %dl                 // is the 8s bit of the count set?
	73	        jz      LTailTry4               // no, skip the 8-byte store
	74	        movq    %xmm0, (%edi)           // write the low 8 pattern bytes
	75	        addl    $8, %edi                // advance ptr past them
	76	        psrldq  $8, %xmm0               // bring the next pattern bytes down to the low end
	77	LTailTry4:
	78	        test    $4, %dl                 // is the 4s bit of the count set?
	79	        jz      LTailTry1               // no, skip the 4-byte store
	80	        movd    %xmm0, (%edi)           // write the low 4 pattern bytes
	81	        addl    $4, %edi                // advance ptr past them
	82	        psrldq  $4, %xmm0               // again expose the next pattern bytes
	83	LTailTry1:
	84	        andl    $3, %edx                // 0..3 stragglers left?
	85	        jz      LTailDone               // none, we are finished
	86	        movd    %xmm0, %eax             // pull the next 4 pattern bytes into %eax
	87	LTailByteLoop:                          // one byte per pass, at most three passes
	88	        movb    %al, (%edi)             // write the current pattern byte
	89	        shrl    $8, %eax                // line up the following byte in %al
	90	        inc     %edi
	91	        dec     %edx
	92	        jnz     LTailByteLoop
	93	LTailDone: ret
	94
	95	// Long enough to justify aligning ptr. Note that we have to rotate the
	96	// pattern to account for any alignment. We do this by doing two unaligned
	97	// stores, and then an aligned load from the middle of the two stores.
	98	// This will stall on store forwarding alignment mismatch, and the unaligned
	99	// stores can be pretty slow too, but the alternatives aren't any better.
	100	// Fortunately, in most cases our caller has already aligned the ptr.
	101	// %edx = length (> kShort)
	102	// %edi = ptr (may not be aligned)
	103	// %xmm0 = pattern
	104
	105	LNotShort: // reached only when %edx > kShort, so the 32 bytes stored below lie inside the buffer
	106	movl %edi,%ecx // copy dest ptr
	107	negl %ecx // -ptr: its low 4 bits are the distance up to the next 16-byte boundary
	108	andl $15,%ecx // mask down to #bytes to 16-byte align (0..15)
	109	jz LAligned // skip if already aligned; the pattern then needs no rotation
	110	movdqu %xmm0,(%edi) // store 16 unaligned bytes
	111	movdqu %xmm0,16(%edi) // and 16 more, to be sure we have an aligned chunk
	112	addl %ecx,%edi // now point to the aligned chunk
	113	subl %ecx,%edx // adjust remaining count
	114	movdqa (%edi),%xmm0 // get the rotated pattern (probably stalling)
	115	addl $16,%edi // skip past the aligned chunk
	116	subl $16,%edx // it was already filled by the two stores above; falls through to LAligned
	117
	118	// Set up for 64-byte loops.
	119	// %edx = length remaining
	120	// %edi = ptr (aligned)
	121	// %xmm0 = rotated pattern
	122
	123	LAligned:                               // in: %edx = bytes left, %edi 16-byte aligned, %xmm0 = pattern
	124	        movl    %edx, %ecx              // copy length remaining
	125	        andl    $63, %edx               // %edx <- residual length (0..63) for the tail code
	126	        andl    $-64, %ecx              // %ecx <- #bytes we will store in by-64 loop
	127	        jz      LNoMoreChunks           // no 64-byte chunks
	128	        addl    %ecx, %edi              // point past the chunks; loops index back via negative %ecx
	129	        cmpl    $(kVeryLong), %ecx      // long enough to justify non-temporal stores?
	130	        jae     LVeryLong               // yes; unsigned compare so lengths >= 2GB qualify too
	131	        negl    %ecx                    // negative offset that counts up to zero
	132	        jmp     LCachedLoop             // hop over any alignment padding
	133
	134	// Loop over 64-byte chunks, storing into cache.
	135
	136	        .align  4,0x90                  // keep inner loops 16-byte aligned
	137	LCachedLoop:
	138	        movdqa  %xmm0, (%edi,%ecx)
	139	        movdqa  %xmm0, 16(%edi,%ecx)
	140	        movdqa  %xmm0, 32(%edi,%ecx)
	141	        movdqa  %xmm0, 48(%edi,%ecx)
	142	        addl    $64, %ecx               // next chunk; reaches zero after the last one
	143	        jne     LCachedLoop
	144
	145	        jmp     LNoMoreChunks           // go store the 0..63 residual bytes
	146
	147	// Very long operands: use non-temporal stores to bypass cache.
	148
	149	LVeryLong:
	150	        negl    %ecx                    // negative offset that counts up to zero
	151	        jmp     LNonTemporalLoop        // hop over any alignment padding
	152
	153	        .align  4,0x90                  // keep inner loops 16-byte aligned
	154	LNonTemporalLoop:
	155	        movntdq %xmm0, (%edi,%ecx)
	156	        movntdq %xmm0, 16(%edi,%ecx)
	157	        movntdq %xmm0, 32(%edi,%ecx)
	158	        movntdq %xmm0, 48(%edi,%ecx)
	159	        addl    $64, %ecx               // next chunk; reaches zero after the last one
	160	        jne     LNonTemporalLoop
	161
	162	        sfence                          // required by non-temporal stores
	163	        jmp     LNoMoreChunks           // go store the 0..63 residual bytes
	164
	165	// Handle leftovers: loop by 16.
	166	//      %edx  = length remaining (<64)
	167	//      %edi  = ptr (aligned)
	168	//      %xmm0 = rotated pattern
	169
	170	LLoopBy16:
	171	        movdqa  %xmm0, (%edi)           // pack in 16 more bytes
	172	        subl    $16, %edx               // decrement count
	173	        addl    $16, %edi               // increment ptr
	174	LNoMoreChunks:
	175	        cmpl    $16, %edx               // more to go?
	176	        jge     LLoopBy16               // yes (%edx is 0..63 here, so signed is safe)
	177	        jmp     LLessThan16             // handle up to 15 remaining bytes
	178	// NOTE(review): routine is named "_sse3" but is registered with kHasSSE2 -- confirm the capability arguments against commpage.h.
	179	COMMPAGE_DESCRIPTOR(memset_pattern_sse3,_COMM_PAGE_MEMSET_PATTERN,kHasSSE2,0)