/*
 * Copyright (c) 2006 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>
#include <platfunc.h>

/*
 * The bcopy/memcpy loops, tuned for Pentium-M class processors with
 * Supplemental SSE3 and 64-byte cache lines.
 *
 * The following #defines are tightly coupled to the u-architecture:
 */

#define kShort      80                  // too short to bother with SSE (must be >= 80)
#define kVeryLong   (500*1024)          // large enough for non-temporal stores (must be >= 8192)
#define kFastUCode  ((16*1024)-15)      // cutoff for microcode fastpath for "rep/movsl"

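/*
 * For orientation, the forward-copy dispatch below boils down to a few
 * length and overlap tests.  A minimal C sketch (the enum and function name
 * are illustrative only, not part of this file):
 *
 *      enum copy_path { PATH_REVERSE, PATH_SHORT, PATH_VERYLONG, PATH_SSE };
 *
 *      static enum copy_path pick_path(char *dst, const char *src, unsigned long len) {
 *          if ((unsigned long)(dst - src) < len) return PATH_REVERSE;  // destructive overlap
 *          if (len <= kShort)                    return PATH_SHORT;    // 4-byte + 1-byte loops
 *          if (len >= kVeryLong)                 return PATH_VERYLONG; // _longcopy, non-temporal
 *          return PATH_SSE;                      // align dst to 16, then one of 16 SSE loops;
 *                                                // LMod0/LMod8 use "rep/movsl" at kFastUCode
 *      }
 */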
// void bcopy(const void *src, void *dst, size_t len);

PLATFUNC_FUNCTION_START(bcopy, sse3x, 32, 5)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        pushl   %ebx
        movl    8(%ebp),%esi            // get source ptr
        movl    12(%ebp),%edi           // get dest ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        jbe     Lshort                  // no
        jmp     LNotShort

//
// void *memcpy(void *dst, const void *src, size_t len);
// void *memmove(void *dst, const void *src, size_t len);
//

PLATFUNC_FUNCTION_START(memcpy, sse3x, 32, 0)   // void *memcpy(void *dst, const void *src, size_t len)
PLATFUNC_FUNCTION_START(memmove, sse3x, 32, 0)  // void *memmove(void *dst, const void *src, size_t len)
        pushl   %ebp                    // set up a frame for backtraces
        movl    %esp,%ebp
        pushl   %esi
        pushl   %edi
        pushl   %ebx
        movl    8(%ebp),%edi            // get dest ptr
        movl    12(%ebp),%esi           // get source ptr
        movl    16(%ebp),%ecx           // get length
        movl    %edi,%edx
        subl    %esi,%edx               // (dest - source)
        cmpl    %ecx,%edx               // must move in reverse if (dest - source) < length
        jb      LReverseIsland
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LNotShort               // yes

// Handle short forward copies.  As the most common case, this is the fall-through path.
//      ecx = length (<= kShort)
//      esi = source ptr
//      edi = dest ptr

Lshort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // get #doublewords
        jz      LLeftovers
2:                                      // loop copying doublewords
        movl    (%esi),%eax
        addl    $4,%esi
        movl    %eax,(%edi)
        addl    $4,%edi
        dec     %ecx
        jnz     2b
LLeftovers:                             // handle leftover bytes (0..3) in last word
        andl    $3,%edx                 // any leftover bytes?
        jz      Lexit
4:                                      // loop copying bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     4b
Lexit:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %ebx
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

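/*
 * In C terms, the Lshort/LLeftovers path above is roughly the following
 * (sketch only; the helper name is illustrative, and as in the assembly the
 * 4-byte accesses may be unaligned, which x86 permits):
 *
 *      static void copy_short(char *dst, const char *src, unsigned long len) {
 *          unsigned long words = len >> 2;
 *          while (words--) {                               // doubleword loop ("2:" above)
 *              *(unsigned int *)dst = *(const unsigned int *)src;
 *              src += 4;  dst += 4;
 *          }
 *          len &= 3;                                       // 0..3 leftover bytes (LLeftovers)
 *          while (len--)
 *              *dst++ = *src++;
 *      }
 */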

LReverseIsland:                         // keep the "jb" above a short branch...
        jmp     LReverse                // ...because reverse moves are uncommon


// Handle forward moves that are long enough to justify use of SSE3.
// First, 16-byte align the destination.
//      ecx = length (> kShort)
//      esi = source ptr
//      edi = dest ptr

LNotShort:
        cmpl    $(kVeryLong),%ecx       // long enough to justify heavyweight loops?
        movl    %edi,%edx               // copy destination
        jae     LVeryLong               // use very-long-operand path
        negl    %edx
        andl    $15,%edx                // get #bytes to align destination
        jz      LDestAligned            // already aligned
        subl    %edx,%ecx               // decrement length
1:                                      // loop copying 1..15 bytes
        movb    (%esi),%al
        inc     %esi
        movb    %al,(%edi)
        inc     %edi
        dec     %edx
        jnz     1b

// Destination is now aligned.  Dispatch to one of sixteen loops over 64-byte chunks,
// based on the alignment of the source.  All vector loads and stores are aligned.
// Even though this means we have to shift and repack vectors, doing so is much faster
// than unaligned loads.  Since kShort >= 80 and we've moved at most 15 bytes already,
// there is at least one chunk.  When we enter the copy loops, the following registers
// are set up:
//      ecx = residual length (0..63)
//      edx = -(length to move), a multiple of 64
//      esi = ptr to 1st source byte not to move (unaligned)
//      edi = ptr to 1st dest byte not to move (aligned)

LDestAligned:
        movl    %ecx,%edx               // copy length
        movl    %esi,%eax               // copy source address
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        andl    $15,%eax                // mask to low 4 bits of source address
        addl    %edx,%esi               // point to 1st byte not copied
        addl    %edx,%edi
        negl    %edx                    // now generate offset to 1st byte to be copied
        call    1f                      // get our runtime address for the table lookup
1:
        popl    %ebx
        movl    (LTable-1b)(%ebx,%eax,4), %eax  // load jump table entry (an offset relative to LTable)
        leal    (LTable-1b)(%ebx,%eax,1), %eax  // convert the offset into an absolute address
        jmp     *%eax

        .align  2
LTable:                                 // table of copy loop addresses
        .long   LMod0  - LTable
        .long   LMod1  - LTable
        .long   LMod2  - LTable
        .long   LMod3  - LTable
        .long   LMod4  - LTable
        .long   LMod5  - LTable
        .long   LMod6  - LTable
        .long   LMod7  - LTable
        .long   LMod8  - LTable
        .long   LMod9  - LTable
        .long   LMod10 - LTable
        .long   LMod11 - LTable
        .long   LMod12 - LTable
        .long   LMod13 - LTable
        .long   LMod14 - LTable
        .long   LMod15 - LTable

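/*
 * The dispatch above is position independent: each table entry is a loop's
 * offset from LTable, the call/pop pair recovers the table's runtime address,
 * and the leal turns the offset back into an absolute address.  The selection
 * itself is simply an index on the low four bits of the source address; a C
 * sketch (the typedef and array name are illustrative only):
 *
 *      typedef void (*chunk_loop)(char *dst, const char *src, unsigned long chunked);
 *      extern chunk_loop loop_for_alignment[16];           // one loop per (src & 15)
 *
 *      static void copy_chunks(char *dst, const char *src, unsigned long len) {
 *          unsigned long chunked  = len & ~63UL;           // moved by the inner loop
 *          unsigned long residual = len & 63UL;            // 0..63 bytes left for Lshort
 *          loop_for_alignment[(unsigned long)src & 15](dst, src, chunked);
 *          (void)residual;                                 // finished by the Lshort code
 *      }
 */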

// Very long forward moves.  These are at least several pages.  They are special cased
// and aggressively optimized, not so much because they are common or useful, but
// because they are subject to benchmarks.  There isn't enough room for them in the
// platfunc area reserved for bcopy, so we put them elsewhere.  We call the
// longcopy routine using the normal ABI.

LVeryLong:
        pushl   %ecx                    // length (>= kVeryLong)
        pushl   %esi                    // source ptr
        pushl   %edi                    // dest ptr
        call    _longcopy
        addl    $12,%esp                // pop off our parameters
        jmp     Lexit

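/*
 * _longcopy is defined elsewhere.  The usual technique for operands this large
 * is non-temporal stores, which avoid displacing useful cache lines; a minimal
 * illustrative sketch (not the actual _longcopy implementation; dst, src and
 * len are assumed to be multiples of 16 here):
 *
 *      #include <emmintrin.h>
 *      static void stream_copy_aligned(char *dst, const char *src, unsigned long len) {
 *          for (unsigned long i = 0; i < len; i += 16) {
 *              __m128i v = _mm_load_si128((const __m128i *)(src + i));
 *              _mm_stream_si128((__m128i *)(dst + i), v);  // non-temporal store
 *          }
 *          _mm_sfence();                                   // order the streaming stores
 *      }
 */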

// On Pentium-M, the microcode for "rep/movsl" is faster than SSE for 8-byte
// aligned operands from about 32KB up to kVeryLong for the hot cache case, and from
// about 256 bytes up to kVeryLong for cold caches.  This is because the microcode
// avoids having to read destination cache lines that will be completely overwritten.
// The cutoff we use (ie, kFastUCode) must somehow balance the two cases, since
// we do not know if the destination is in cache or not.

Lfastpath:
        addl    %edx,%esi               // restore ptrs to 1st byte of source and dest
        addl    %edx,%edi
        negl    %edx                    // make length positive
        orl     %edx,%ecx               // restore total #bytes remaining to move
        cld                             // we'll move forward
        movl    %ecx,%edx               // copy total length to move
        shrl    $2,%ecx                 // compute #words to move
        rep                             // the u-code will optimize this
        movsl
        jmp     LLeftovers              // handle 0..3 leftover bytes

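/*
 * The same string-move microcode can be reached from C via inline assembly.
 * A hedged sketch (the helper name is illustrative; it moves whole doublewords
 * only, so the 0..3 tail bytes must be finished separately, as LLeftovers does):
 *
 *      static void copy_words_rep_movsl(void *dst, const void *src, unsigned long nbytes) {
 *          unsigned long nwords = nbytes >> 2;
 *          __asm__ volatile ("cld\n\trep movsl"
 *                            : "+S" (src), "+D" (dst), "+c" (nwords)
 *                            :
 *                            : "memory", "cc");
 *      }
 */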

// Forward loop for medium length operands in which low four bits of %esi == 0000

LMod0:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        jmp     1f
        .align  4,0x90                  // 16-byte align inner loops
1:                                      // loop over 64-byte chunks
        movdqa  (%esi,%edx),%xmm0
        movdqa  16(%esi,%edx),%xmm1
        movdqa  32(%esi,%edx),%xmm2
        movdqa  48(%esi,%edx),%xmm3

        movdqa  %xmm0,(%edi,%edx)
        movdqa  %xmm1,16(%edi,%edx)
        movdqa  %xmm2,32(%edi,%edx)
        movdqa  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done

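/*
 * LMod0 is the mutually 16-byte aligned case, so the inner loop needs no
 * repacking at all.  In intrinsics, one 64-byte chunk is just four aligned
 * loads and four aligned stores (sketch only; dst and src 16-byte aligned,
 * len a multiple of 64):
 *
 *      #include <emmintrin.h>
 *      static void copy_chunks_mod0(char *dst, const char *src, unsigned long len) {
 *          for (unsigned long i = 0; i < len; i += 64) {
 *              __m128i a = _mm_load_si128((const __m128i *)(src + i));
 *              __m128i b = _mm_load_si128((const __m128i *)(src + i + 16));
 *              __m128i c = _mm_load_si128((const __m128i *)(src + i + 32));
 *              __m128i d = _mm_load_si128((const __m128i *)(src + i + 48));
 *              _mm_store_si128((__m128i *)(dst + i),      a);
 *              _mm_store_si128((__m128i *)(dst + i + 16), b);
 *              _mm_store_si128((__m128i *)(dst + i + 32), c);
 *              _mm_store_si128((__m128i *)(dst + i + 48), d);
 *          }
 *      }
 */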

// Forward loop for medium length operands in which low four bits of %esi == 0001

LMod1:
        movdqa  -1(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  15(%esi,%edx),%xmm1
        movdqa  31(%esi,%edx),%xmm2
        movdqa  47(%esi,%edx),%xmm3
        movdqa  63(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $1,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $1,%xmm2,%xmm3
        palignr $1,%xmm1,%xmm2
        palignr $1,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done

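/*
 * LMod1 through LMod15 all use the same SSSE3 trick: do only aligned loads,
 * then use palignr to splice each pair of adjacent vectors back into the 16
 * bytes the caller actually asked for.  An intrinsics sketch of the 1-byte
 * case (function name illustrative; dst 16-byte aligned, src 1 byte past
 * alignment, len a multiple of 64; like the assembly, the aligned loads may
 * touch a few bytes outside [src, src+len), but never cross into another
 * page because they are 16-byte aligned):
 *
 *      #include <tmmintrin.h>
 *      static void copy_chunks_mod1(char *dst, const char *src, unsigned long len) {
 *          __m128i prev = _mm_load_si128((const __m128i *)(src - 1));
 *          for (unsigned long i = 0; i < len; i += 64) {
 *              __m128i a = _mm_load_si128((const __m128i *)(src - 1 + i + 16));
 *              __m128i b = _mm_load_si128((const __m128i *)(src - 1 + i + 32));
 *              __m128i c = _mm_load_si128((const __m128i *)(src - 1 + i + 48));
 *              __m128i d = _mm_load_si128((const __m128i *)(src - 1 + i + 64));
 *              // each palignr rebuilds 16 true source bytes from two aligned loads
 *              _mm_store_si128((__m128i *)(dst + i),      _mm_alignr_epi8(a, prev, 1));
 *              _mm_store_si128((__m128i *)(dst + i + 16), _mm_alignr_epi8(b, a, 1));
 *              _mm_store_si128((__m128i *)(dst + i + 32), _mm_alignr_epi8(c, b, 1));
 *              _mm_store_si128((__m128i *)(dst + i + 48), _mm_alignr_epi8(d, c, 1));
 *              prev = d;
 *          }
 *      }
 */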

// Forward loop for medium length operands in which low four bits of %esi == 0010

LMod2:
        movdqa  -2(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  14(%esi,%edx),%xmm1
        movdqa  30(%esi,%edx),%xmm2
        movdqa  46(%esi,%edx),%xmm3
        movdqa  62(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $2,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $2,%xmm2,%xmm3
        palignr $2,%xmm1,%xmm2
        palignr $2,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0011

LMod3:
        movdqa  -3(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  13(%esi,%edx),%xmm1
        movdqa  29(%esi,%edx),%xmm2
        movdqa  45(%esi,%edx),%xmm3
        movdqa  61(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $3,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $3,%xmm2,%xmm3
        palignr $3,%xmm1,%xmm2
        palignr $3,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0100
// We use the float single data type in order to use "movss" to merge vectors.

LMod4:
        movaps  -4(%esi,%edx),%xmm0     // 4-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movaps  12(%esi,%edx),%xmm1
        movaps  28(%esi,%edx),%xmm2
        movss   %xmm1,%xmm0             // copy low 4 bytes of source into destination
        pshufd  $(0x39),%xmm0,%xmm0     // rotate right 4 bytes (mask -- 00 11 10 01)
        movaps  44(%esi,%edx),%xmm3
        movss   %xmm2,%xmm1
        pshufd  $(0x39),%xmm1,%xmm1
        movaps  60(%esi,%edx),%xmm4
        movss   %xmm3,%xmm2
        pshufd  $(0x39),%xmm2,%xmm2

        movaps  %xmm0,(%edi,%edx)
        movss   %xmm4,%xmm3
        pshufd  $(0x39),%xmm3,%xmm3
        movaps  %xmm1,16(%edi,%edx)
        movaps  %xmm2,32(%edi,%edx)
        movaps  %xmm4,%xmm0
        movaps  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done

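/*
 * LMod4 (and LMod12) work in the float domain: they merge the low doubleword
 * of the next aligned vector with movss and rotate it into place with pshufd.
 * An intrinsics sketch of the idea, one 16-byte vector per iteration (function
 * name illustrative; dst 16-byte aligned, src 4 bytes past alignment, len a
 * multiple of 16):
 *
 *      #include <xmmintrin.h>
 *      static void copy_chunks_mod4(char *dst, const char *src, unsigned long len) {
 *          __m128 prev = _mm_load_ps((const float *)(src - 4));    // aligned, straddles src
 *          for (unsigned long i = 0; i < len; i += 16) {
 *              __m128 next = _mm_load_ps((const float *)(src - 4 + i + 16));
 *              __m128 v = _mm_move_ss(prev, next);                 // low 4 bytes <- next
 *              v = _mm_shuffle_ps(v, v, 0x39);                     // rotate right 4 bytes
 *              _mm_store_ps((float *)(dst + i), v);                // 16 true source bytes
 *              prev = next;
 *          }
 *      }
 */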

// Forward loop for medium length operands in which low four bits of %esi == 0101

LMod5:
        movdqa  -5(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  11(%esi,%edx),%xmm1
        movdqa  27(%esi,%edx),%xmm2
        movdqa  43(%esi,%edx),%xmm3
        movdqa  59(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $5,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $5,%xmm2,%xmm3
        palignr $5,%xmm1,%xmm2
        palignr $5,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0110

LMod6:
        movdqa  -6(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  10(%esi,%edx),%xmm1
        movdqa  26(%esi,%edx),%xmm2
        movdqa  42(%esi,%edx),%xmm3
        movdqa  58(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $6,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $6,%xmm2,%xmm3
        palignr $6,%xmm1,%xmm2
        palignr $6,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 0111

LMod7:
        movdqa  -7(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  9(%esi,%edx),%xmm1
        movdqa  25(%esi,%edx),%xmm2
        movdqa  41(%esi,%edx),%xmm3
        movdqa  57(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $7,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $7,%xmm2,%xmm3
        palignr $7,%xmm1,%xmm2
        palignr $7,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1000
// We use the float double data type in order to use "shufpd" to shift by 8 bytes.

LMod8:
        cmpl    $(-kFastUCode),%edx     // %edx == -length, where (length < kVeryLong)
        jle     Lfastpath               // long enough for fastpath in microcode
        movapd  -8(%esi,%edx),%xmm0     // 8-byte aligned: prime the loop
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        movapd  8(%esi,%edx),%xmm1
        movapd  24(%esi,%edx),%xmm2
        shufpd  $01,%xmm1,%xmm0         // %xmm0 <- shr( %xmm0 || %xmm1, 8 bytes)
        movapd  40(%esi,%edx),%xmm3
        shufpd  $01,%xmm2,%xmm1
        movapd  56(%esi,%edx),%xmm4
        shufpd  $01,%xmm3,%xmm2

        movapd  %xmm0,(%edi,%edx)
        shufpd  $01,%xmm4,%xmm3
        movapd  %xmm1,16(%edi,%edx)
        movapd  %xmm2,32(%edi,%edx)
        movapd  %xmm4,%xmm0
        movapd  %xmm3,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done

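/*
 * LMod8 works in the double domain: with the source 8 bytes off alignment,
 * one shufpd per vector glues the high half of the previous aligned load to
 * the low half of the next one.  An intrinsics sketch, one 16-byte vector per
 * iteration (function name illustrative; dst 16-byte aligned, src 8 bytes past
 * alignment, len a multiple of 16):
 *
 *      #include <emmintrin.h>
 *      static void copy_chunks_mod8(char *dst, const char *src, unsigned long len) {
 *          __m128d prev = _mm_load_pd((const double *)(src - 8));  // aligned, straddles src
 *          for (unsigned long i = 0; i < len; i += 16) {
 *              __m128d next = _mm_load_pd((const double *)(src - 8 + i + 16));
 *              __m128d v = _mm_shuffle_pd(prev, next, 1);          // { high(prev), low(next) }
 *              _mm_store_pd((double *)(dst + i), v);               // 16 true source bytes
 *              prev = next;
 *          }
 *      }
 */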

// Forward loop for medium length operands in which low four bits of %esi == 1001

LMod9:
        movdqa  -9(%esi,%edx),%xmm0     // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  7(%esi,%edx),%xmm1
        movdqa  23(%esi,%edx),%xmm2
        movdqa  39(%esi,%edx),%xmm3
        movdqa  55(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $9,%xmm3,%xmm4          // dest <- shr( dest || source, imm*8 )
        palignr $9,%xmm2,%xmm3
        palignr $9,%xmm1,%xmm2
        palignr $9,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1010

LMod10:
        movdqa  -10(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  6(%esi,%edx),%xmm1
        movdqa  22(%esi,%edx),%xmm2
        movdqa  38(%esi,%edx),%xmm3
        movdqa  54(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $10,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $10,%xmm2,%xmm3
        palignr $10,%xmm1,%xmm2
        palignr $10,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1011

LMod11:
        movdqa  -11(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  5(%esi,%edx),%xmm1
        movdqa  21(%esi,%edx),%xmm2
        movdqa  37(%esi,%edx),%xmm3
        movdqa  53(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $11,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $11,%xmm2,%xmm3
        palignr $11,%xmm1,%xmm2
        palignr $11,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1100
// We use the float single data type in order to use "movss" to merge vectors.

LMod12:
        movss   (%esi,%edx),%xmm0       // prefetch 1st four bytes of source, right justified
        jmp     1f
        .align  4,0x90
1:                                      // loop over 64-byte chunks
        pshufd  $(0x93),4(%esi,%edx),%xmm1  // load and rotate right 12 bytes (mask -- 10 01 00 11)
        pshufd  $(0x93),20(%esi,%edx),%xmm2
        pshufd  $(0x93),36(%esi,%edx),%xmm3
        pshufd  $(0x93),52(%esi,%edx),%xmm4

        movaps  %xmm4,%xmm5
        movss   %xmm3,%xmm4             // copy low 4 bytes of source into destination
        movss   %xmm2,%xmm3
        movss   %xmm1,%xmm2
        movss   %xmm0,%xmm1

        movaps  %xmm1,(%edi,%edx)
        movaps  %xmm2,16(%edi,%edx)
        movaps  %xmm5,%xmm0
        movaps  %xmm3,32(%edi,%edx)
        movaps  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1101

LMod13:
        movdqa  -13(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  3(%esi,%edx),%xmm1
        movdqa  19(%esi,%edx),%xmm2
        movdqa  35(%esi,%edx),%xmm3
        movdqa  51(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $13,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $13,%xmm2,%xmm3
        palignr $13,%xmm1,%xmm2
        palignr $13,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1110

LMod14:
        movdqa  -14(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  2(%esi,%edx),%xmm1
        movdqa  18(%esi,%edx),%xmm2
        movdqa  34(%esi,%edx),%xmm3
        movdqa  50(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $14,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $14,%xmm2,%xmm3
        palignr $14,%xmm1,%xmm2
        palignr $14,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Forward loop for medium length operands in which low four bits of %esi == 1111

LMod15:
        movdqa  -15(%esi,%edx),%xmm0    // prime the loop by loading 1st source dq
1:                                      // loop over 64-byte chunks
        movdqa  1(%esi,%edx),%xmm1
        movdqa  17(%esi,%edx),%xmm2
        movdqa  33(%esi,%edx),%xmm3
        movdqa  49(%esi,%edx),%xmm4

        movdqa  %xmm0,%xmm5
        movdqa  %xmm4,%xmm0

        palignr $15,%xmm3,%xmm4         // dest <- shr( dest || source, imm*8 )
        palignr $15,%xmm2,%xmm3
        palignr $15,%xmm1,%xmm2
        palignr $15,%xmm5,%xmm1

        movdqa  %xmm1,(%edi,%edx)
        movdqa  %xmm2,16(%edi,%edx)
        movdqa  %xmm3,32(%edi,%edx)
        movdqa  %xmm4,48(%edi,%edx)

        addl    $64,%edx
        jnz     1b

        jmp     Lshort                  // copy remaining 0..63 bytes and done


// Reverse moves.  These are not optimized as aggressively as their forward
// counterparts, as they are only used with destructive overlap.
//      ecx = length
//      esi = source ptr
//      edi = dest ptr

LReverse:
        addl    %ecx,%esi               // point to end of strings
        addl    %ecx,%edi
        cmpl    $(kShort),%ecx          // long enough to bother with SSE?
        ja      LReverseNotShort        // yes

// Handle reverse short copies.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseShort:
        movl    %ecx,%edx               // copy length
        shrl    $2,%ecx                 // #words
        jz      3f
1:
        subl    $4,%esi
        movl    (%esi),%eax
        subl    $4,%edi
        movl    %eax,(%edi)
        dec     %ecx
        jnz     1b
3:
        andl    $3,%edx                 // bytes?
        jz      5f
4:
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     4b
5:
        movl    8(%ebp),%eax            // get return value (dst ptr) for memcpy/memmove
        popl    %ebx
        popl    %edi
        popl    %esi
        popl    %ebp
        ret

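/*
 * In C terms, the reverse short copy above walks both pointers down from one
 * past the end, doublewords first and then the 0..3 leftover bytes, so an
 * overlapping region is never clobbered before it is read (sketch only; the
 * helper name is illustrative):
 *
 *      static void copy_backward(char *dst_end, const char *src_end, unsigned long len) {
 *          unsigned long words = len >> 2;
 *          while (words--) {                               // 4 bytes at a time, high to low
 *              src_end -= 4;  dst_end -= 4;
 *              *(unsigned int *)dst_end = *(const unsigned int *)src_end;
 *          }
 *          len &= 3;                                       // 0..3 leftover bytes
 *          while (len--)
 *              *--dst_end = *--src_end;
 *      }
 */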
// Handle a reverse move long enough to justify using SSE.
//      ecx = length
//      esi = one byte past end of source
//      edi = one byte past end of dest

LReverseNotShort:
        movl    %edi,%edx               // copy destination
        andl    $15,%edx                // get #bytes to align destination
        je      LReverseDestAligned     // already aligned
        subl    %edx,%ecx               // adjust length
1:                                      // loop copying 1..15 bytes
        dec     %esi
        movb    (%esi),%al
        dec     %edi
        movb    %al,(%edi)
        dec     %edx
        jnz     1b

// Destination is now aligned.  Prepare for reverse loops.

LReverseDestAligned:
        movl    %ecx,%edx               // copy length
        andl    $63,%ecx                // get remaining bytes for Lshort
        andl    $-64,%edx               // get number of bytes we will copy in inner loop
        subl    %edx,%esi               // point to endpoint of copy
        subl    %edx,%edi
        testl   $15,%esi                // is source aligned too?
        jnz     LReverseUnalignedLoop   // no

LReverseAlignedLoop:                    // loop over 64-byte chunks
        movdqa  -16(%esi,%edx),%xmm0
        movdqa  -32(%esi,%edx),%xmm1
        movdqa  -48(%esi,%edx),%xmm2
        movdqa  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseAlignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done


// Reverse, unaligned loop.  LDDQU == MOVDQU on these machines.

LReverseUnalignedLoop:                  // loop over 64-byte chunks
        movdqu  -16(%esi,%edx),%xmm0
        movdqu  -32(%esi,%edx),%xmm1
        movdqu  -48(%esi,%edx),%xmm2
        movdqu  -64(%esi,%edx),%xmm3

        movdqa  %xmm0,-16(%edi,%edx)
        movdqa  %xmm1,-32(%edi,%edx)
        movdqa  %xmm2,-48(%edi,%edx)
        movdqa  %xmm3,-64(%edi,%edx)

        subl    $64,%edx
        jne     LReverseUnalignedLoop

        jmp     LReverseShort           // copy remaining 0..63 bytes and done

PLATFUNC_DESCRIPTOR(bcopy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memcpy,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)
PLATFUNC_DESCRIPTOR(memmove,sse3x,kHasSSE2|kHasSupplementalSSE3|kCache64,kHasSSE4_2)