/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#include <machine/cpu_capabilities.h>

/* We use mode-independent "g" opcodes such as "srgi". These expand
 * into word operations when targeting __ppc__, and into doubleword
 * operations when targeting __ppc64__.
 */
#include <architecture/ppc/mode_independent_asm.h>
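
/* For example (per the expansions in mode_independent_asm.h): "srgi. r0,r4,4"
 * assembles as "srwi. r0,r4,4" when targeting __ppc__ and as "srdi. r0,r4,4"
 * when targeting __ppc64__; "cmplgi" likewise expands to "cmplwi" or "cmpldi".
 */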
31 | ||
32 | ||
33 | .text | |
34 | #define kShort 128 // threshold for calling commpage | |
35 | ||
36 | ||
37 | /* *************** | |
38 | * * M E M S E T * | |
39 | * *************** | |
40 | * | |
41 | * Registers we use: | |
42 | * r3 = original ptr, not changed since memset returns it | |
43 | * r4 = count of bytes to set | |
44 | * r7 = value to set | |
45 | * r8 = working operand ptr | |
46 | */ | |
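
/* A minimal C sketch (illustrative only, not part of the original source) of the
 * byte replication performed by the andi./rlwimi sequence in _memset below;
 * replicate_byte() is a hypothetical helper name:
 *
 *      #include <stdint.h>
 *
 *      static uint32_t replicate_byte(uint32_t c) {
 *          uint32_t v = c & 0xFF;      // andi.  r7,r4,0xFF
 *          v |= v << 8;                // rlwimi r7,r7,8,16,23  (value in low 2 bytes)
 *          v |= v << 16;               // rlwimi r7,r7,16,0,15  (value in all 4 bytes)
 *          return v;
 *      }
 */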
47 | ||
48 | .globl _memset | |
49 | .align 5 | |
50 | _memset: // void * memset(void *b, int c, size_t len); | |
51 | andi. r7,r4,0xFF // copy value to working register, test for 0 | |
224c7076 | 52 | mr r4,r5 // move length to working register |
3d9156a7 A |
53 | cmplgi cr1,r5,kShort // long enough to bother with _COMM_PAGE_MEMSET_PATTERN? |
54 | beqa++ _COMM_PAGE_BZERO // if (c==0), map to bzero() | |
55 | rlwimi r7,r7,8,16,23 // replicate nonzero value to low 2 bytes | |
224c7076 A |
56 | neg r5,r3 // start to compute #bytes to align |
57 | mr r8,r3 // make working copy of operand ptr | |
3d9156a7 | 58 | rlwimi r7,r7,16,0,15 // value now in all 4 bytes |
224c7076 | 59 | blt cr1,Lmemset3 // too short to use commpage |
3d9156a7 A |
60 | andi. r0,r5,0xF // r0 <- #bytes to align on quadword |
61 | ||
62 | // Align ptr and store enough so that we have an aligned 16-byte pattern. | |
63 | ||
64 | stw r7,0(r8) | |
65 | stw r7,4(r8) | |
66 | stw r7,8(r8) | |
67 | stw r7,12(r8) | |
3d9156a7 A |
68 | beq Lmemset1 // skip if (r0==0), ie if r8 is 16-byte aligned |
69 | add r8,r8,r0 // 16-byte align ptr | |
70 | sub r4,r4,r0 // adjust length | |
71 | stw r7,0(r8) // now we can store an aligned 16-byte pattern | |
72 | stw r7,4(r8) | |
73 | stw r7,8(r8) | |
74 | stw r7,12(r8) | |
75 | ||
76 | // Call machine-specific commpage routine, which expects: | |
77 | // r4 = count (>=32) | |
78 | // r8 = ptr (16-byte aligned) to memory to store | |
79 | // r9 = ptr (16-byte aligned) to 16-byte pattern to store | |
80 | // When it returns: | |
81 | // r3, r7, and r12 are preserved | |
82 | // r4 and r8 are updated to reflect a residual count of from 0..31 bytes | |
83 | ||
84 | Lmemset1: | |
85 | mflr r12 // save return address | |
86 | mr r9,r8 // point to 16-byte-aligned 16-byte pattern | |
87 | addi r8,r8,16 // point to first unstored byte | |
88 | subi r4,r4,16 // account for the aligned bytes we have stored | |
224c7076 | 89 | bla _COMM_PAGE_MEMSET_PATTERN |
3d9156a7 A |
90 | mtlr r12 |
91 | ||
92 | // Here for short nonzero memset. | |
93 | // r4 = count (<= kShort bytes) | |
94 | // r7 = pattern in all four bytes | |
95 | // r8 = ptr | |
96 | Lmemset3: | |
97 | srgi. r0,r4,4 // any 16-byte chunks? | |
98 | mtcrf 0x01,r4 // move length remaining to cr7 so we can test bits | |
99 | beq Lmemset5 // fewer than 16 bytes | |
100 | mtctr r0 | |
101 | b Lmemset4 // enter loop | |
102 | ||
103 | .align 5 | |
104 | Lmemset4: // loop over 16-byte chunks | |
105 | stw r7,0(r8) | |
106 | stw r7,4(r8) | |
107 | stw r7,8(r8) | |
108 | stw r7,12(r8) | |
109 | addi r8,r8,16 | |
110 | bdnz++ Lmemset4 | |
111 | ||
112 | // Handle last 0..15 bytes. | |
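// (The "mtcrf 0x01,r4" above put the low 4 bits of the count into cr7, so bits
// 28/29/30/31 indicate whether 8/4/2/1 bytes remain; each "bf" below skips the
// corresponding store when its bit is clear.)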
Lmemset5:
        bf      28,2f
        stw     r7,0(r8)
        stw     r7,4(r8)
        addi    r8,r8,8
2:
        bf      29,3f
        stw     r7,0(r8)
        addi    r8,r8,4
3:
        bf      30,4f
        sth     r7,0(r8)
        addi    r8,r8,2
4:
        bflr    31
        stb     r7,0(r8)
        blr


/* ***********************************
 * * M E M S E T _ P A T T E R N 1 6 *
 * ***********************************
 *
 * Used to store a 16-byte pattern in memory:
 *
 *      void memset_pattern16(void *b, const void *c16, size_t len);
 *
 * Where c16 points to the 16-byte pattern. None of the parameters need be aligned.
 */
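
/* A minimal usage sketch in C (illustrative only; assumes the Darwin <string.h>
 * declaration of memset_pattern16, and fill_with_pattern() is just a hypothetical
 * wrapper name):
 *
 *      #include <string.h>
 *      #include <stdint.h>
 *      #include <stddef.h>
 *
 *      void fill_with_pattern(uint8_t *buf, size_t len) {
 *          static const uint8_t pat[16] = { 0xDE,0xAD,0xBE,0xEF, 0xDE,0xAD,0xBE,0xEF,
 *                                           0xDE,0xAD,0xBE,0xEF, 0xDE,0xAD,0xBE,0xEF };
 *          memset_pattern16(buf, pat, len);    // neither buf, pat, nor len need be aligned
 *      }
 */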
142 | ||
224c7076 | 143 | .globl _memset_pattern16 |
3d9156a7 | 144 | .align 5 |
224c7076 | 145 | _memset_pattern16: |
3d9156a7 A |
146 | cmplgi cr1,r5,kShort // check length |
147 | lwz r7,0(r4) // load pattern into (these remain lwz in 64-bit mode) | |
148 | lwz r9,4(r4) | |
149 | neg r6,r3 // start to compute ptr alignment | |
150 | lwz r10,8(r4) | |
151 | lwz r11,12(r4) | |
152 | b __memset_pattern_common | |
153 | ||
154 | ||
224c7076 A |
155 | /* ********************************* |
156 | * * M E M S E T _ P A T T E R N 8 * | |
157 | * ********************************* | |
3d9156a7 A |
158 | * |
159 | * Used to store an 8-byte pattern in memory: | |
160 | * | |
224c7076 | 161 | * void memset_pattern8(void *b, const void *c8, size_t len); |
3d9156a7 A |
162 | * |
163 | * Where c8 points to the 8-byte pattern. None of the parameters need be aligned. | |
164 | */ | |
165 | ||
224c7076 | 166 | .globl _memset_pattern8 |
3d9156a7 | 167 | .align 5 |
224c7076 | 168 | _memset_pattern8: |
3d9156a7 A |
169 | lwz r7,0(r4) // load pattern (these remain lwz in 64-bit mode) |
170 | lwz r9,4(r4) | |
171 | cmplgi cr1,r5,kShort // check length | |
172 | neg r6,r3 // start to compute ptr alignment | |
173 | mr r10,r7 // replicate into 16-byte pattern | |
174 | mr r11,r9 | |
175 | b __memset_pattern_common | |
176 | ||
177 | ||
224c7076 A |
178 | /* ********************************* |
179 | * * M E M S E T _ P A T T E R N 4 * | |
180 | * ********************************* | |
3d9156a7 A |
181 | * |
182 | * Used to store a 4-byte pattern in memory: | |
183 | * | |
224c7076 | 184 | * void memset_pattern4(void *b, const void *c4, size_t len); |
3d9156a7 A |
185 | * |
186 | * Where c4 points to the 4-byte pattern. None of the parameters need be aligned. | |
187 | */ | |
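
/* A minimal C sketch of the common memset_pattern4 use case, filling a buffer with
 * a repeated 32-bit value (illustrative only; assumes the Darwin <string.h>
 * declaration, and fill_words() is just a hypothetical wrapper name):
 *
 *      #include <string.h>
 *      #include <stdint.h>
 *      #include <stddef.h>
 *
 *      void fill_words(uint32_t *buf, size_t count, uint32_t value) {
 *          memset_pattern4(buf, &value, count * sizeof(uint32_t));
 *      }
 */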
188 | ||
224c7076 | 189 | .globl _memset_pattern4 |
3d9156a7 | 190 | .align 5 |
224c7076 | 191 | _memset_pattern4: |
3d9156a7 A |
192 | lwz r7,0(r4) // load pattern |
193 | cmplgi cr1,r5,kShort // check length | |
194 | neg r6,r3 // start to compute ptr alignment | |
195 | mr r9,r7 // replicate into 16-byte pattern | |
196 | mr r10,r7 | |
197 | mr r11,r7 | |
198 | b __memset_pattern_common // don't fall through because of scatter-loading | |
199 | ||
200 | ||
201 | /* *********************************************** | |
202 | * * _ M E M S E T _ P A T T E R N _ C O M M O N * | |
203 | * *********************************************** | |
204 | * | |
224c7076 | 205 | * This is the common code used by _memset_pattern16, 8, and 4. They all get here via |
3d9156a7 A |
206 | * long branch (ie, "b") in case the routines are re-ordered, with: |
207 | * r3 = ptr to memory to store pattern into (unaligned) | |
208 | * r5 = length in bytes | |
209 | * r6 = neg(r3), used to compute #bytes to align | |
210 | * r7, r9, r10, r11 = 16-byte pattern to store | |
211 | * cr1= ble if (r5 <= kShort) | |
212 | */ | |
213 | ||
214 | .globl __memset_pattern_common | |
224c7076 | 215 | .private_extern __memset_pattern_common // avoid dyld stub, which trashes r11 |
3d9156a7 A |
216 | .align 5 |
217 | __memset_pattern_common: | |
218 | andi. r0,r6,0xF // get #bytes to 16-byte align ptr | |
219 | ble-- cr1,LShort // if short operand skip out | |
220 | ||
221 | // Align ptr and store enough of pattern so we have an aligned | |
222 | // 16-byte chunk of it (this effectively rotates incoming pattern | |
223 | // if the original ptr was not aligned.) | |
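// (For example: with an incoming pattern "ABCDEFGHIJKLMNOP" and a ptr 4 bytes
// past a 16-byte boundary, r0 = 12 and the aligned 16-byte chunk at ptr+12
// holds "MNOPABCDEFGHIJKL"; the bulk stores continue with this rotated pattern,
// matching the bytes already laid down.)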
224 | ||
225 | stw r7,0(r3) | |
226 | stw r9,4(r3) | |
227 | stw r10,8(r3) | |
228 | stw r11,12(r3) | |
229 | beq Laligned // skip if (r0==0), ie if r3 is 16-byte aligned | |
230 | stw r7,16(r3) | |
231 | stw r9,20(r3) | |
232 | stw r10,24(r3) | |
233 | stw r11,28(r3) | |
234 | add r3,r3,r0 // 16-byte align ptr | |
235 | sub r5,r5,r0 // adjust length | |
236 | ||
237 | // We're ready to call the machine-specific commpage routine | |
238 | // to do the heavy lifting. When called, _COMM_PAGE_MEMSET_PATTERN expects: | |
239 | // r4 = length (>= 32) | |
240 | // r8 = ptr (16-byte aligned) | |
241 | // r9 = ptr to 16-byte pattern (16-byte aligned) | |
242 | // When it returns: | |
243 | // r3, r7, and r12 are preserved | |
244 | // r4 and r8 are updated to reflect a residual count of from 0..31 bytes | |
245 | ||
246 | Laligned: | |
247 | mflr r12 // save return across commpage call | |
248 | mr r9,r3 // point to 16-byte aligned 16-byte pattern | |
249 | addi r8,r3,16 // point to first unstored byte (r8 is 16-byte aligned) | |
250 | subi r4,r5,16 // account for the aligned bytes we have stored | |
251 | bla _COMM_PAGE_MEMSET_PATTERN | |
252 | mr. r5,r4 // move length (0..31) back to original reg and test for 0 | |
253 | mtlr r12 | |
254 | beqlr // done if residual length == 0 | |
255 | lwz r7,-16(r8) // load aligned pattern into r7,r9,r10, and r11 | |
256 | lwz r9,-12(r8) | |
257 | mr r3,r8 // move destination ptr back | |
258 | lwz r10,-8(r8) | |
259 | lwz r11,-4(r8) | |
260 | ||
261 | // Handle short operands and leftovers. | |
262 | // r3 = dest | |
263 | // r5 = length | |
264 | // r7,r9,r10,r11 = pattern | |
265 | LShort: | |
266 | srgi. r0,r5,4 // at least 16 bytes? | |
267 | mtcrf 0x01,r5 // move leftover count to cr7 | |
268 | beq Lleftovers | |
269 | mtctr r0 | |
270 | LShortLoop: | |
271 | stw r7,0(r3) // replicate the pattern | |
272 | stw r9,4(r3) | |
273 | stw r10,8(r3) | |
274 | stw r11,12(r3) | |
275 | addi r3,r3,16 | |
276 | bdnz LShortLoop // store 16 more bytes | |
277 | ||
278 | // Fewer than 16 bytes remaining. | |
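// (Note how the pattern stays in phase: after each partial store the remaining
// pattern words are shifted toward r7, so r7 always holds the next bytes to be
// stored.)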
Lleftovers:
        bf      28,1f
        stw     r7,0(r3)                // store next 8 bytes
        stw     r9,4(r3)
        addi    r3,r3,8
        mr      r7,r10                  // shift pattern over
        mr      r9,r11
1:
        bf      29,2f
        stw     r7,0(r3)
        addi    r3,r3,4
        mr      r7,r9
2:
        bf      30,3f
        rlwinm  r7,r7,16,0,31           // position leftmost 2 bytes for store
        sth     r7,0(r3)
        addi    r3,r3,2
3:
        bflr    31
        srwi    r7,r7,24                // position leftmost byte for store
        stb     r7,0(r3)
        blr