[apple/xnu.git] / osfmk / ppc / commpage / bcopy_970.s

/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 * 
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0  = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3  = not used, as memcpy and memmove return 1st parameter as a value
 *   r4  = source ptr ("rs")
 *   r5  = count of bytes to move ("rc")
 *   r6  = "w1", "c16", or "cm17"
 *   r7  = "w2", "c32", or "cm33"
 *   r8  = "w3", "c48", or "cm49"
 *   r9  = "w4",        or "cm1"
 *   r10 = vrsave ("rv")
 *   r11 = unused
 *   r12 = destination ptr ("rd")
 *   v0  = permute vector ("vp") 
 * v1-v8 = qw's loaded from source
 *v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
// Symbolic names for the registers used below.
// Source pointer, destination pointer, byte count, and saved VRSave:
#define rs	r4
#define rd	r12
#define rc	r5
#define	rv	r10

// Scalar work registers.
#define w1	r6
#define w2	r7
#define w3	r8
#define	w4	r9

// Index constants for lvx/stvx.  These names alias the same GPRs as the
// work registers above (r6 = w1/c16/cm17, r7 = w2/c32/cm33,
// r8 = w3/c48/cm49, r9 = w4/cm1), so each register can serve only one of
// its roles at any given point in the code.
#define c16		r6
#define cm17	r6
#define c32		r7
#define cm33	r7
#define c48		r8
#define cm49	r8
#define cm1		r9

// Vector registers: vp holds the permute control vector produced by lvsl,
// vw/vx/vy/vz receive the vperm results that are stored to the destination.
#define	vp	v0
#define	vw	v9
#define	vx	v10
#define	vy	v11
#define	vz	v12

#define	ASSEMBLER
#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
        .globl	EXT(bcopy_970)


// Length thresholds, in bytes.
//   kShort:    both entry points send operands shorter than this to LShort;
//              everything else goes to the long-operand code (LLong0/LLong1).
//   kVeryLong: LFwdLongVectors branches to _COMM_PAGE_BIGCOPY when the length
//              remaining after destination alignment is at least this value.
//              It is materialized with "lis w3,kVeryLong>>16", so its low
//              16 bits must remain zero.
#define	kShort		64
#define	kVeryLong	(128*1024)


// Main entry points.

        .align 	5
// bcopy() entry.  Arguments arrive as r3 = source, r4 = destination,
// r5 = length (rc).  Because rs is r4, the destination has to be copied into
// rd (r12) before r3 is moved into rs.  w1 is set to (destination - source),
// which the common code compares against rc to choose the copy direction.
// This entry is exactly eight instructions long; together with the
// ".align 5" in front of the memcpy/memmove entry, that is what puts the
// second entry point 8 words after this one.
bcopy_970:							// void bcopy(const void *src, void *dst, size_t len)
        cmplwi	rc,kShort			// short or long?  (cr0 is consumed by the blt below)
        sub		w1,r4,r3			// must move in reverse if (rd-rs)<rc
        mr		rd,r4				// move registers to canonic spot
        mr		rs,r3
        blt		LShort				// handle short operands
        dcbt	0,rs				// touch in the first line of source
        dcbtst	0,rd				// touch in destination
        b		LLong1				// join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
//
// memcpy() and memmove() share this entry, so memcpy() gets the same
// overlap-safe handling as memmove().  Arguments arrive as r3 = destination,
// r4 = source, r5 = length (rc).  The source is already in rs (r4); only the
// destination is copied, to rd, so r3 survives as the return value.
// Operands of kShort bytes or more branch to LLong0; shorter ones fall
// through into LShort.

        .align	5
Lmemcpy_970:						// void* memcpy(void *dst, void *src, size_t len)
Lmemmove_970:						// void* memmove(void *dst, const void *src, size_t len)
        cmplwi	rc,kShort			// short or long?
        sub		w1,r3,r4			// must move in reverse if (rd-rs)<rc
        mr		rd,r3				// must leave r3 alone, it is return value for memcpy etc
        bge		LLong0				// handle long operands

// Handle short operands.
//		rs = source
//		rd = destination
//		rc = count
//		w1 = (rd-rs), must move reverse if (rd-rs)<rc
//
// The low bits of the length are copied into cr6 and cr7, and each CR bit
// then gates one power-of-two sized move:
//		bit 26 = 32 bytes, bit 27 = 16, bit 28 = 8, bit 29 = 4,
//		bit 30 = 2,        bit 31 = 1
// The comparison of w1 with rc is unsigned, so a destination below the
// source (where the difference wraps to a large value) takes the forward path.
// Within each step all loads are issued before the stores.
        
LShort:
        cmplw	cr1,w1,rc			// set cr1 blt if we must move reverse
        mtcrf	0x02,rc				// move length to cr6 and cr7 one at a time
        mtcrf	0x01,rc
        blt--	cr1,LShortReverse
        
// Forward short operands.  This is the most frequent case, so it is inline.

        bf		26,0f				// 32-byte chunk to move?
        ld		w1,0(rs)
        ld		w2,8(rs)
        ld		w3,16(rs)
        ld		w4,24(rs)
        addi	rs,rs,32
        std		w1,0(rd)
        std		w2,8(rd)
        std		w3,16(rd)
        std		w4,24(rd)
        addi	rd,rd,32
0:
LShort32:							// the 32-byte loops join here with cr6 and cr7 set from rc
        bf		27,1f				// quadword to move?
        ld		w1,0(rs)
        ld		w3,8(rs)
        addi	rs,rs,16
        std		w1,0(rd)
        std		w3,8(rd)
        addi	rd,rd,16
1:
LShort16:							// join here to xfer 0-15 bytes
        bf		28,2f				// doubleword?
        ld		w1,0(rs)
        addi	rs,rs,8
        std		w1,0(rd)
        addi	rd,rd,8
2:
        bf		29,3f				// word?
        lwz		w1,0(rs)
        addi	rs,rs,4
        stw		w1,0(rd)
        addi	rd,rd,4
3:
        bf		30,4f				// halfword to move?
        lhz		w1,0(rs)
        addi	rs,rs,2
        sth		w1,0(rd)
        addi	rd,rd,2
4:
        bflr	31					// skip if no odd byte (returns to caller)
        lbz		w1,0(rs)
        stb		w1,0(rd)
        blr
        
        
// Handle short reverse operands.
//		cr = length in bits 26-31       
//
// Both pointers are first advanced to one byte past the end of their
// operands.  Each step then copies from the highest addresses downward; the
// last access of every step uses an update form (ldu/stdu, lwzu/stwu,
// lhzu/sthu) so the pointers move down by the size of that step.  The final
// single byte uses no update because nothing follows it.

LShortReverse:
        add		rs,rs,rc			// adjust ptrs for reverse move
        add		rd,rd,rc
        bf		26,0f				// 32 bytes to move?
        ld		w1,-8(rs)
        ld		w2,-16(rs)
        ld		w3,-24(rs)
        ldu		w4,-32(rs)			// last load also moves rs down by 32
        std		w1,-8(rd)
        std		w2,-16(rd)
        std		w3,-24(rd)
        stdu	w4,-32(rd)			// last store also moves rd down by 32
0:
        bf		27,1f				// quadword to move?
        ld		w1,-8(rs)
        ldu		w2,-16(rs)
        std		w1,-8(rd)
        stdu	w2,-16(rd)
1:
LShortReverse16:					// join here to xfer 0-15 bytes and return
        bf		28,2f				// doubleword?
        ldu		w1,-8(rs)
        stdu	w1,-8(rd)
2:
        bf		29,3f				// word?
        lwzu	w1,-4(rs)
        stwu	w1,-4(rd)
3:
        bf		30,4f				// halfword to move?
        lhzu	w1,-2(rs)
        sthu	w1,-2(rd)
4:
        bflr	31					// done if no odd byte
        lbz 	w1,-1(rs)			// no update
        stb 	w1,-1(rd)
        blr
        

// Long operands, use Altivec in most cases.
//		rs = source
//		rd = destination
//		rc = count
//		w1 = (rd-rs), must move reverse if (rd-rs)<rc
//
// Both entry points reach this code only with rc >= kShort.  On the forward
// path up to 15 bytes are moved first so that rd becomes 16-byte aligned;
// rc is reduced by that amount before the chunk count is computed.
// Condition register usage in this block:
//		cr0 = set by the andi. below, tested by "beq LFwdAligned"
//		cr1 = first the direction test, then reused for the 128-byte chunk count
//		cr5 = beq if source and destination have the same offset within a quadword
//		cr7 = low 4 bits of w4, driving the alignment moves

LLong0:								// entry from memmove()
        dcbt	0,rs				// touch in source
        dcbtst	0,rd				// touch in destination
LLong1:								// entry from bcopy() with operands already touched in
        cmplw	cr1,w1,rc			// set cr1 blt iff we must move reverse
        neg		w3,rd				// start to compute #bytes to align destination
        rlwinm	w2,w1,0,0xF			// 16-byte aligned?  (w2==0 if so)
        andi.	w4,w3,0xF			// w4 <- #bytes to 16-byte align destination
        cmpwi	cr5,w2,0			// set cr5 beq if relatively 16-byte aligned
        blt--	cr1,LLongReverse	// handle reverse moves
        sub		rc,rc,w4			// adjust length for aligning destination
        srwi	r0,rc,7				// get #cache lines to copy (may be 0)
        cmpwi	cr1,r0,0			// set cr1 on #chunks
        beq		LFwdAligned			// dest is already aligned
        
// 16-byte align destination.  The moves go 1, 2, 4, then 8 bytes, each gated
// by one bit of w4.  w1 is reused as the data temporary here; w2 is left
// intact because LFwdAligned still needs it.

        mtcrf	0x01,w4				// cr7 <- #bytes to align dest (nonzero)
        bf		31,1f				// byte to move?
        lbz		w1,0(rs)
        addi	rs,rs,1
        stb		w1,0(rd)
        addi	rd,rd,1
1:
        bf		30,2f				// halfword?
        lhz		w1,0(rs)
        addi	rs,rs,2
        sth		w1,0(rd)
        addi	rd,rd,2
2:
        bf		29,3f				// word?
        lwz		w1,0(rs)
        addi	rs,rs,4
        stw		w1,0(rd)
        addi	rd,rd,4
3:
        bf		28,LFwdAligned		// doubleword?
        ld		w1,0(rs)
        addi	rs,rs,8
        std		w1,0(rd)
        addi	rd,rd,8


// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//	   case for cold-cache operands, as any this long will likely be.
//	2. If length>=128 and source is 16-byte aligned, then use the
//	   lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//	3. If length>=128 and source is not 16-byte aligned, then use the
//	   lvx/vperm/stvx loop over 128-byte chunks.
//	4. If length<128 and source is 8-byte aligned, then use the
//	   ld/std loop over 32-byte chunks.
//	5. If length<128 and source is not 8-byte aligned, then use the
//	   lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//		r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//			rs = alignment unknown
//		    rd = 16-byte aligned
//			rc = bytes remaining
//			w2 = low 4 bits of (rd-rs), used to check alignment
//		   cr5 = beq if source is also 16-byte aligned
// Cases 1-3 are handled at LFwdLongVectors.  Since rd is 16-byte aligned here,
// the low bits of w2 reflect the alignment of rs itself.  The 32-byte chunk
// count cannot be zero: rc was at least kShort (64) on entry to the long code
// and at most 15 bytes were used to align the destination.

LFwdAligned:
        andi.	w3,w2,7				// is source at least 8-byte aligned?  (cr0 is tested by the beq below)
        mtcrf	0x01,rc				// move leftover count to cr7 for LShort16
        bne		cr1,LFwdLongVectors	// at least one 128-byte chunk, so use vectors
        srwi	w1,rc,5				// get 32-byte chunk count
        mtcrf	0x02,rc				// move bit 27 of length to cr6 for LShort32
        mtctr	w1					// set up 32-byte loop (w1!=0)
        beq		LFwdMedAligned		// source is 8-byte aligned, so use ld/std loop
        mfspr	rv,vrsave			// get bitmap of live vector registers
        oris	w4,rv,0xFFF8		// we use v0-v12
        li		c16,16				// get constant used in lvx
        li		c32,32
        mtspr	vrsave,w4			// update mask
        lvx		v1,0,rs				// prefetch 1st source quadword
        lvsl	vp,0,rs				// get permute vector to shift left
        
        
// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.
// v1 always holds the aligned quadword containing the next source byte; each
// vperm combines two adjacent aligned quadwords into 16 destination bytes.

1:									// loop over 32-byte chunks
        lvx		v2,c16,rs
        lvx		v3,c32,rs
        addi	rs,rs,32
        vperm	vx,v1,v2,vp
        vperm	vy,v2,v3,vp
        vor		v1,v3,v3			// v1 <- v3
        stvx	vx,0,rd
        stvx	vy,c16,rd
        addi	rd,rd,32
        bdnz	1b
        
        mtspr	vrsave,rv			// restore bitmap of live vr's
        b		LShort32			// finish the last 0-31 bytes with scalar moves

        
// Fewer than 128 bytes and doubleword aligned: use ld/std.
// Entered from LFwdAligned with ctr = number of 32-byte chunks (nonzero) and
// cr6/cr7 already loaded from rc, which is what LShort32 uses to move the
// remaining 0-31 bytes.

        .align	5
LFwdMedAligned:									// loop over 32-byte chunks
        ld		w1,0(rs)
        ld		w2,8(rs)
        ld		w3,16(rs)
        ld		w4,24(rs)
        addi	rs,rs,32
        std		w1,0(rd)
        std		w2,8(rd)
        std		w3,16(rd)
        std		w4,24(rd)
        addi	rd,rd,32
        bdnz	LFwdMedAligned
        
        b		LShort32

        
// Forward, 128 bytes or more: use vectors.  When entered:
//	    r0 = 128-byte chunks to move (>0)
//		rd = 16-byte aligned
//	   cr5 = beq if source is 16-byte aligned
//	   cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//	   ctr = number of 128-byte chunks to move
//	r0/cr0 = leftover QWs to move
//	   cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//	   cr6 = beq if leftover byte count is 0
//		rv = original value of VRSave
// c16,c32,c48 = loaded
// Note that w3 and cr1 are reused here for the kVeryLong comparison, and that
// c48 shares a register with w3, so c48 is loaded only after w3 has been
// consumed by the cmpwi on cr6.  When the branch to _COMM_PAGE_BIGCOPY is
// taken, VRSave has been read into rv but not yet modified.

LFwdLongVectors:
        mfspr	rv,vrsave			// get bitmap of live vector registers
        lis		w3,kVeryLong>>16	// cutoff for very-long-operand special case path
        cmplw	cr1,rc,w3			// very long operand?
        rlwinm	w3,rc,0,28,31		// move last 0-15 byte count to w3
        bgea--	cr1,_COMM_PAGE_BIGCOPY	// handle big copies separately
        mtctr	r0					// set up loop count
        cmpwi	cr6,w3,0			// set cr6 on leftover byte count
        oris	w4,rv,0xFFF8		// we use v0-v12
        rlwinm.	r0,rc,28,29,31		// get number of quadword leftovers (0-7) and set cr0
        li		c16,16				// get constants used in lvx/stvx
        mtspr	vrsave,w4			// update mask
        li		c32,32
        li		c48,48
        beq		cr5,LFwdLongAligned	// source is also 16-byte aligned, no need for vperm
        lvsl	vp,0,rs				// get permute vector to shift left
        lvx		v1,0,rs				// prefetch 1st source quadword
        b		LFwdLongUnaligned	// branch over any padding inserted by the .align below


// Forward, long, unaligned vector loop.
// On entry v1 holds the aligned quadword containing the first source byte and
// vp the lvsl permute vector for rs.  Each pass loads eight further aligned
// quadwords (the last one into v1, as look-ahead for the next pass) and
// produces eight destination quadwords, each a vperm of two neighbouring
// source quadwords.  w4 serves as a second base register (rs+64 for the
// loads, then rd+64 for the stores) so that the three index constants
// c16/c32/c48 cover all 128 bytes.  The loads, permutes, and stores are
// interleaved, so the instruction order here is deliberate.

        .align	5					// align inner loops
LFwdLongUnaligned:					// loop over 128-byte chunks
        addi	w4,rs,64
        lvx		v2,c16,rs
        lvx		v3,c32,rs
        lvx		v4,c48,rs
        lvx		v5,0,w4
        lvx		v6,c16,w4
        vperm	vw,v1,v2,vp
        lvx		v7,c32,w4
        lvx		v8,c48,w4
        addi	rs,rs,128
        vperm	vx,v2,v3,vp
        addi	w4,rd,64
        lvx		v1,0,rs				// look-ahead quadword, also used by the last vperm below
        stvx	vw,0,rd
        vperm	vy,v3,v4,vp
        stvx	vx,c16,rd
        vperm	vz,v4,v5,vp
        stvx	vy,c32,rd
        vperm	vw,v5,v6,vp
        stvx	vz,c48,rd
        vperm	vx,v6,v7,vp
        addi	rd,rd,128
        stvx	vw,0,w4
        vperm	vy,v7,v8,vp
        stvx	vx,c16,w4
        vperm	vz,v8,v1,vp
        stvx	vy,c32,w4
        stvx	vz,c48,w4
        bdnz	LFwdLongUnaligned

        beq		4f					// no leftover quadwords (cr0 was set in LFwdLongVectors)
        mtctr	r0
3:									// loop over remaining quadwords
        lvx		v2,c16,rs
        addi	rs,rs,16
        vperm	vx,v1,v2,vp
        vor		v1,v2,v2			// v1 <- v2
        stvx	vx,0,rd
        addi	rd,rd,16
        bdnz	3b
4:
        mtspr	vrsave,rv			// restore bitmap of live vr's
        bne		cr6,LShort16		// handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.
// Source and destination are both 16-byte aligned, so quadwords are loaded
// and stored directly with no permute.  All eight loads of a chunk are
// issued before its eight stores.  w4 serves as a second base register
// (rs+64 for the loads, then rd+64 for the stores).  On entry ctr holds the
// number of 128-byte chunks, r0/cr0 the leftover quadword count, and cr6/cr7
// describe the final 0-15 bytes.

        .align	5
LFwdLongAligned:        			// loop over 128-byte chunks
        addi	w4,rs,64
        lvx		v1,0,rs
        lvx		v2,c16,rs
        lvx		v3,c32,rs
        lvx		v4,c48,rs
        lvx		v5,0,w4
        lvx		v6,c16,w4
        lvx		v7,c32,w4
        lvx		v8,c48,w4
        addi	rs,rs,128
        addi	w4,rd,64
        stvx	v1,0,rd 
        stvx	v2,c16,rd
        stvx	v3,c32,rd
        stvx	v4,c48,rd
        stvx	v5,0,w4
        stvx	v6,c16,w4
        stvx	v7,c32,w4
        stvx	v8,c48,w4
        addi	rd,rd,128
        bdnz	LFwdLongAligned
                
        beq		4f					// no leftover quadwords (cr0 was set in LFwdLongVectors)
        mtctr	r0
3:									// loop over remaining quadwords (1-7)
        lvx		v1,0,rs
        addi	rs,rs,16
        stvx	v1,0,rd
        addi	rd,rd,16
        bdnz	3b
4:
        mtspr	vrsave,rv			// restore bitmap of live vr's
        bne		cr6,LShort16		// handle last 0-15 bytes if any
        blr
        

// Long, reverse moves.
//		rs = source
//		rd = destination
//		rc = count
//	   cr5 = beq if relatively 16-byte aligned
//
// Both pointers are moved to one byte past the end of their operands and the
// copy then proceeds toward lower addresses.  The bytes above the highest
// 16-byte boundary of the destination (rd & 0xF of them) are moved one at a
// time so that rd becomes 16-byte aligned.

LLongReverse:
        add		rd,rd,rc			// point to end of operands
        add		rs,rs,rc
        andi.	r0,rd,0xF			// #bytes to 16-byte align destination
        beq		2f					// already aligned
        
// 16-byte align destination.

        mtctr	r0					// set up for loop
        sub		rc,rc,r0
1:
        lbzu	w1,-1(rs)
        stbu	w1,-1(rd)
        bdnz	1b

// Prepare for reverse vector loop.  When entered:
//		rd = 16-byte aligned
//		cr5 = beq if source also 16-byte aligned
// We set up many registers:
//		ctr/cr1 = number of 64-byte chunks to move (may be 0)
//		r0/cr0 = leftover QWs to move
//		cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//		cr6 = beq if leftover byte count is 0
//		cm1 = -1
//		rv = original value of vrsave
// In the aligned case cm17/cm33/cm49 are loaded only when the 64-byte chunk
// loop will actually run; the leftover-quadword loop needs cm1 alone.

2:
        mfspr	rv,vrsave			// get bitmap of live vector registers
        srwi	r0,rc,6				// get count of 64-byte chunks to move (may be 0)
        oris	w1,rv,0xFFF8		// we use v0-v12
        mtcrf	0x01,rc				// prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm	w3,rc,0,28,31		// move last 0-15 byte count to w3 too
        cmpwi	cr1,r0,0			// set cr1 on chunk count
        mtspr	vrsave,w1			// update mask
        mtctr	r0					// set up loop count
        cmpwi	cr6,w3,0			// set cr6 on leftover byte count
        rlwinm.	r0,rc,28,30,31		// get number of quadword leftovers (0-3) and set cr0
        li		cm1,-1				// get constants used in lvx/stvx
        
        bne		cr5,LReverseVecUnal	// handle unaligned operands
        beq		cr1,2f				// no chunks (if no chunks, must be leftover QWs)
        li		cm17,-17
        li		cm33,-33
        li		cm49,-49
        b		1f					// branch over any padding inserted by the .align below

// Long, reverse 16-byte-aligned vector loop.
// rs and rd point one byte past the next quadword to move and are both
// 16-byte aligned, so the offsets -1, -17, -33 and -49 select the four
// quadwords immediately below the pointers (lvx/stvx ignore the low four
// bits of the effective address).  The leftover loop at 3: is also entered
// directly through label 2: when there are no 64-byte chunks.
      
        .align	5					// align inner loops
1:        							// loop over 64-byte chunks
        lvx		v1,cm1,rs
        lvx		v2,cm17,rs
        lvx		v3,cm33,rs
        lvx		v4,cm49,rs
        subi	rs,rs,64
        stvx	v1,cm1,rd
        stvx	v2,cm17,rd
        stvx	v3,cm33,rd
        stvx	v4,cm49,rd
        subi	rd,rd,64
        bdnz	1b
        
        beq		4f					// no leftover quadwords
2:									// r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr	r0
3:									// loop over remaining quadwords (1-3)
        lvx		v1,cm1,rs
        subi	rs,rs,16
        stvx	v1,cm1,rd
        subi	rd,rd,16
        bdnz	3b
4:
        mtspr	vrsave,rv			// restore bitmap of live vr's
        bne		cr6,LShortReverse16	// handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
//		ctr/cr1 = number of 64-byte chunks to move (may be 0)
//		r0/cr0 = leftover QWs to move
//		cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//		cr6 = beq if leftover byte count is 0
//		rv = original value of vrsave
//		cm1 = -1
// v1 holds the aligned quadword containing the byte just below rs.  Each
// destination quadword is a vperm of the next lower source quadword (first
// operand) with the one above it (second operand), using the lvsl vector for
// rs.  cm17 is loaded before the "no chunks" branch because the leftover
// loop at 3: needs it; cm33/cm49 are needed only by the 64-byte loop.

LReverseVecUnal:
        lvsl	vp,0,rs				// get permute vector to shift left
        lvx		v1,cm1,rs			// v1 always looks ahead
        li		cm17,-17
        beq		cr1,2f				// no chunks (if no chunks, must be leftover QWs)
        li		cm33,-33
        li		cm49,-49
        b		1f					// branch over any padding inserted by the .align below
        
        .align	5					// align the inner loops
1:									// loop over 64-byte chunks
        lvx		v2,cm17,rs
        lvx		v3,cm33,rs
        lvx		v4,cm49,rs
        subi	rs,rs,64
        vperm	vx,v2,v1,vp
        lvx		v1,cm1,rs			// look-ahead for the last vperm and the next pass
        vperm	vy,v3,v2,vp
        stvx	vx,cm1,rd
        vperm	vz,v4,v3,vp
        stvx	vy,cm17,rd
        vperm	vx,v1,v4,vp
        stvx	vz,cm33,rd
        stvx	vx,cm49,rd
        subi	rd,rd,64
        bdnz	1b

        beq		4f					// no leftover quadwords
2:									// r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr	r0
3:									// loop over 1-3 quadwords
        lvx		v2,cm17,rs
        subi	rs,rs,16
        vperm	vx,v2,v1,vp
        vor		v1,v2,v2			// v1 <- v2
        stvx	vx,cm1,rd
        subi	rd,rd,16
        bdnz	3b
4:
        mtspr	vrsave,rv			// restore bitmap of live vr's
        bne		cr6,LShortReverse16	// handle last 0-15 bytes iff any
        blr

        // Descriptor associating this routine with the _COMM_PAGE_BCOPY slot.
        // NOTE(review): the macro is defined outside this file; the remaining
        // arguments are presumably the required CPU capabilities
        // (k64Bit+kHasAltivec), the disallowed capabilities (0), and special
        // handling flags (kCommPageMTCRF) - confirm against machine/commpage.h.
        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0,kCommPageMTCRF)
Commit	Line	Data
55e303ae A	1	/*
	2	* Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
e5568f75 A	6	* The contents of this file constitute Original Code as defined in and
	7	* are subject to the Apple Public Source License Version 1.1 (the
	8	* "License"). You may not use this file except in compliance with the
	9	* License. Please obtain a copy of the License at
	10	* http://www.apple.com/publicsource and read it before using this file.
55e303ae	11	*
e5568f75 A	12	* This Original Code and all software distributed under the License are
e5568f75 A	13	* distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
55e303ae A	14	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
55e303ae A	15	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
e5568f75 A	16	* FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT. Please see the
	17	* License for the specific language governing rights and limitations
	18	* under the License.
55e303ae A	19	*
	20	* @APPLE_LICENSE_HEADER_END@
	21	*/
	22	/* =======================================
	23	* BCOPY, MEMCPY, and MEMMOVE for Mac OS X
	24	* =======================================
	25	*
	26	* Version of 6/11/2003, tuned for the IBM 970.
	27	*
	28	*
	29	* Register usage. Note the rather delicate way we assign multiple uses
	30	* to the same register. Beware.
	31	* r0 = temp (NB: cannot use r0 for any constant such as "c16")
	32	* r3 = not used, as memcpy and memmove return 1st parameter as a value
	33	* r4 = source ptr ("rs")
	34	* r5 = count of bytes to move ("rc")
	35	* r6 = "w1", "c16", or "cm17"
	36	* r7 = "w2", "c32", or "cm33"
	37	* r8 = "w3", "c48", or "cm49"
	38	* r9 = "w4", or "cm1"
	39	* r10 = vrsave ("rv")
	40	* r11 = unused
	41	* r12 = destination ptr ("rd")
	42	* v0 = permute vector ("vp")
	43	* v1-v8 = qw's loaded from source
	44	*v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
	45	*/
	46	#define rs r4
	47	#define rd r12
	48	#define rc r5
	49	#define rv r10
	50
	51	#define w1 r6
	52	#define w2 r7
	53	#define w3 r8
	54	#define w4 r9
	55
	56	#define c16 r6
	57	#define cm17 r6
	58	#define c32 r7
	59	#define cm33 r7
	60	#define c48 r8
	61	#define cm49 r8
	62	#define cm1 r9
	63
	64	#define vp v0
	65	#define vw v9
	66	#define vx v10
	67	#define vy v11
	68	#define vz v12
	69
	70	#define ASSEMBLER
	71	#include <sys/appleapiopts.h>
	72	#include <ppc/asm.h>
	73	#include <machine/cpu_capabilities.h>
	74	#include <machine/commpage.h>
	75
	76	.text
	77	.globl EXT(bcopy_970)
	78
	79
	80	#define kShort 64
	81	#define kVeryLong (128*1024)
	82
83
84	// Main entry points.
85
86	.align 5
	87	bcopy_970: // void bcopy(const void *src, void *dst, size_t len)
88	cmplwi rc,kShort // short or long?
89	sub w1,r4,r3 // must move in reverse if (rd-rs)<rc
90	mr rd,r4 // move registers to canonic spot
91	mr rs,r3
92	blt LShort // handle short operands
93	dcbt 0,rs // touch in the first line of source
94	dcbtst 0,rd // touch in destination
95	b LLong1 // join long operand code
96
97	// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.
98
99	.align 5
	100	Lmemcpy_970: // void* memcpy(void *dst, void *src, size_t len)
	101	Lmemmove_970: // void* memmove(void *dst, const void *src, size_t len)
102	cmplwi rc,kShort // short or long?
103	sub w1,r3,r4 // must move in reverse if (rd-rs)<rc
104	mr rd,r3 // must leave r3 alone, it is return value for memcpy etc
105	bge LLong0 // handle long operands
106
107	// Handle short operands.
108	// rs = source
109	// rd = destination
110	// rc = count
111	// w1 = (rd-rs), must move reverse if (rd-rs)<rc
112
113	LShort:
114	cmplw cr1,w1,rc // set cr1 blt if we must move reverse
115	mtcrf 0x02,rc // move length to cr6 and cr7 one at a time
116	mtcrf 0x01,rc
117	blt-- cr1,LShortReverse
118
119	// Forward short operands. This is the most frequent case, so it is inline.
120
121	bf 26,0f // 32-byte chunk to move?
122	ld w1,0(rs)
123	ld w2,8(rs)
124	ld w3,16(rs)
125	ld w4,24(rs)
126	addi rs,rs,32
127	std w1,0(rd)
128	std w2,8(rd)
129	std w3,16(rd)
130	std w4,24(rd)
131	addi rd,rd,32
132	0:
133	LShort32:
134	bf 27,1f // quadword to move?
135	ld w1,0(rs)
136	ld w3,8(rs)
137	addi rs,rs,16
138	std w1,0(rd)
139	std w3,8(rd)
140	addi rd,rd,16
141	1:
142	LShort16: // join here to xfer 0-15 bytes
143	bf 28,2f // doubleword?
144	ld w1,0(rs)
145	addi rs,rs,8
146	std w1,0(rd)
147	addi rd,rd,8
148	2:
149	bf 29,3f // word?
150	lwz w1,0(rs)
151	addi rs,rs,4
152	stw w1,0(rd)
153	addi rd,rd,4
154	3:
155	bf 30,4f // halfword to move?
156	lhz w1,0(rs)
157	addi rs,rs,2
158	sth w1,0(rd)
159	addi rd,rd,2
160	4:
161	bflr 31 // skip if no odd byte
162	lbz w1,0(rs)
163	stb w1,0(rd)
164	blr
165
166
167	// Handle short reverse operands.
168	// cr = length in bits 26-31
169
170	LShortReverse:
171	add rs,rs,rc // adjust ptrs for reverse move
172	add rd,rd,rc
173	bf 26,0f // 32 bytes to move?
174	ld w1,-8(rs)
175	ld w2,-16(rs)
176	ld w3,-24(rs)
177	ldu w4,-32(rs)
178	std w1,-8(rd)
179	std w2,-16(rd)
180	std w3,-24(rd)
181	stdu w4,-32(rd)
182	0:
183	bf 27,1f // quadword to move?
184	ld w1,-8(rs)
185	ldu w2,-16(rs)
186	std w1,-8(rd)
187	stdu w2,-16(rd)
188	1:
189	LShortReverse16: // join here to xfer 0-15 bytes and return
190	bf 28,2f // doubleword?
191	ldu w1,-8(rs)
192	stdu w1,-8(rd)
193	2:
194	bf 29,3f // word?
195	lwzu w1,-4(rs)
196	stwu w1,-4(rd)
197	3:
198	bf 30,4f // halfword to move?
199	lhzu w1,-2(rs)
200	sthu w1,-2(rd)
201	4:
202	bflr 31 // done if no odd byte
203	lbz w1,-1(rs) // no update
204	stb w1,-1(rd)
205	blr
206
207
208	// Long operands, use Altivec in most cases.
209	// rs = source
210	// rd = destination
211	// rc = count
212	// w1 = (rd-rs), must move reverse if (rd-rs)<rc
213
214	LLong0: // entry from memmove()
215	dcbt 0,rs // touch in source
216	dcbtst 0,rd // touch in destination
217	LLong1: // entry from bcopy() with operands already touched in
218	cmplw cr1,w1,rc // set cr1 blt iff we must move reverse
219	neg w3,rd // start to compute #bytes to align destination
220	rlwinm w2,w1,0,0xF // 16-byte aligned? (w2==0 if so)
221	andi. w4,w3,0xF // w4 <- #bytes to 16-byte align destination
222	cmpwi cr5,w2,0 // set cr5 beq if relatively 16-byte aligned
223	blt-- cr1,LLongReverse // handle reverse moves
224	sub rc,rc,w4 // adjust length for aligning destination
225	srwi r0,rc,7 // get #cache lines to copy (may be 0)
226	cmpwi cr1,r0,0 // set cr1 on #chunks
227	beq LFwdAligned // dest is already aligned
228
229	// 16-byte align destination.
230
231	mtcrf 0x01,w4 // cr7 <- #bytes to align dest (nonzero)
232	bf 31,1f // byte to move?
233	lbz w1,0(rs)
234	addi rs,rs,1
235	stb w1,0(rd)
236	addi rd,rd,1
237	1:
238	bf 30,2f // halfword?
239	lhz w1,0(rs)
240	addi rs,rs,2
241	sth w1,0(rd)
242	addi rd,rd,2
243	2:
244	bf 29,3f // word?
245	lwz w1,0(rs)
246	addi rs,rs,4
247	stw w1,0(rd)
248	addi rd,rd,4
249	3:
250	bf 28,LFwdAligned // doubleword?
251	ld w1,0(rs)
252	addi rs,rs,8
253	std w1,0(rd)
254	addi rd,rd,8
255
256
257	// Forward, destination is 16-byte aligned. There are five cases:
258	// 1. If the length>=kVeryLong (ie, several pages), then use the
259	// "bigcopy" path that pulls all the punches. This is the fastest
260	// case for cold-cache operands, as any this long will likely be.
261	// 2. If length>=128 and source is 16-byte aligned, then use the
262	// lvx/stvx loop over 128-byte chunks. This is the fastest
263	// case for hot-cache operands, 2nd fastest for cold.
264	// 3. If length>=128 and source is not 16-byte aligned, then use the
265	// lvx/vperm/stvx loop over 128-byte chunks.
266	// 4. If length<128 and source is 8-byte aligned, then use the
267	// ld/std loop over 32-byte chunks.
268	// 5. If length<128 and source is not 8-byte aligned, then use the
269	// lvx/vperm/stvx loop over 32-byte chunks. This is the slowest case.
270	// Registers at this point:
271	// r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
272	// rs = alignment unknown
273	// rd = 16-byte aligned
274	// rc = bytes remaining
275	// w2 = low 4 bits of (rd-rs), used to check alignment
276	// cr5 = beq if source is also 16-byte aligned
277
278	LFwdAligned:
279	andi. w3,w2,7 // is source at least 8-byte aligned?
280	mtcrf 0x01,rc // move leftover count to cr7 for LShort16
281	bne cr1,LFwdLongVectors // at least one 128-byte chunk, so use vectors
282	srwi w1,rc,5 // get 32-byte chunk count
283	mtcrf 0x02,rc // move bit 27 of length to cr6 for LShort32
284	mtctr w1 // set up 32-byte loop (w1!=0)
285	beq LFwdMedAligned // source is 8-byte aligned, so use ld/std loop
286	mfspr rv,vrsave // get bitmap of live vector registers
287	oris w4,rv,0xFFF8 // we use v0-v12
288	li c16,16 // get constant used in lvx
289	li c32,32
290	mtspr vrsave,w4 // update mask
291	lvx v1,0,rs // prefetch 1st source quadword
292	lvsl vp,0,rs // get permute vector to shift left
293
294
295	// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.
296
297	1: // loop over 32-byte chunks
298	lvx v2,c16,rs
299	lvx v3,c32,rs
300	addi rs,rs,32
301	vperm vx,v1,v2,vp
302	vperm vy,v2,v3,vp
303	vor v1,v3,v3 // v1 <- v3
304	stvx vx,0,rd
305	stvx vy,c16,rd
306	addi rd,rd,32
307	bdnz 1b
308
309	mtspr vrsave,rv // restore bitmap of live vr's
310	b LShort32
311
312
313	// Fewer than 128 bytes and doubleword aligned: use ld/std.
314
315	.align 5
316	LFwdMedAligned: // loop over 32-byte chunks
317	ld w1,0(rs)
318	ld w2,8(rs)
319	ld w3,16(rs)
320	ld w4,24(rs)
321	addi rs,rs,32
322	std w1,0(rd)
323	std w2,8(rd)
324	std w3,16(rd)
325	std w4,24(rd)
326	addi rd,rd,32
327	bdnz LFwdMedAligned
328
329	b LShort32
330
331
332	// Forward, 128 bytes or more: use vectors. When entered:
333	// r0 = 128-byte chunks to move (>0)
334	// rd = 16-byte aligned
335	// cr5 = beq if source is 16-byte aligned
336	// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
337	// We set up many registers:
338	// ctr = number of 128-byte chunks to move
339	// r0/cr0 = leftover QWs to move
340	// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
341	// cr6 = beq if leftover byte count is 0
342	// rv = original value of VRSave
343	// c16,c32,c48 = loaded
344
345	LFwdLongVectors:
346	mfspr rv,vrsave // get bitmap of live vector registers
347	lis w3,kVeryLong>>16 // cutoff for very-long-operand special case path
348	cmplw cr1,rc,w3 // very long operand?
349	rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3
350	bgea-- cr1,_COMM_PAGE_BIGCOPY // handle big copies separately
351	mtctr r0 // set up loop count
352	cmpwi cr6,w3,0 // set cr6 on leftover byte count
353	oris w4,rv,0xFFF8 // we use v0-v12
354	rlwinm. r0,rc,28,29,31 // get number of quadword leftovers (0-7) and set cr0
355	li c16,16 // get constants used in ldvx/stvx
356	mtspr vrsave,w4 // update mask
357	li c32,32
358	li c48,48
359	beq cr5,LFwdLongAligned // source is also 16-byte aligned, no need for vperm
360	lvsl vp,0,rs // get permute vector to shift left
361	lvx v1,0,rs // prefetch 1st source quadword
362	b LFwdLongUnaligned
363
364
365	// Forward, long, unaligned vector loop.
366
367	.align 5 // align inner loops
368	LFwdLongUnaligned: // loop over 128-byte chunks
369	addi w4,rs,64
370	lvx v2,c16,rs
371	lvx v3,c32,rs
372	lvx v4,c48,rs
373	lvx v5,0,w4
374	lvx v6,c16,w4
375	vperm vw,v1,v2,vp
376	lvx v7,c32,w4
377	lvx v8,c48,w4
378	addi rs,rs,128
379	vperm vx,v2,v3,vp
380	addi w4,rd,64
381	lvx v1,0,rs
382	stvx vw,0,rd
383	vperm vy,v3,v4,vp
384	stvx vx,c16,rd
385	vperm vz,v4,v5,vp
386	stvx vy,c32,rd
387	vperm vw,v5,v6,vp
388	stvx vz,c48,rd
389	vperm vx,v6,v7,vp
390	addi rd,rd,128
391	stvx vw,0,w4
392	vperm vy,v7,v8,vp
393	stvx vx,c16,w4
394	vperm vz,v8,v1,vp
395	stvx vy,c32,w4
396	stvx vz,c48,w4
397	bdnz LFwdLongUnaligned
398
399	beq 4f // no leftover quadwords
400	mtctr r0
401	3: // loop over remaining quadwords
402	lvx v2,c16,rs
403	addi rs,rs,16
404	vperm vx,v1,v2,vp
405	vor v1,v2,v2 // v1 <- v2
406	stvx vx,0,rd
407	addi rd,rd,16
408	bdnz 3b
409	4:
410	mtspr vrsave,rv // restore bitmap of live vr's
411	bne cr6,LShort16 // handle last 0-15 bytes if any
412	blr
413
414
415	// Forward, long, 16-byte aligned vector loop.
416
417	.align 5
418	LFwdLongAligned: // loop over 128-byte chunks
419	addi w4,rs,64
420	lvx v1,0,rs
421	lvx v2,c16,rs
422	lvx v3,c32,rs
423	lvx v4,c48,rs
424	lvx v5,0,w4
425	lvx v6,c16,w4
426	lvx v7,c32,w4
427	lvx v8,c48,w4
428	addi rs,rs,128
429	addi w4,rd,64
430	stvx v1,0,rd
431	stvx v2,c16,rd
432	stvx v3,c32,rd
433	stvx v4,c48,rd
434	stvx v5,0,w4
435	stvx v6,c16,w4
436	stvx v7,c32,w4
437	stvx v8,c48,w4
438	addi rd,rd,128
439	bdnz LFwdLongAligned
440
441	beq 4f // no leftover quadwords
442	mtctr r0
443	3: // loop over remaining quadwords (1-7)
444	lvx v1,0,rs
445	addi rs,rs,16
446	stvx v1,0,rd
447	addi rd,rd,16
448	bdnz 3b
449	4:
450	mtspr vrsave,rv // restore bitmap of live vr's
451	bne cr6,LShort16 // handle last 0-15 bytes if any
452	blr
453
454
455	// Long, reverse moves.
456	// rs = source
457	// rd = destination
458	// rc = count
459	// cr5 = beq if relatively 16-byte aligned
460
461	LLongReverse:
462	add rd,rd,rc // point to end of operands
463	add rs,rs,rc
464	andi. r0,rd,0xF // #bytes to 16-byte align destination
465	beq 2f // already aligned
466
467	// 16-byte align destination.
468
469	mtctr r0 // set up for loop
470	sub rc,rc,r0
471	1:
472	lbzu w1,-1(rs)
473	stbu w1,-1(rd)
474	bdnz 1b
475
476	// Prepare for reverse vector loop. When entered:
477	// rd = 16-byte aligned
478	// cr5 = beq if source also 16-byte aligned
479	// We set up many registers:
480	// ctr/cr1 = number of 64-byte chunks to move (may be 0)
481	// r0/cr0 = leftover QWs to move
482	// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
483	// cr6 = beq if leftover byte count is 0
484	// cm1 = -1
485	// rv = original value of vrsave
486
487	2:
488	mfspr rv,vrsave // get bitmap of live vector registers
489	srwi r0,rc,6 // get count of 64-byte chunks to move (may be 0)
490	oris w1,rv,0xFFF8 // we use v0-v12
491	mtcrf 0x01,rc // prepare for moving last 0-15 bytes in LShortReverse16
492	rlwinm w3,rc,0,28,31 // move last 0-15 byte count to w3 too
493	cmpwi cr1,r0,0 // set cr1 on chunk count
494	mtspr vrsave,w1 // update mask
495	mtctr r0 // set up loop count
496	cmpwi cr6,w3,0 // set cr6 on leftover byte count
497	rlwinm. r0,rc,28,30,31 // get number of quadword leftovers (0-3) and set cr0
498	li cm1,-1 // get constants used in ldvx/stvx
499
500	bne cr5,LReverseVecUnal // handle unaligned operands
501	beq cr1,2f // no chunks (if no chunks, must be leftover QWs)
502	li cm17,-17
503	li cm33,-33
504	li cm49,-49
505	b 1f
506
507	// Long, reverse 16-byte-aligned vector loop.
508
509	.align 5 // align inner loops
510	1: // loop over 64-byte chunks
511	lvx v1,cm1,rs
512	lvx v2,cm17,rs
513	lvx v3,cm33,rs
514	lvx v4,cm49,rs
515	subi rs,rs,64
516	stvx v1,cm1,rd
517	stvx v2,cm17,rd
518	stvx v3,cm33,rd
519	stvx v4,cm49,rd
520	subi rd,rd,64
521	bdnz 1b
522
523	beq 4f // no leftover quadwords
524	2: // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
525	mtctr r0
526	3: // loop over remaining quadwords (1-7)
527	lvx v1,cm1,rs
528	subi rs,rs,16
529	stvx v1,cm1,rd
530	subi rd,rd,16
531	bdnz 3b
532	4:
533	mtspr vrsave,rv // restore bitmap of live vr's
534	bne cr6,LShortReverse16 // handle last 0-15 bytes if any
535	blr
536
537
538	// Long, reverse, unaligned vector loop.
539	// ctr/cr1 = number of 64-byte chunks to move (may be 0)
540	// r0/cr0 = leftover QWs to move
541	// cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
542	// cr6 = beq if leftover byte count is 0
543	// rv = original value of vrsave
544	// cm1 = -1
545
546	LReverseVecUnal:
547	lvsl vp,0,rs // get permute vector to shift left
548	lvx v1,cm1,rs // v1 always looks ahead
549	li cm17,-17
550	beq cr1,2f // no chunks (if no chunks, must be leftover QWs)
551	li cm33,-33
552	li cm49,-49
553	b 1f
554
555	.align 5 // align the inner loops
556	1: // loop over 64-byte chunks
557	lvx v2,cm17,rs
558	lvx v3,cm33,rs
559	lvx v4,cm49,rs
560	subi rs,rs,64
561	vperm vx,v2,v1,vp
562	lvx v1,cm1,rs
563	vperm vy,v3,v2,vp
564	stvx vx,cm1,rd
565	vperm vz,v4,v3,vp
566	stvx vy,cm17,rd
567	vperm vx,v1,v4,vp
568	stvx vz,cm33,rd
569	stvx vx,cm49,rd
570	subi rd,rd,64
571	bdnz 1b
572
573	beq 4f // no leftover quadwords
574	2: // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
575	mtctr r0
576	3: // loop over 1-3 quadwords
577	lvx v2,cm17,rs
578	subi rs,rs,16
579	vperm vx,v2,v1,vp
580	vor v1,v2,v2 // v1 <- v2
581	stvx vx,cm1,rd
582	subi rd,rd,16
583	bdnz 3b
584	4:
585	mtspr vrsave,rv // restore bitmap of live vr's
586	bne cr6,LShortReverse16 // handle last 0-15 bytes iff any
587	blr
588
589	COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0,kCommPageMTCRF)