/*
 * Copyright (c) 2003 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/* =======================================
 * BCOPY, MEMCPY, and MEMMOVE for Mac OS X
 * =======================================
 *
 * Version of 6/11/2003, tuned for the IBM 970.
 *
 * Register usage.  Note the rather delicate way we assign multiple uses
 * to the same register.  Beware.
 *   r0     = temp (NB: cannot use r0 for any constant such as "c16")
 *   r3     = not used, as memcpy and memmove return 1st parameter as a value
 *   r4     = source ptr ("rs")
 *   r5     = count of bytes to move ("rc")
 *   r6     = "w1", "c16", or "cm17"
 *   r7     = "w2", "c32", or "cm33"
 *   r8     = "w3", "c48", or "cm49"
 *   r9     = "w4", or "cm1"
 *   r10    = vrsave ("rv")
 *   r11    = unused
 *   r12    = destination ptr ("rd")
 *   v0     = permute vector ("vp")
 *   v1-v8  = qw's loaded from source
 *   v9-v12 = permuted qw's ("vw", "vx", "vy", and "vz")
 */
#define rs      r4
#define rd      r12
#define rc      r5
#define rv      r10

#define w1      r6
#define w2      r7
#define w3      r8
#define w4      r9

#define c16     r6
#define cm17    r6
#define c32     r7
#define cm33    r7
#define c48     r8
#define cm49    r8
#define cm1     r9

#define vp      v0
#define vw      v9
#define vx      v10
#define vy      v11
#define vz      v12

#include <sys/appleapiopts.h>
#include <ppc/asm.h>
#include <machine/cpu_capabilities.h>
#include <machine/commpage.h>

        .text
/*
 * WARNING: this code is written for 32-bit mode, and ported by the kernel if necessary
 * to 64-bit mode for use in the 64-bit commpage.  This "port" consists of the following
 * simple transformations:
 *      - all word compares are changed to doubleword
 *      - all "srwi[.]" opcodes are changed to "srdi[.]"
 * Nothing else is done.  For this to work, the following rules must be
 * carefully followed:
 *      - do not use carry or overflow
 *      - only use record mode if you are sure the results are mode-invariant
 *        for example, all "andi." and almost all "rlwinm." are fine
 *      - do not use "slwi", "slw", or "srw"
 * An imaginative programmer could break the porting model in other ways, but the above
 * are the most likely problem areas.  It is perhaps surprising how well in practice
 * this simple method works.
 */
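/*
 * For example (an illustration of the rules above, not the kernel's actual port
 * mechanism), the 64-bit commpage copy of this routine rewrites instructions such as
 *      cmplwi  rc,kShort       ->      cmpldi  rc,kShort
 *      cmplw   cr1,w1,rc       ->      cmpld   cr1,w1,rc
 *      srwi    r0,rc,7         ->      srdi    r0,rc,7
 * while "andi.", "rlwinm", and "mtcrf" sequences are left untouched because, as noted
 * above, their results here are mode-invariant.
 */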

#define kShort          64
#define kVeryLong       (128*1024)


// Main entry points.

        .align  5
bcopy_970:                              // void bcopy(const void *src, void *dst, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r4,r3                // must move in reverse if (rd-rs)<rc
        mr      rd,r4                   // move registers to canonic spot
        mr      rs,r3
        blt     LShort                  // handle short operands
        dcbt    0,rs                    // touch in the first line of source
        dcbtst  0,rd                    // touch in destination
        b       LLong1                  // join long operand code

// NB: memmove() must be 8 words past bcopy(), to agree with comm page addresses.

        .align  5
Lmemcpy_970:                            // void* memcpy(void *dst, const void *src, size_t len)
Lmemmove_970:                           // void* memmove(void *dst, const void *src, size_t len)
        cmplwi  rc,kShort               // short or long?
        sub     w1,r3,r4                // must move in reverse if (rd-rs)<rc
        mr      rd,r3                   // must leave r3 alone, it is return value for memcpy etc
        bge     LLong0                  // handle long operands

// Handle short operands.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc
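// (Note on the overlap test used below: w1 = rd-rs is compared against rc as an
// unsigned value, so the "blt" branch is taken precisely when the destination
// begins inside the source region, i.e. 0 <= rd-rs < rc.  Only in that case would
// a forward copy overwrite source bytes before they are read, so only then do we
// move the operand in reverse.)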

LShort:
        cmplw   cr1,w1,rc               // set cr1 blt if we must move reverse
        mtcrf   0x02,rc                 // move length to cr6 and cr7 one at a time
        mtcrf   0x01,rc
        blt--   cr1,LShortReverse

// Forward short operands.  This is the most frequent case, so it is inline.

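// (How the bit tests below decode the length: the two mtcrf instructions above copy
// bits 24-27 of rc into cr6 and bits 28-31 into cr7, so "bf n" tests bit n of the
// length itself.  With bit 31 as the 1's place, bit 26 selects a 32-byte chunk,
// 27 a quadword, 28 a doubleword, 29 a word, 30 a halfword, and 31 the odd byte.)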
        bf      26,0f                   // 32-byte chunk to move?
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
0:
LShort32:
        bf      27,1f                   // quadword to move?
        ld      w1,0(rs)
        ld      w3,8(rs)
        addi    rs,rs,16
        std     w1,0(rd)
        std     w3,8(rd)
        addi    rd,rd,16
1:
LShort16:                               // join here to xfer 0-15 bytes
        bf      28,2f                   // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      30,4f                   // halfword to move?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
4:
        bflr    31                      // skip if no odd byte
        lbz     w1,0(rs)
        stb     w1,0(rd)
        blr


// Handle short reverse operands.
//      cr = length in bits 26-31

LShortReverse:
        add     rs,rs,rc                // adjust ptrs for reverse move
        add     rd,rd,rc
        bf      26,0f                   // 32 bytes to move?
        ld      w1,-8(rs)
        ld      w2,-16(rs)
        ld      w3,-24(rs)
        ldu     w4,-32(rs)
        std     w1,-8(rd)
        std     w2,-16(rd)
        std     w3,-24(rd)
        stdu    w4,-32(rd)
0:
        bf      27,1f                   // quadword to move?
        ld      w1,-8(rs)
        ldu     w2,-16(rs)
        std     w1,-8(rd)
        stdu    w2,-16(rd)
1:
LShortReverse16:                        // join here to xfer 0-15 bytes and return
        bf      28,2f                   // doubleword?
        ldu     w1,-8(rs)
        stdu    w1,-8(rd)
2:
        bf      29,3f                   // word?
        lwzu    w1,-4(rs)
        stwu    w1,-4(rd)
3:
        bf      30,4f                   // halfword to move?
        lhzu    w1,-2(rs)
        sthu    w1,-2(rd)
4:
        bflr    31                      // done if no odd byte
        lbz     w1,-1(rs)               // no update
        stb     w1,-1(rd)
        blr


// Long operands, use Altivec in most cases.
//      rs = source
//      rd = destination
//      rc = count
//      w1 = (rd-rs), must move reverse if (rd-rs)<rc

LLong0:                                 // entry from memmove()
        dcbt    0,rs                    // touch in source
        dcbtst  0,rd                    // touch in destination
LLong1:                                 // entry from bcopy() with operands already touched in
        cmplw   cr1,w1,rc               // set cr1 blt iff we must move reverse
        neg     w3,rd                   // start to compute #bytes to align destination
        rlwinm  w2,w1,0,0xF             // 16-byte aligned? (w2==0 if so)
        andi.   w4,w3,0xF               // w4 <- #bytes to 16-byte align destination
        cmpwi   cr5,w2,0                // set cr5 beq if relatively 16-byte aligned
        blt--   cr1,LLongReverse        // handle reverse moves
        sub     rc,rc,w4                // adjust length for aligning destination
        srwi    r0,rc,7                 // get #cache lines to copy (may be 0)
        cmpwi   cr1,r0,0                // set cr1 on #chunks
        beq     LFwdAligned             // dest is already aligned

// 16-byte align destination.

        mtcrf   0x01,w4                 // cr7 <- #bytes to align dest (nonzero)
        bf      31,1f                   // byte to move?
        lbz     w1,0(rs)
        addi    rs,rs,1
        stb     w1,0(rd)
        addi    rd,rd,1
1:
        bf      30,2f                   // halfword?
        lhz     w1,0(rs)
        addi    rs,rs,2
        sth     w1,0(rd)
        addi    rd,rd,2
2:
        bf      29,3f                   // word?
        lwz     w1,0(rs)
        addi    rs,rs,4
        stw     w1,0(rd)
        addi    rd,rd,4
3:
        bf      28,LFwdAligned          // doubleword?
        ld      w1,0(rs)
        addi    rs,rs,8
        std     w1,0(rd)
        addi    rd,rd,8

// Forward, destination is 16-byte aligned.  There are five cases:
//  1. If the length>=kVeryLong (ie, several pages), then use the
//     "bigcopy" path that pulls out all the stops.  This is the fastest
//     case for cold-cache operands, as any this long will likely be.
//  2. If length>=128 and source is 16-byte aligned, then use the
//     lvx/stvx loop over 128-byte chunks.  This is the fastest
//     case for hot-cache operands, 2nd fastest for cold.
//  3. If length>=128 and source is not 16-byte aligned, then use the
//     lvx/vperm/stvx loop over 128-byte chunks.
//  4. If length<128 and source is 8-byte aligned, then use the
//     ld/std loop over 32-byte chunks.
//  5. If length<128 and source is not 8-byte aligned, then use the
//     lvx/vperm/stvx loop over 32-byte chunks.  This is the slowest case.
// Registers at this point:
//  r0/cr1 = count of cache lines ("chunks") that we'll cover (may be 0)
//      rs = alignment unknown
//      rd = 16-byte aligned
//      rc = bytes remaining
//      w2 = low 4 bits of (rd-rs), used to check alignment
//     cr5 = beq if source is also 16-byte aligned
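// The five cases above map onto the code below roughly as follows:
//      case 1 -> LFwdLongVectors, which diverts to LBigCopy
//      case 2 -> LFwdLongVectors, then LFwdLongAligned
//      case 3 -> LFwdLongVectors, then LFwdLongUnaligned
//      case 4 -> LFwdMedAligned
//      case 5 -> the unlabeled 32-byte lvx/vperm/stvx loop just after LFwdAligned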

LFwdAligned:
        andi.   w3,w2,7                 // is source at least 8-byte aligned?
        mtcrf   0x01,rc                 // move leftover count to cr7 for LShort16
        bne     cr1,LFwdLongVectors     // at least one 128-byte chunk, so use vectors
        srwi    w1,rc,5                 // get 32-byte chunk count
        mtcrf   0x02,rc                 // move bit 27 of length to cr6 for LShort32
        mtctr   w1                      // set up 32-byte loop (w1!=0)
        beq     LFwdMedAligned          // source is 8-byte aligned, so use ld/std loop
        mfspr   rv,vrsave               // get bitmap of live vector registers
        oris    w4,rv,0xFFF8            // we use v0-v12
        li      c16,16                  // get constant used in lvx
        li      c32,32
        mtspr   vrsave,w4               // update mask
        lvx     v1,0,rs                 // prefetch 1st source quadword
        lvsl    vp,0,rs                 // get permute vector to shift left


// Fewer than 128 bytes but not doubleword aligned: use lvx/vperm/stvx.
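// (The unaligned technique used by this loop and the other vperm loops below is the
// standard AltiVec idiom: lvx ignores the low 4 address bits and returns the enclosing
// aligned quadword, lvsl builds a permute control vector from those low 4 bits, and
// each vperm then selects the 16 misaligned source bytes from two adjacent aligned
// quadwords.  The last quadword loaded is carried forward in v1 so it can serve as
// the leading half of the next vperm.)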

1:                                      // loop over 32-byte chunks
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        addi    rs,rs,32
        vperm   vx,v1,v2,vp
        vperm   vy,v2,v3,vp
        vor     v1,v3,v3                // v1 <- v3
        stvx    vx,0,rd
        stvx    vy,c16,rd
        addi    rd,rd,32
        bdnz    1b

        mtspr   vrsave,rv               // restore bitmap of live vr's
        b       LShort32


// Fewer than 128 bytes and doubleword aligned: use ld/std.

        .align  5
LFwdMedAligned:                         // loop over 32-byte chunks
        ld      w1,0(rs)
        ld      w2,8(rs)
        ld      w3,16(rs)
        ld      w4,24(rs)
        addi    rs,rs,32
        std     w1,0(rd)
        std     w2,8(rd)
        std     w3,16(rd)
        std     w4,24(rd)
        addi    rd,rd,32
        bdnz    LFwdMedAligned

        b       LShort32

// Forward, 128 bytes or more: use vectors.  When entered:
//      r0 = 128-byte chunks to move (>0)
//      rd = 16-byte aligned
//     cr5 = beq if source is 16-byte aligned
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
// We set up many registers:
//     ctr = number of 128-byte chunks to move
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of VRSave
// c16,c32,c48 = loaded
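// (Length bookkeeping for the setup below: the remaining count decomposes as
//      rc = 128*(rc>>7) + 16*((rc>>4) & 7) + (rc & 15)
// The 128-byte chunk count was already computed by "srwi r0,rc,7" at LLong1, the
// leftover quadword count comes from "rlwinm. r0,rc,28,29,31", and the final 0-15
// bytes, extracted by "rlwinm w3,rc,0,28,31", are finished off in LShort16.)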

LFwdLongVectors:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        lis     w3,kVeryLong>>16        // cutoff for very-long-operand special case path
        cmplw   cr1,rc,w3               // very long operand?
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3
        bge--   cr1,LBigCopy            // handle big copies separately
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        oris    w4,rv,0xFFF8            // we use v0-v12
        rlwinm. r0,rc,28,29,31          // get number of quadword leftovers (0-7) and set cr0
        li      c16,16                  // get constants used in lvx/stvx
        mtspr   vrsave,w4               // update mask
        li      c32,32
        li      c48,48
        beq     cr5,LFwdLongAligned     // source is also 16-byte aligned, no need for vperm
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,0,rs                 // prefetch 1st source quadword
        b       LFwdLongUnaligned


// Forward, long, unaligned vector loop.

        .align  5                       // align inner loops
LFwdLongUnaligned:                      // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        vperm   vw,v1,v2,vp
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        vperm   vx,v2,v3,vp
        addi    w4,rd,64
        lvx     v1,0,rs
        stvx    vw,0,rd
        vperm   vy,v3,v4,vp
        stvx    vx,c16,rd
        vperm   vz,v4,v5,vp
        stvx    vy,c32,rd
        vperm   vw,v5,v6,vp
        stvx    vz,c48,rd
        vperm   vx,v6,v7,vp
        addi    rd,rd,128
        stvx    vw,0,w4
        vperm   vy,v7,v8,vp
        stvx    vx,c16,w4
        vperm   vz,v8,v1,vp
        stvx    vy,c32,w4
        stvx    vz,c48,w4
        bdnz    LFwdLongUnaligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords
        lvx     v2,c16,rs
        addi    rs,rs,16
        vperm   vx,v1,v2,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Forward, long, 16-byte aligned vector loop.

        .align  5
LFwdLongAligned:                        // loop over 128-byte chunks
        addi    w4,rs,64
        lvx     v1,0,rs
        lvx     v2,c16,rs
        lvx     v3,c32,rs
        lvx     v4,c48,rs
        lvx     v5,0,w4
        lvx     v6,c16,w4
        lvx     v7,c32,w4
        lvx     v8,c48,w4
        addi    rs,rs,128
        addi    w4,rd,64
        stvx    v1,0,rd
        stvx    v2,c16,rd
        stvx    v3,c32,rd
        stvx    v4,c48,rd
        stvx    v5,0,w4
        stvx    v6,c16,w4
        stvx    v7,c32,w4
        stvx    v8,c48,w4
        addi    rd,rd,128
        bdnz    LFwdLongAligned

        beq     4f                      // no leftover quadwords
        mtctr   r0
3:                                      // loop over remaining quadwords (1-7)
        lvx     v1,0,rs
        addi    rs,rs,16
        stvx    v1,0,rd
        addi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShort16            // handle last 0-15 bytes if any
        blr


// Long, reverse moves.
//      rs = source
//      rd = destination
//      rc = count
//     cr5 = beq if relatively 16-byte aligned

LLongReverse:
        add     rd,rd,rc                // point to end of operands
        add     rs,rs,rc
        andi.   r0,rd,0xF               // #bytes to 16-byte align destination
        beq     2f                      // already aligned

// 16-byte align destination.

        mtctr   r0                      // set up for loop
        sub     rc,rc,r0
1:
        lbzu    w1,-1(rs)
        stbu    w1,-1(rd)
        bdnz    1b

// Prepare for reverse vector loop.  When entered:
//      rd = 16-byte aligned
//     cr5 = beq if source also 16-byte aligned
// We set up many registers:
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//     cm1 = -1
//      rv = original value of vrsave

2:
        mfspr   rv,vrsave               // get bitmap of live vector registers
        srwi    r0,rc,6                 // get count of 64-byte chunks to move (may be 0)
        oris    w1,rv,0xFFF8            // we use v0-v12
        mtcrf   0x01,rc                 // prepare for moving last 0-15 bytes in LShortReverse16
        rlwinm  w3,rc,0,28,31           // move last 0-15 byte count to w3 too
        cmpwi   cr1,r0,0                // set cr1 on chunk count
        mtspr   vrsave,w1               // update mask
        mtctr   r0                      // set up loop count
        cmpwi   cr6,w3,0                // set cr6 on leftover byte count
        rlwinm. r0,rc,28,30,31          // get number of quadword leftovers (0-3) and set cr0
        li      cm1,-1                  // get constants used in lvx/stvx

        bne     cr5,LReverseVecUnal     // handle unaligned operands
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm17,-17
        li      cm33,-33
        li      cm49,-49
        b       1f

// Long, reverse 16-byte-aligned vector loop.

        .align  5                       // align inner loops
1:                                      // loop over 64-byte chunks
        lvx     v1,cm1,rs
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        stvx    v1,cm1,rd
        stvx    v2,cm17,rd
        stvx    v3,cm33,rd
        stvx    v4,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over remaining quadwords (1-3)
        lvx     v1,cm1,rs
        subi    rs,rs,16
        stvx    v1,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Long, reverse, unaligned vector loop.
// ctr/cr1 = number of 64-byte chunks to move (may be 0)
//  r0/cr0 = leftover QWs to move
//     cr7 = low 4 bits of rc (ie, leftover byte count 0-15)
//     cr6 = beq if leftover byte count is 0
//      rv = original value of vrsave
//     cm1 = -1

LReverseVecUnal:
        lvsl    vp,0,rs                 // get permute vector to shift left
        lvx     v1,cm1,rs               // v1 always looks ahead
        li      cm17,-17
        beq     cr1,2f                  // no chunks (if no chunks, must be leftover QWs)
        li      cm33,-33
        li      cm49,-49
        b       1f

        .align  5                       // align the inner loops
1:                                      // loop over 64-byte chunks
        lvx     v2,cm17,rs
        lvx     v3,cm33,rs
        lvx     v4,cm49,rs
        subi    rs,rs,64
        vperm   vx,v2,v1,vp
        lvx     v1,cm1,rs
        vperm   vy,v3,v2,vp
        stvx    vx,cm1,rd
        vperm   vz,v4,v3,vp
        stvx    vy,cm17,rd
        vperm   vx,v1,v4,vp
        stvx    vz,cm33,rd
        stvx    vx,cm49,rd
        subi    rd,rd,64
        bdnz    1b

        beq     4f                      // no leftover quadwords
2:                                      // r0=#QWs, rv=vrsave, v1=next QW, cr7=(rc & F), cr6 set on cr7
        mtctr   r0
3:                                      // loop over 1-3 quadwords
        lvx     v2,cm17,rs
        subi    rs,rs,16
        vperm   vx,v2,v1,vp
        vor     v1,v2,v2                // v1 <- v2
        stvx    vx,cm1,rd
        subi    rd,rd,16
        bdnz    3b
4:
        mtspr   vrsave,rv               // restore bitmap of live vr's
        bne     cr6,LShortReverse16     // handle last 0-15 bytes if any
        blr


// Very Big Copy Path.  Save our return address in the stack for help decoding backtraces.
// The conditions bigcopy expects are:
//  r0 = return address (also stored in caller's SF)
//  r4 = source ptr
//  r5 = length (at least several pages)
// r12 = dest ptr
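// (How the mode test below works: "lis r2,0x4000" followed by "add. r2,r2,r2" yields
// 0x80000000.  In 32-bit mode that result is negative, so the add. sets cr0_lt and the
// "blta" takes the 32-bit path; in 64-bit mode the 64-bit result 0x0000000080000000 is
// positive, cr0_lt stays clear, and we fall through to the 64-bit spill of the return
// address before branching absolute to the commpage bigcopy routine.)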

LBigCopy:
        lis     r2,0x4000               // r2 <- 0x40000000
        mflr    r0                      // get our return address
        add.    r2,r2,r2                // set cr0_lt if running in 32-bit mode
        stw     r0,8(r1)                // save return, assuming 32-bit mode ("crsave" if 64-bit mode)
        blta    _COMM_PAGE_BIGCOPY      // 32-bit mode, join big operand copy
        std     r0,16(r1)               // save return in correct spot for 64-bit mode
        ba      _COMM_PAGE_BIGCOPY      // then join big operand code


        COMMPAGE_DESCRIPTOR(bcopy_970,_COMM_PAGE_BCOPY,k64Bit+kHasAltivec,0, \
                kCommPageMTCRF+kCommPageBoth+kPort32to64)