/*
- * Copyright (c) 2000 Apple Computer, Inc. All rights reserved.
+ * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved.
*
- * @APPLE_LICENSE_HEADER_START@
- *
- * Copyright (c) 1999-2003 Apple Computer, Inc. All Rights Reserved.
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
*
* This file contains Original Code and/or Modifications of Original Code
* as defined in and that are subject to the Apple Public Source License
* Version 2.0 (the 'License'). You may not use this file except in
- * compliance with the License. Please obtain a copy of the License at
- * http://www.opensource.apple.com/apsl/ and read it before using this
- * file.
+ * compliance with the License. The rights granted to you under the License
+ * may not be used to create, or enable the creation or redistribution of,
+ * unlawful or unlicensed copies of an Apple operating system, or to
+ * circumvent, violate, or enable the circumvention or violation of, any
+ * terms of an Apple operating system software license agreement.
+ *
+ * Please obtain a copy of the License at
+ * http://www.opensource.apple.com/apsl/ and read it before using this file.
*
* The Original Code and all software distributed under the License are
* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
* Please see the License for the specific language governing rights and
* limitations under the License.
*
- * @APPLE_LICENSE_HEADER_END@
+ * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
*/
/*
* @OSF_COPYRIGHT@
#include <assym.s>
#include <sys/errno.h>
+#define INSTRUMENT 0
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
/*
* void pmap_zero_page(vm_offset_t pa)
*
- * zero a page of physical memory.
+ * Zero a page of physical memory. This routine runs in 32 or 64-bit mode,
+ * and handles 32 and 128-byte cache lines.
*/
-#if DEBUG
- /* C debug stub in pmap.c calls this */
-ENTRY(pmap_zero_page_assembler, TAG_NO_FRAME_USED)
-#else
-ENTRY(pmap_zero_page, TAG_NO_FRAME_USED)
-#endif /* DEBUG */
-
- mfmsr r6 /* Get the MSR */
- rlwinm r6,r6,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off
- rlwinm r6,r6,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off
- rlwinm r7, r6, 0, MSR_DR_BIT+1, MSR_DR_BIT-1 /* Turn off DR */
- rlwinm r7,r7,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; Disable interruptions
- li r4,PPC_PGBYTES-CACHE_LINE_SIZE /* Point to the end of the page */
- mtmsr r7 /* Set MSR to DR off */
- isync /* Ensure data translations are off */
-
-
-.L_phys_zero_loop:
- subic. r5,r4,CACHE_LINE_SIZE /* Point to the next one */
- dcbz r4, r3 /* Clear the whole thing to 0s */
- subi r4,r5,CACHE_LINE_SIZE /* Point to the next one */
- dcbz r5, r3 /* Clear the next to zeros */
- bgt+ .L_phys_zero_loop /* Keep going until we do the page... */
-
- sync /* Make sure they're all done */
- li r4,PPC_PGBYTES-CACHE_LINE_SIZE /* Point to the end of the page */
-
-.L_inst_inval_loop:
- subic. r5,r4,CACHE_LINE_SIZE /* Point to the next one */
- icbi r4, r3 /* Clear the whole thing to 0s */
- subi r4,r5,CACHE_LINE_SIZE /* Point to the next one */
- icbi r5, r3 /* Clear the next to zeros */
- bgt+ .L_inst_inval_loop /* Keep going until we do the page... */
-
- sync /* Make sure they're all done */
-
- mtmsr r6 /* Restore original translations */
- isync /* Ensure data translations are on */
- blr
+ .align 5
+ .globl EXT(pmap_zero_page)
+
+LEXT(pmap_zero_page)
+
+ mflr r12 // save return address across the call below
+ bl EXT(ml_set_physical_disabled) // turn DR and EE off, SF on, get features in r10
+ mtlr r12 // restore return address
+ andi. r9,r10,pf32Byte+pf128Byte // r9 <- cache line size (presumably the feature bits equal 32 and 128 -- TODO confirm)
+
+ subfic r4,r9,PPC_PGBYTES // r4 <- PPC_PGBYTES - line size = offset of last line in page
+
+ bt++ pf64Bitb,page0S4 // 64-bit processor: form the page address with a 64-bit shift
+
+ slwi r3,r3,12 // 32-bit: get page address from page num
+ b page_zero_1 // Jump to line aligned loop...
+ .align 5
+
+ nop // 7 nops + the sldi below fill 8 instruction slots,
+ nop // so page_zero_1 starts on the next 32-byte boundary
+ nop
+ nop
+ nop
+ nop
+ nop
+
+page0S4:
+ sldi r3,r3,12 // 64-bit: get page address from page num
+
+page_zero_1: // loop zeroing cache lines, two per pass, from the end of the page down
+ sub. r5,r4,r9 // r5 <- offset of the next lower line; cr0 eq when that is offset 0
+ dcbz128 r3,r4 // zero either 32 or 128 bytes
+ sub r4,r5,r9 // generate offset for the next pass
+ dcbz128 r3,r5 // zero the second line of this pass
+ bne-- page_zero_1 // done once the line at offset 0 has been zeroed
+
+ b EXT(ml_restore) // restore MSR and do the isync
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
/* void
* phys_copy(src, dst, bytecount)
- * vm_offset_t src;
- * vm_offset_t dst;
+ * addr64_t src;
+ * addr64_t dst;
* int bytecount
*
* This routine will copy bytecount bytes from physical address src to physical
- * address dst.
+ * address dst. It runs in 64-bit mode if necessary, but does not handle
+ * overlap or make any attempt to be optimal. Length must be a signed word.
+ * Not performance critical.
*/
-ENTRY(phys_copy, TAG_NO_FRAME_USED)
-
- /* Switch off data translations */
- mfmsr r6
- rlwinm r6,r6,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off
- rlwinm r6,r6,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off
- rlwinm r7, r6, 0, MSR_DR_BIT+1, MSR_DR_BIT-1
- rlwinm r7, r7, 0, MSR_EE_BIT+1, MSR_EE_BIT-1
- mtmsr r7
- isync /* Ensure data translations are off */
-
- subi r3, r3, 4
- subi r4, r4, 4
-
- cmpwi r5, 3
- ble- .L_phys_copy_bytes
-.L_phys_copy_loop:
- lwz r0, 4(r3)
- addi r3, r3, 4
- subi r5, r5, 4
- stw r0, 4(r4)
- addi r4, r4, 4
- cmpwi r5, 3
- bgt+ .L_phys_copy_loop
-
- /* If no leftover bytes, we're done now */
- cmpwi r5, 0
- beq+ .L_phys_copy_done
-
-.L_phys_copy_bytes:
- addi r3, r3, 3
- addi r4, r4, 3
-.L_phys_copy_byte_loop:
- lbz r0, 1(r3)
- addi r3, r3, 1
- subi r5, r5, 1
- stb r0, 1(r4)
- addi r4, r4, 1
- cmpwi r5, 0
- bne+ .L_phys_copy_byte_loop
-
-.L_phys_copy_done:
- mtmsr r6 /* Restore original translations */
- isync /* Ensure data translations are off */
-
- blr
+ .align 5
+ .globl EXT(phys_copy)
+
+LEXT(phys_copy)
+
+ rlwinm r3,r3,0,1,0 ; Duplicate high half of long long src paddr (r3) into top of reg
+ mflr r12 // get return address
+ rlwimi r3,r4,0,0,31 ; Combine low half (r4) to give full 64-bit src in r3
+ rlwinm r4,r5,0,1,0 ; Duplicate high half of long long dst paddr (r5) into top of reg
+ bl EXT(ml_set_physical_disabled) // turn DR and EE off, SF on, get features in r10 (r3-r7 are used afterwards, so the callee must preserve them)
+ rlwimi r4,r6,0,0,31 ; Combine low half (r6) to give full 64-bit dst in r4
+ mtlr r12 // restore return address
+ subic. r5,r7,4 // r5 <- bytecount - 4; a word to copy?
+ b phys_copy_2 // enter the word loop at its test
+
+ .align 5
+
+phys_copy_1: // loop copying words
+ subic. r5,r5,4 // more to go?
+ lwz r0,0(r3) // copy one word from src to dst
+ addi r3,r3,4
+ stw r0,0(r4)
+ addi r4,r4,4
+phys_copy_2:
+ bge phys_copy_1 // another full word remains
+ addic. r5,r5,4 // restore count (0..3 leftover bytes, or <= 0 if none)
+ ble phys_copy_4 // no more
+
+ // Loop is aligned here
+
+phys_copy_3: // loop copying bytes
+ subic. r5,r5,1 // more to go?
+ lbz r0,0(r3) // copy one byte from src to dst
+ addi r3,r3,1
+ stb r0,0(r4)
+ addi r4,r4,1
+ bgt phys_copy_3 // loop while leftover bytes remain
+phys_copy_4:
+ b EXT(ml_restore) // restore MSR and do the isync
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
/* void
* pmap_copy_page(src, dst)
- * vm_offset_t src;
- * vm_offset_t dst;
+ * ppnum_t src;
+ * ppnum_t dst;
*
* This routine will copy the physical page src to physical page dst
*
- * This routine assumes that the src and dst are page aligned and that the
- * destination is cached.
- *
- * We also must assume that noone will be executing within the destination
- * page. We also assume that this will be used for paging
+ * This routine assumes that the src and dst are page numbers and that the
+ * destination is cached. It runs on 32 and 64 bit processors, with and
+ * without altivec, and with 32 and 128 byte cache lines.
+ * We also must assume that no-one will be executing within the destination
+ * page, and that this will be used for paging. Because this
+ * is a common routine, we have tuned loops for each processor class.
*
*/
+#define kSFSize (FM_SIZE+160)
-#if DEBUG
- /* if debug, we have a little piece of C around this
- * in pmap.c that gives some trace ability
- */
-ENTRY(pmap_copy_page_assembler, TAG_NO_FRAME_USED)
-#else
ENTRY(pmap_copy_page, TAG_NO_FRAME_USED)
-#endif /* DEBUG */
-#if 0
- mfpvr r9 ; Get the PVR
- rlwinm r9,r9,16,16,31 ; Isolate the PPC processor
- cmplwi r9,PROCESSOR_VERSION_Max ; Do we have Altivec?
- beq+ wegotaltivec ; Yeah...
-#endif
-
- mfmsr r9 ; Get the MSR
- rlwinm r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off
- rlwinm r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off
- stwu r1,-(FM_SIZE+32)(r1) ; Make a frame for us
- rlwinm r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; Disable interruptions
- ori r7,r7,lo16(MASK(MSR_FP)) ; Turn on the FPU
- mtmsr r7 ; Disable rupts and enable FPU
- isync
-
- stfd f0,FM_SIZE+0(r1) ; Save an FP register
- rlwinm r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1 ; Clear the DDAT bit
- stfd f1,FM_SIZE+8(r1) ; Save an FP register
- addi r6,r3,PPC_PGBYTES ; Point to the start of the next page
- stfd f2,FM_SIZE+16(r1) ; Save an FP register
- mr r8,r4 ; Save the destination
- stfd f3,FM_SIZE+24(r1) ; Save an FP register
-
- mtmsr r7 ; Set the new MSR
- isync ; Ensure data translations are off
-
- dcbt br0, r3 /* Start in first input line */
- li r5, CACHE_LINE_SIZE /* Get the line size */
-
-.L_pmap_copy_page_loop:
- dcbz 0, r4 /* Allocate a line for the output */
- lfd f0, 0(r3) /* Get first 8 */
- lfd f1, 8(r3) /* Get second 8 */
- lfd f2, 16(r3) /* Get third 8 */
- stfd f0, 0(r4) /* Put first 8 */
- dcbt r5, r3 /* Start next line coming in */
- lfd f3, 24(r3) /* Get fourth 8 */
- stfd f1, 8(r4) /* Put second 8 */
- addi r3,r3,CACHE_LINE_SIZE /* Point to the next line in */
- stfd f2, 16(r4) /* Put third 8 */
- cmplw cr0,r3,r6 /* See if we're finished yet */
- stfd f3, 24(r4) /* Put fourth 8 */
- dcbst br0,r4 /* Force it out */
- addi r4,r4,CACHE_LINE_SIZE /* Point to the next line out */
- blt+ .L_pmap_copy_page_loop /* Copy the whole page */
-
- sync /* Make sure they're all done */
- li r4,PPC_PGBYTES-CACHE_LINE_SIZE /* Point to the end of the page */
-
-invalinst:
- subic. r5,r4,CACHE_LINE_SIZE /* Point to the next one */
- icbi r4, r8 /* Trash the i-cache */
- subi r4,r5,CACHE_LINE_SIZE /* Point to the next one */
- icbi r5, r8 /* Trash the i-cache */
- bgt+ invalinst /* Keep going until we do the page... */
-
- rlwimi r7,r9,0,MSR_DR_BIT,MSR_DR_BIT ; Set DDAT if on
- sync ; Make sure all invalidates done
-
- mtmsr r7 ; Set DDAT correctly
- isync
-
- lfd f0,FM_SIZE+0(r1) ; Restore an FP register
- lfd f1,FM_SIZE+8(r1) ; Restore an FP register
- lfd f2,FM_SIZE+16(r1) ; Restore an FP register
- lfd f3,FM_SIZE+24(r1) ; Restore an FP register
-
- lwz r1,0(r1) ; Pop up the stack
-
- mtmsr r9 ; Turn off FPU now and maybe rupts back on
- isync
- blr
-
-#if 0
-;
-; This is not very optimal. We just do it here for a test of
-; Altivec in the kernel.
-;
-wegotaltivec:
- mfmsr r9 ; Get the MSR
- lis r8,hi16(0xC0000000) ; Make sure we keep the first 2 vector registers
- rlwinm r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; Disable interruptions
- lis r6,lo16(2*256+128) ; Specify 128 blocks of 2 vectors each
- rlwinm r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1 ; Clear the DDAT bit
- ori r6,r6,32 ; Set a 32-byte stride
- mtsprg 256,r8 ; Set VRSave
- mtmsr r7 ; Disable rupts and turn xlate off
- isync
-
- addi r11,r3,4096 ; Point to the next page
- li r10,16 ; Get vector size
-
-avmovepg: lvxl v0,br0,r3 ; Get first half of line
- dcba br0,r4 ; Allocate output
- lvxl v1,r10,r3 ; Get second half of line
- stvxl v0,br0,r4 ; Save first half of line
- addi r3,r3,32 ; Point to the next line
- icbi br0,r4 ; Make the icache go away also
- stvxl v1,r10,r4 ; Save second half of line
- cmplw r3,r11 ; Have we reached the next page?
- dcbst br0,r4 ; Make sure the line is on its way out
- addi r4,r4,32 ; Point to the next line
- blt+ avmovepg ; Move the next line...
-
- li r8,0 ; Clear this
- sync ; Make sure all the memory stuff is done
- mtsprg 256,r8 ; Show we are not using VRs any more
- mtmsr r9 ; Translation and interruptions back on
- isync
- blr
-#endif
+ lis r2,hi16(MASK(MSR_VEC)) ; Get the vector flag
+ mflr r0 // get return
+ ori r2,r2,lo16(MASK(MSR_FP)) ; Add the FP flag
+ stw r0,8(r1) // save return address in caller's frame (reloaded at pmap_g4_restore)
+ stwu r1,-kSFSize(r1) // set up a stack frame for VRs or FPRs
+ mfmsr r11 // get MSR at entry
+ mfsprg r10,2 // get feature flags
+ andc r11,r11,r2 // r11 <- entry MSR with VEC and FP cleared (restored on exit)
+ ori r2,r2,lo16(MASK(MSR_EE)) // r2 <- VEC | FP | EE mask
+ andc r2,r11,r2 // r2 <- entry MSR with VEC, FP and EE cleared
+ mtcrf 0x02,r10 // we need to test pf64Bit
+ ori r2,r2,MASK(MSR_FP) // must enable FP for G3...
+ mtcrf 0x80,r10 // we need to test pfAltivec too
+ oris r2,r2,hi16(MASK(MSR_VEC)) // enable altivec for G4 (ignored if G3)
+ mtmsr r2 // turn EE off, FP and VEC on
+ isync
+ bt++ pf64Bitb,pmap_copy_64 // skip if 64-bit processor (only they take hint)
+ slwi r3,r3,12 // get source page address from page num
+ slwi r4,r4,12 // get destination page address from page num
+ rlwinm r12,r2,0,MSR_DR_BIT+1,MSR_DR_BIT-1 // r12 <- r2 with DR cleared; get ready to turn off DR
+ bt pfAltivecb,pmap_copy_g4 // altivec but not 64-bit means G4
+
+
+ // G3 -- copy using FPRs
+
+ stfd f0,FM_SIZE+0(r1) // save the 4 FPRs we use to copy
+ stfd f1,FM_SIZE+8(r1)
+ li r5,PPC_PGBYTES/32 // count of 32-byte cache lines in a page
+ stfd f2,FM_SIZE+16(r1)
+ mtctr r5 // loop count into CTR
+ stfd f3,FM_SIZE+24(r1)
+ mtmsr r12 // turn off DR after saving FPRs on stack
+ isync
+
+pmap_g3_copy_loop: // loop over 32-byte cache lines
+ dcbz 0,r4 // avoid read of dest line
+ lfd f0,0(r3) // load one 32-byte source line into f0-f3
+ lfd f1,8(r3)
+ lfd f2,16(r3)
+ lfd f3,24(r3)
+ addi r3,r3,32 // advance source ptr
+ stfd f0,0(r4) // store the line to the destination
+ stfd f1,8(r4)
+ stfd f2,16(r4)
+ stfd f3,24(r4)
+ dcbst 0,r4 // flush dest line to RAM
+ addi r4,r4,32 // advance dest ptr
+ bdnz pmap_g3_copy_loop
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r6,PPC_PGBYTES-32 // point to last line in page
+pmap_g3_icache_flush: // invalidate dest page in icache, two lines per pass, from the end down
+ subic. r5,r6,32 // more to go?
+ icbi r4,r6 // flush another line in icache
+ subi r6,r5,32 // get offset to next line
+ icbi r4,r5 // flush the second line of this pass
+ bne pmap_g3_icache_flush // done once offset 0 has been flushed
+
+ sync
+ mtmsr r2 // turn DR back on (r2 = MSR image with EE off, FP and VEC on)
+ isync
+ lfd f0,FM_SIZE+0(r1) // restore the FPRs
+ lfd f1,FM_SIZE+8(r1)
+ lfd f2,FM_SIZE+16(r1)
+ lfd f3,FM_SIZE+24(r1)
+
+ b pmap_g4_restore // restore MSR and done
+
+
+ // G4 -- copy using VRs
+
+pmap_copy_g4: // r2=(MSR-EE), r12=(r2-DR), r10=features, r11=old MSR
+ la r9,FM_SIZE+16(r1) // r9 <- base of VR save area in our frame
+ li r5,16 // load x-form offsets into r5-r9
+ li r6,32 // another offset
+ stvx v0,0,r9 // save some VRs so we can use to copy
+ li r7,48 // another offset
+ stvx v1,r5,r9
+ li r0,PPC_PGBYTES/64 // we loop over 64-byte chunks
+ stvx v2,r6,r9
+ mtctr r0 // loop count into CTR
+ li r8,96 // get look-ahead for touch
+ stvx v3,r7,r9
+ li r9,128 // second look-ahead offset (r9 is no longer needed as save-area base)
+ mtmsr r12 // now we've saved VRs on stack, turn off DR
+ isync // wait for it to happen
+ b pmap_g4_copy_loop
+
+ .align 5 // align inner loops
+pmap_g4_copy_loop: // loop over 64-byte chunks
+ dcbt r3,r8 // touch 3 lines ahead
+ nop // avoid a 17-word loop...
+ dcbt r3,r9 // touch 4 lines ahead
+ nop // more padding
+ dcba 0,r4 // avoid pre-fetch of 1st dest line
+ lvx v0,0,r3 // offset 0
+ lvx v1,r5,r3 // offset 16
+ lvx v2,r6,r3 // offset 32
+ lvx v3,r7,r3 // offset 48
+ addi r3,r3,64 // advance source ptr
+ dcba r6,r4 // avoid pre-fetch of 2nd line
+ stvx v0,0,r4 // offset 0
+ stvx v1,r5,r4 // offset 16
+ stvx v2,r6,r4 // offset 32
+ stvx v3,r7,r4 // offset 48
+ dcbf 0,r4 // push line 1
+ dcbf r6,r4 // and line 2
+ addi r4,r4,64 // advance dest ptr
+ bdnz pmap_g4_copy_loop
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r8,PPC_PGBYTES-32 // point to last line in page
+pmap_g4_icache_flush: // invalidate dest page in icache, two lines per pass, from the end down
+ subic. r9,r8,32 // more to go?
+ icbi r4,r8 // flush from icache
+ subi r8,r9,32 // get offset to next line
+ icbi r4,r9 // flush the second line of this pass
+ bne pmap_g4_icache_flush // done once offset 0 has been flushed
+
+ sync
+ mtmsr r2 // turn DR back on
+ isync
+ la r9,FM_SIZE+16(r1) // get base of VR save area
+ lvx v0,0,r9 // restore the VRs (r5-r7 still hold offsets 16/32/48)
+ lvx v1,r5,r9
+ lvx v2,r6,r9
+ lvx v3,r7,r9
+
+pmap_g4_restore: // r11=MSR; common exit for all pmap_copy_page paths
+ mtmsr r11 // restore entry MSR image: EE as at entry, VEC and FP off
+ isync // wait for it to happen
+ addi r1,r1,kSFSize // pop off our stack frame
+ lwz r0,8(r1) // restore return address
+ mtlr r0
+ blr
+
+
+ // 64-bit/128-byte processor: copy using VRs
+
+pmap_copy_64: // r10=features, r11=old MSR
+ sldi r3,r3,12 // get source page address from page num
+ sldi r4,r4,12 // get destination page address from page num
+ la r9,FM_SIZE+16(r1) // get base of VR save area
+ li r5,16 // load x-form offsets into r5-r7
+ li r6,32 // another offset
+ bf pfAltivecb,pmap_novmx_copy // altivec suppressed...
+ stvx v0,0,r9 // save 8 VRs so we can copy without bubbles
+ stvx v1,r5,r9
+ li r7,48 // another offset
+ li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks
+ stvx v2,r6,r9
+ stvx v3,r7,r9
+ addi r9,r9,64 // advance base ptr so we can store another 4
+ mtctr r0 // loop count into CTR
+ li r0,MASK(MSR_DR) // get DR bit
+ stvx v4,0,r9
+ stvx v5,r5,r9
+ andc r12,r2,r0 // r12 <- r2 with DR bit off
+ li r0,1 // get a 1 to slam into SF
+ stvx v6,r6,r9
+ stvx v7,r7,r9
+ rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0)
+ li r8,-128 // offset so we can reach back one line
+ mtmsrd r12 // now we've saved VRs, turn DR off and SF on
+ isync // wait for it to happen
+ dcbt128 0,r3,1 // start a forward stream
+ b pmap_64_copy_loop
+
+ .align 5 // align inner loops
+pmap_64_copy_loop: // loop over 128-byte chunks
+ dcbz128 0,r4 // avoid read of destination line
+ lvx v0,0,r3 // offset 0
+ lvx v1,r5,r3 // offset 16
+ lvx v2,r6,r3 // offset 32
+ lvx v3,r7,r3 // offset 48
+ addi r3,r3,64 // don't have enough GPRs so add 64 2x
+ lvx v4,0,r3 // offset 64
+ lvx v5,r5,r3 // offset 80
+ lvx v6,r6,r3 // offset 96
+ lvx v7,r7,r3 // offset 112
+ addi r3,r3,64 // source ptr now at next 128-byte line
+ stvx v0,0,r4 // offset 0
+ stvx v1,r5,r4 // offset 16
+ stvx v2,r6,r4 // offset 32
+ stvx v3,r7,r4 // offset 48
+ addi r4,r4,64 // same two-step advance for the dest ptr
+ stvx v4,0,r4 // offset 64
+ stvx v5,r5,r4 // offset 80
+ stvx v6,r6,r4 // offset 96
+ stvx v7,r7,r4 // offset 112
+ addi r4,r4,64 // dest ptr now at next 128-byte line
+ dcbf r8,r4 // flush the line we just wrote (r8 = -128 reaches back to it)
+ bdnz pmap_64_copy_loop
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r8,PPC_PGBYTES-128 // point to last line in page
+pmap_64_icache_flush: // invalidate dest page in icache, two lines per pass, from the end down
+ subic. r9,r8,128 // more to go?
+ icbi r4,r8 // flush from icache
+ subi r8,r9,128 // get offset to next line
+ icbi r4,r9 // flush the second line of this pass
+ bne pmap_64_icache_flush // done once offset 0 has been flushed
+
+ sync
+ mtmsrd r2 // turn DR back on, SF off
+ isync
+ la r9,FM_SIZE+16(r1) // get base address of VR save area on stack
+ lvx v0,0,r9 // restore the VRs (r5-r7 still hold offsets 16/32/48)
+ lvx v1,r5,r9
+ lvx v2,r6,r9
+ lvx v3,r7,r9
+ addi r9,r9,64 // advance to the second group of four saved VRs
+ lvx v4,0,r9
+ lvx v5,r5,r9
+ lvx v6,r6,r9
+ lvx v7,r7,r9
+
+ b pmap_g4_restore // restore lower half of MSR and return
+
+ //
+ // Copy on 64-bit without VMX
+ //
+
+pmap_novmx_copy: // 64-bit copy through GPRs; r3/r4 = source/dest page addresses, r2 = MSR image to restore
+ li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks
+ mtctr r0 // loop count into CTR
+ li r0,MASK(MSR_DR) // get DR bit
+ andc r12,r2,r0 // r12 <- r2 with DR bit off
+ li r0,1 // get a 1 to slam into SF
+ rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0)
+ mtmsrd r12 // nothing to save on this path (GPR copy); turn DR off and SF on
+ isync // wait for it to happen
+ dcbt128 0,r3,1 // start a forward stream
+
+pmap_novmx_copy_loop: // loop over 128-byte cache lines
+ dcbz128 0,r4 // avoid read of dest line
+
+ ld r0,0(r3) // Load half a line
+ ld r12,8(r3) // r12 (MSR image) is dead after the mtmsrd above, reuse as data reg
+ ld r5,16(r3)
+ ld r6,24(r3)
+ ld r7,32(r3)
+ ld r8,40(r3)
+ ld r9,48(r3)
+ ld r10,56(r3) // r10 (feature flags) is reused as a data reg from here on
+
+ std r0,0(r4) // Store half a line
+ std r12,8(r4)
+ std r5,16(r4)
+ std r6,24(r4)
+ std r7,32(r4)
+ std r8,40(r4)
+ std r9,48(r4)
+ std r10,56(r4)
+
+ ld r0,64(r3) // Load half a line
+ ld r12,72(r3)
+ ld r5,80(r3)
+ ld r6,88(r3)
+ ld r7,96(r3)
+ ld r8,104(r3)
+ ld r9,112(r3)
+ ld r10,120(r3)
+
+ addi r3,r3,128 // advance source ptr to next line
+
+ std r0,64(r4) // Store half a line
+ std r12,72(r4)
+ std r5,80(r4)
+ std r6,88(r4)
+ std r7,96(r4)
+ std r8,104(r4)
+ std r9,112(r4)
+ std r10,120(r4)
+
+ dcbf 0,r4 // flush the line we just wrote
+ addi r4,r4,128 // advance dest ptr to next line
+ bdnz pmap_novmx_copy_loop
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r8,PPC_PGBYTES-128 // point to last line in page
+
+pmap_novmx_icache_flush: // invalidate dest page in icache, two lines per pass, from the end down
+ subic. r9,r8,128 // more to go?
+ icbi r4,r8 // flush from icache
+ subi r8,r9,128 // get offset to next line
+ icbi r4,r9 // flush the second line of this pass
+ bne pmap_novmx_icache_flush // done once offset 0 has been flushed
+
+ sync
+ mtmsrd r2 // turn DR back on, SF off
+ isync
+
+ b pmap_g4_restore // restore lower half of MSR and return
+
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// Stack frame format used by copyin, copyout, copyinstr and copyoutstr.
+// These routines all run both on 32 and 64-bit machines, though because they are called
+// by the BSD kernel they are always in 32-bit mode when entered. The mapped ptr returned
+// by MapUserMemoryWindow will be 64 bits however on 64-bit machines. Take care to avoid
+// using compare instructions on this ptr. This mapped ptr is kept globally in r31, so there
+// is no need to store or load it, which are mode-dependent operations since it could be
+// 32 or 64 bits.
+
+#define kkFrameSize (FM_SIZE+32)
+
+#define kkBufSize (FM_SIZE+0)
+#define kkCR3 (FM_SIZE+4)
+#define kkSource (FM_SIZE+8)
+#define kkDest (FM_SIZE+12)
+#define kkCountPtr (FM_SIZE+16)
+#define kkR31Save (FM_SIZE+20)
+#define kkThrErrJmp (FM_SIZE+24)
+
+
+// nonvolatile CR bits we use as flags in cr3
+// (note that kkString and kkZero are both defined as bit 15)
+
+#define kk64bit 12
+#define kkNull 13
+#define kkIn 14
+#define kkString 15
+#define kkZero 15
-
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
/*
* int
- * copyin(src, dst, count)
- * vm_offset_t src;
- * vm_offset_t dst;
- * int count;
+ * copyoutstr(src, dst, maxcount, count)
+ * vm_offset_t src; // r3
+ * addr64_t dst; // r4 and r5
+ * vm_size_t maxcount; // r6
+ * vm_size_t* count; // r7
*
+ * Set *count to the number of bytes copied.
*/
-ENTRY2(copyin, copyinmsg, TAG_NO_FRAME_USED)
-
-/* Preamble allowing us to call a sub-function */
- mflr r0
- stw r0,FM_LR_SAVE(r1)
- stwu r1,-(FM_SIZE+16)(r1)
-
- cmpli cr0,r5,0
- ble- cr0,.L_copyinout_trivial
+ENTRY(copyoutstr, TAG_NO_FRAME_USED)
+ mfcr r2,0x10 // save caller's cr3, which we use for flags
+ mr r10,r4 // move high word of 64-bit user address to r10
+ li r0,0 // zero, stored through the count ptr below
+ crset kkString // flag as a string op
+ mr r11,r5 // move low word of 64-bit user address to r11
+ stw r0,0(r7) // initialize #bytes moved (*count = 0)
+ crclr kkIn // flag as copyout
+ b copyJoin // join common setup (copyJoin also clears kkNull)
-/* we know we have a valid copyin to do now */
-/* Set up thread_recover in case we hit an illegal address */
-
- mfsprg r8,1 /* Get the current act */
- lwz r10,ACT_THREAD(r8)
- lis r11,hi16(.L_copyinout_error)
- lwz r8,ACT_VMMAP(r8)
- ori r11,r11,lo16(.L_copyinout_error)
- add r9,r3,r5 /* Get the end of the source */
- lwz r8,VMMAP_PMAP(r8) ; Get the pmap
- rlwinm r12,r3,6,26,29 ; Get index to the segment slot
- subi r9,r9,1 /* Make sure we don't go too far */
- add r8,r8,r12 ; Start indexing to the segment value
- stw r11,THREAD_RECOVER(r10)
- xor r9,r9,r3 /* Smoosh 'em together */
- lwz r8,PMAP_SEGS(r8) ; Get the source SR value
- rlwinm. r9,r9,0,1,3 /* Top nybble equal? */
- mtsr SR_COPYIN,r8 ; Set the SR
- isync
-#if 0
- lis r0,HIGH_ADDR(EXT(dbgRegsCall)) /* (TEST/DEBUG) */
- ori r0,r0,LOW_ADDR(EXT(dbgRegsCall)) /* (TEST/DEBUG) */
- sc /* (TEST/DEBUG) */
-#endif
-
-/* For optimization, we check if the copyin lies on a segment
- * boundary. If it doesn't, we can use a simple copy. If it
- * does, we split it into two separate copies in some C code.
- */
-
- bne- .L_call_copyin_multiple /* Nope, we went past the segment boundary... */
- rlwinm r3,r3,0,4,31
- oris r3,r3,(SR_COPYIN_NUM << (28-16)) /* Set the copyin segment as the source */
-
- bl EXT(bcopy)
-
-/* Now that copyin is done, we don't need a recovery point */
-
- addi r1,r1,FM_SIZE+16
- mfsprg r6,1 /* Get the current act */
- lwz r10,ACT_THREAD(r6)
- li r3,0
- lwz r0,FM_LR_SAVE(r1)
- stw r3,THREAD_RECOVER(r10) /* Clear recovery */
- mtlr r0
- blr
-
-/* we get here via the exception handler if an illegal
- * user memory reference was made.
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+/*
+ * int
+ * copyinstr(src, dst, maxcount, count)
+ * addr64_t src; // r3 and r4
+ * vm_offset_t dst; // r5
+ * vm_size_t maxcount; // r6
+ * vm_size_t* count; // r7
+ *
+ * Set *count to the number of bytes copied
+ * If dst == NULL, don't copy, just count bytes.
+ * Only currently called from klcopyinstr.
*/
-.L_copyinout_error:
-
-/* Now that copyin is done, we don't need a recovery point */
-
- mfsprg r6,1 /* Get the current act */
- addi r1,r1,FM_SIZE+16
- lwz r10,ACT_THREAD(r6)
- li r4,0
- lwz r0,FM_LR_SAVE(r1)
- stw r4,THREAD_RECOVER(r10) /* Clear recovery */
- mtlr r0
- li r3,EFAULT ; Indicate error (EFAULT)
- blr
-.L_copyinout_trivial:
- /* The copyin/out was for either 0 bytes or a negative
- * number of bytes, return an appropriate value (0 == SUCCESS).
- * cr0 still contains result of comparison of len with 0.
- */
- li r3, 0
- beq+ cr0, .L_copyinout_negative
- li r3, 1
-.L_copyinout_negative:
+ENTRY(copyinstr, TAG_NO_FRAME_USED)
+ mfcr r2,0x10 // save caller's cr3, which we use for flags
+ cmplwi r5,0 // dst==NULL?
+ mr r10,r3 // move high word of 64-bit user address to r10
+ li r0,0 // zero, stored through the count ptr below
+ crset kkString // flag as a string op
+ mr r11,r4 // move low word of 64-bit user address to r11
+ crmove kkNull,cr0_eq // remember if (dst==NULL)
+ stw r0,0(r7) // initialize #bytes moved (*count = 0)
+ crset kkIn // flag as copyin (rather than copyout)
+ b copyJoin1 // skip over the "crclr kkNull" so the flag set above survives
- /* unwind the stack */
- addi r1, r1, FM_SIZE+16
- lwz r0, FM_LR_SAVE(r1)
- mtlr r0
- blr
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+/*
+ * int
+ * copyout(src, dst, count)
+ * vm_offset_t src; // r3
+ * addr64_t dst; // r4 and r5
+ * size_t count; // r6
+ */
-.L_call_copyin_multiple:
+ .align 5
+ .globl EXT(copyout)
+ .globl EXT(copyoutmsg)
- /* unwind the stack */
- addi r1, r1, FM_SIZE+16
- lwz r0, FM_LR_SAVE(r1)
- mtlr r0
+LEXT(copyout)
+LEXT(copyoutmsg)
- b EXT(copyin_multiple) /* not a call - a jump! */
+#if INSTRUMENT
+ mfspr r12,pmc1 ; INSTRUMENT - saveinstr[12] - Take stamp at copyout
+ stw r12,0x6100+(12*16)+0x0(0) ; INSTRUMENT - Save it
+ mfspr r12,pmc2 ; INSTRUMENT - Get stamp
+ stw r12,0x6100+(12*16)+0x4(0) ; INSTRUMENT - Save it
+ mfspr r12,pmc3 ; INSTRUMENT - Get stamp
+ stw r12,0x6100+(12*16)+0x8(0) ; INSTRUMENT - Save it
+ mfspr r12,pmc4 ; INSTRUMENT - Get stamp
+ stw r12,0x6100+(12*16)+0xC(0) ; INSTRUMENT - Save it
+#endif
+ mfcr r2,0x10 // save caller's cr3, which we use for flags
+ mr r10,r4 // move high word of 64-bit user address (dst) to r10
+ crclr kkString // not a string version
+ mr r11,r5 // move low word of 64-bit user address (dst) to r11
+ crclr kkIn // flag as copyout
+ b copyJoin // join common setup (copyJoin also clears kkNull)
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
/*
* int
- * copyout(src, dst, count)
- * vm_offset_t src;
- * vm_offset_t dst;
- * int count;
- *
+ * copyin(src, dst, count)
+ * addr64_t src; // r3 and r4
+ * vm_offset_t dst; // r5
+ * size_t count; // r6
*/
-ENTRY2(copyout, copyoutmsg, TAG_NO_FRAME_USED)
-/* Preamble allowing us to call a sub-function */
+ .align 5
+ .globl EXT(copyin)
+ .globl EXT(copyinmsg)
- mflr r0
- stw r0,FM_LR_SAVE(r1)
- stwu r1,-(FM_SIZE+16)(r1)
-
-#if 0
- stw r3,FM_SIZE+0(r1) /* (TEST/DEBUG) */
- stw r4,FM_SIZE+4(r1) /* (TEST/DEBUG) */
- stw r5,FM_SIZE+8(r1) /* (TEST/DEBUG) */
- mr r6,r0 /* (TEST/DEBUG) */
-
- bl EXT(tracecopyout) /* (TEST/DEBUG) */
-
- lwz r3,FM_SIZE+0(r1) /* (TEST/DEBUG) */
- lwz r4,FM_SIZE+4(r1) /* (TEST/DEBUG) */
- lwz r5,FM_SIZE+8(r1) /* (TEST/DEBUG) */
-#endif
-
- cmpli cr0,r5,0
- ble- cr0,.L_copyinout_trivial
-/* we know we have a valid copyout to do now */
-/* Set up thread_recover in case we hit an illegal address */
-
+LEXT(copyin)
+LEXT(copyinmsg)
- mfsprg r8,1 /* Get the current act */
- lwz r10,ACT_THREAD(r8)
- lis r11,HIGH_ADDR(.L_copyinout_error)
- lwz r8,ACT_VMMAP(r8)
- rlwinm r12,r4,6,26,29 ; Get index to the segment slot
- ori r11,r11,LOW_ADDR(.L_copyinout_error)
- add r9,r4,r5 /* Get the end of the destination */
- lwz r8,VMMAP_PMAP(r8)
- subi r9,r9,1 /* Make sure we don't go too far */
- add r8,r8,r12 ; Start indexing to the segment value
- stw r11,THREAD_RECOVER(r10)
- xor r9,r9,r4 /* Smoosh 'em together */
- lwz r8,PMAP_SEGS(r8) ; Get the source SR value
- rlwinm. r9,r9,0,1,3 /* Top nybble equal? */
- mtsr SR_COPYIN,r8
- isync
-
-
-/* For optimisation, we check if the copyout lies on a segment
- * boundary. If it doesn't, we can use a simple copy. If it
- * does, we split it into two separate copies in some C code.
- */
-
- bne- .L_call_copyout_multiple /* Nope, we went past the segment boundary... */
+ mfcr r2,0x10 // save caller's cr3, which we use for flags
+ mr r10,r3 // move high word of 64-bit user address (src) to r10
+ crclr kkString // not a string version
+ mr r11,r4 // move low word of 64-bit user address (src) to r11
+ crset kkIn // flag as copyin; execution falls through into copyJoin below
+
+
+// Common code to handle setup for all the copy variants:
+// r2 = caller's cr3
+// r3 = source if copyout
+// r5 = dest if copyin
+// r6 = buffer length or count
+// r7 = count output ptr (if kkString set)
+// r10 = high word of 64-bit user-space address (source if copyin, dest if copyout)
+// r11 = low word of 64-bit user-space address
+// cr3 = kkIn, kkString, kkNull flags
- rlwinm r4,r4,0,4,31
- oris r4,r4,(SR_COPYIN_NUM << (28-16)) /* Set the copyin segment as the source */
-
- bl EXT(bcopy)
-
-/* Now that copyout is done, we don't need a recovery point */
- mfsprg r6,1 /* Get the current act */
- addi r1,r1,FM_SIZE+16
- lwz r10,ACT_THREAD(r6)
- li r3,0
- lwz r0,FM_LR_SAVE(r1)
- stw r3,THREAD_RECOVER(r10) /* Clear recovery */
+copyJoin:
+ crclr kkNull // (dst==NULL) convention not used with this call
+copyJoin1: // enter from copyinstr with kkNull set
+ mflr r0 // get return address
+ cmplwi r6,0 // buffer length 0?
+ lis r9,0x1000 // r9 <- 0x10000000 (256MB)
+ stw r0,FM_LR_SAVE(r1) // save return
+ cmplw cr1,r6,r9 // buffer length > 256MB ?
+ mfsprg r8,2 // get the features
+ beq-- copyinout_0 // 0 length is degenerate case
+ stwu r1,-kkFrameSize(r1) // set up stack frame
+ stw r2,kkCR3(r1) // save caller's cr3, which we use for flags
+ mtcrf 0x02,r8 // move pf64Bit to cr6
+ stw r3,kkSource(r1) // save args across MapUserMemoryWindow
+ stw r5,kkDest(r1)
+ stw r6,kkBufSize(r1)
+ crmove kk64bit,pf64Bitb // remember if this is a 64-bit processor
+ stw r7,kkCountPtr(r1)
+ stw r31,kkR31Save(r1) // we use r31 globally for mapped user ptr
+
+
+
+// Handle buffer length > 256MB. This is an error (ENAMETOOLONG) on copyin and copyout.
+// The string ops are passed -1 lengths by some BSD callers, so for them we silently clamp
+// the buffer length to 256MB. This isn't an issue if the string is less than 256MB
+// (as most are!), but if they are >256MB we eventually return ENAMETOOLONG. This restriction
+// is due to MapUserMemoryWindow; we don't want to consume more than two segments for
+// the mapping.
+
+ ble++ cr1,copyin0 // skip if buffer length <= 256MB
+ bf kkString,copyinout_too_big // error if not string op
+ mr r6,r9 // silently clamp buffer length to 256MB
+ stw r9,kkBufSize(r1) // update saved copy too
+
+
+// Set up thread_recover in case we hit an illegal address.
+
+copyin0:
+ li r31,0 // no mapped ptr yet
+ mfsprg r8,1 // Get the current thread
+ lis r2,hi16(copyinout_error)
+ ori r2,r2,lo16(copyinout_error)
+ lwz r4,THREAD_RECOVER(r8)
+ lwz r3,ACT_VMMAP(r8) // r3 <- vm_map virtual address
+ stw r2,THREAD_RECOVER(r8)
+ stw r4,kkThrErrJmp(r1)
+
+
+// Map user segment into kernel map, turn on 64-bit mode. At this point:
+// r3 = vm map
+// r6 = buffer length
+// r10/r11 = 64-bit user-space ptr (source if copyin, dest if copyout)
+//
+// When we call MapUserMemoryWindow, we pass:
+// r3 = vm map ptr
+// r4/r5 = 64-bit user space address as an addr64_t
+
+ mr r4,r10 // copy user ptr into r4/r5
+ mr r5,r11
+#if INSTRUMENT
+ mfspr r12,pmc1 ; INSTRUMENT - saveinstr[13] - Take stamp before mapuseraddressspace
+ stw r12,0x6100+(13*16)+0x0(0) ; INSTRUMENT - Save it
+ mfspr r12,pmc2 ; INSTRUMENT - Get stamp
+ stw r12,0x6100+(13*16)+0x4(0) ; INSTRUMENT - Save it
+ mfspr r12,pmc3 ; INSTRUMENT - Get stamp
+ stw r12,0x6100+(13*16)+0x8(0) ; INSTRUMENT - Save it
+ mfspr r12,pmc4 ; INSTRUMENT - Get stamp
+ stw r12,0x6100+(13*16)+0xC(0) ; INSTRUMENT - Save it
+#endif
+ bl EXT(MapUserMemoryWindow) // get r3/r4 <- 64-bit address in kernel map of user operand
+#if INSTRUMENT
+ mfspr r12,pmc1 ; INSTRUMENT - saveinstr[14] - Take stamp after mapuseraddressspace
+ stw r12,0x6100+(14*16)+0x0(0) ; INSTRUMENT - Save it
+ mfspr r12,pmc2 ; INSTRUMENT - Get stamp
+ stw r12,0x6100+(14*16)+0x4(0) ; INSTRUMENT - Save it
+ mfspr r12,pmc3 ; INSTRUMENT - Get stamp
+ stw r12,0x6100+(14*16)+0x8(0) ; INSTRUMENT - Save it
+ mfspr r12,pmc4 ; INSTRUMENT - Get stamp
+ stw r12,0x6100+(14*16)+0xC(0) ; INSTRUMENT - Save it
+#endif
+ mr r31,r4 // r31 <- mapped ptr into user space (may be 64-bit)
+ bf-- kk64bit,copyin1 // skip if a 32-bit processor
+
+ rldimi r31,r3,32,0 // slam high-order bits into mapped ptr
+ mfmsr r4 // if 64-bit, turn on SF so we can use returned ptr
+ li r0,1
+ rldimi r4,r0,63,MSR_SF_BIT // light bit 0
+ mtmsrd r4 // turn on 64-bit mode
+ isync // wait for mode to change
+
+
+// Load r3-r5, substituting mapped ptr as appropriate.
+
+copyin1:
+ lwz r5,kkBufSize(r1) // restore length to copy
+ bf kkIn,copyin2 // skip if copyout
+ lwz r4,kkDest(r1) // copyin: dest is kernel ptr
+ mr r3,r31 // source is mapped ptr
+ b copyin3
+copyin2: // handle copyout
+ lwz r3,kkSource(r1) // source is kernel buffer (r3 at entry)
+ mr r4,r31 // dest is mapped ptr into user space
+
+
+// Finally, all set up to copy:
+// r3 = source ptr (mapped if copyin)
+// r4 = dest ptr (mapped if copyout)
+// r5 = length
+// r31 = mapped ptr returned by MapUserMemoryWindow
+// cr3 = kkIn, kkString, kk64bit, and kkNull flags
+
+copyin3:
+ bt kkString,copyString // handle copyinstr and copyoutstr
+ bl EXT(bcopy) // copyin and copyout: let bcopy do the work
+ li r3,0 // return success
+
+
+// Main exit point for copyin, copyout, copyinstr, and copyoutstr. Also reached
+// from error recovery if we get a DSI accessing user space. Clear recovery ptr,
+// and pop off frame.
+// r3 = 0, EFAULT, or ENAMETOOLONG
+
+copyinx:
+// Common epilogue. r3 already holds the return status and is not written here.
+// Undoes, in order: 64-bit mode (if kk64bit), the stack frame, the thread
+// recovery pointer, and the caller's cr3.
+ lwz r2,kkCR3(r1) // get callers cr3
+ mfsprg r6,1 // Get the current thread
+ bf-- kk64bit,copyinx1 // skip if 32-bit processor
+ mfmsr r12 // fetch msr so we can clear SF in it
+ rldicl r12,r12,0,MSR_SF_BIT+1 // if 64-bit processor, turn 64-bit mode off
+ mtmsrd r12 // turn SF off
+ isync // wait for the mode to change
+copyinx1:
+ lwz r0,FM_LR_SAVE+kkFrameSize(r1) // get return address
+ lwz r31,kkR31Save(r1) // restore callers r31
+ lwz r4,kkThrErrJmp(r1) // load saved thread recover
+ addi r1,r1,kkFrameSize // pop off our stack frame
mtlr r0 // return address back into lr
+ stw r4,THREAD_RECOVER(r6) // restore thread recover
+ mtcrf 0x10,r2 // restore cr3
blr // return, status in r3
-.L_call_copyout_multiple:
- /* unwind the stack */
- addi r1, r1, FM_SIZE+16
- lwz r0, FM_LR_SAVE(r1)
- mtlr r0
- b EXT(copyout_multiple) /* not a call - a jump! */
+/* We get here via the exception handler if an illegal
+ * user memory reference was made. This error handler is used by
+ * copyin, copyout, copyinstr, and copyoutstr. Registers are as
+ * they were at point of fault, so for example cr3 flags are valid.
+ * We simply load EFAULT as the return status and leave through the
+ * common exit (copyinx), which undoes 64-bit mode, the stack frame,
+ * and the thread recovery pointer.
+ */
+
+copyinout_error:
+ li r3,EFAULT // return error
+ b copyinx // join main exit routine
+
+copyinout_0: // nothing to copy: length is 0
+ li r3,0 // status <- success
+ mtcrf 0x10,r2 // cr3 <- image held in r2
+ blr // back to caller
+
+copyinout_too_big: // buffer length over the limit on a non-string op
+ li r3,ENAMETOOLONG // status <- ENAMETOOLONG
+ lwz r1,0(r1) // follow back chain to release our frame
+ mtcrf 0x10,r2 // cr3 <- image held in r2
+ blr // back to caller
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+// Handle copyinstr and copyoutstr. At this point the stack frame is set up,
+// the recovery ptr is set, the user's buffer is mapped, we're in 64-bit mode
+// if necessary, and:
+// r3 = source ptr, mapped if copyinstr
+// r4 = dest ptr, mapped if copyoutstr
+// r5 = buffer length
+// r31 = mapped ptr returned by MapUserMemoryWindow
+// cr3 = kkIn, kkString, kkNull, and kk64bit flags
+// We do word copies unless the buffer is very short, then use a byte copy loop
+// for the leftovers if necessary. The crossover at which the word loop becomes
+// faster is about seven bytes, counting the zero.
+//
+// We first must word-align the source ptr, in order to avoid taking a spurious
+// page fault.
+
+copyString:
+ cmplwi cr1,r5,15 // is buffer very short?
+ mr r12,r3 // remember ptr to 1st source byte
+ mtctr r5 // assuming short, set up loop count for bytes
+ blt-- cr1,copyinstr8 // too short for word loop
+ rlwinm r2,r3,0,0x3 // get byte offset of 1st byte within word
+ rlwinm r9,r3,3,0x18 // get bit offset of 1st byte within word
+ li r7,-1 // all ones, shifted below into the first-word byte mask
+ sub r3,r3,r2 // word-align source address
+ add r6,r5,r2 // get length starting at byte 0 in word
+ srw r7,r7,r9 // get mask for bytes in first word
+ srwi r0,r6,2 // get #words in buffer
+ lwz r5,0(r3) // get aligned word with first source byte
+ lis r10,hi16(0xFEFEFEFF) // load magic constants into r10 and r11
+ lis r11,hi16(0x80808080) // (high halves here, low halves or'd in below)
+ mtctr r0 // set up word loop count
+ addi r3,r3,4 // advance past the source word
+ ori r10,r10,lo16(0xFEFEFEFF) // r10 = 0xFEFEFEFF in low word
+ ori r11,r11,lo16(0x80808080) // r11 = 0x80808080 in low word
+ orc r8,r5,r7 // map bytes preceding first source byte into 0xFF
+ bt-- kkNull,copyinstr5enter // enter loop that just counts
+
+// Special case 1st word, which has been 0xFF filled on left. Note that we use
+// "and.", even though we execute both in 32 and 64-bit mode. This is OK.
+// r5 still holds the raw first word (r8 is the 0xFF-padded copy used for the test).
+
+ slw r5,r5,r9 // left justify payload bytes
+ add r9,r10,r8 // r9 = data + 0xFEFEFEFF
+ andc r7,r11,r8 // r7 = ~data & 0x80808080
+ subfic r0,r2,4 // get r0 <- #payload bytes in 1st word
+ and. r7,r9,r7 // if r7==0, then all bytes in r8 are nonzero
+ stw r5,0(r4) // copy payload bytes to dest buffer
+ add r4,r4,r0 // then point to next byte in dest buffer
+ bdnzt cr0_eq,copyinstr6 // use loop that copies if 0 not found
+
+ b copyinstr7 // 0 found (buffer can't be full)
+
+
+// Word loop(s). They do a word-parallel search for 0s, using the following
+// non-obvious but very efficient test:
+// y = data + 0xFEFEFEFF
+// z = ~data & 0x80808080
+// If (y & z)==0, then all bytes in dataword are nonzero. There are two copies
+// of this loop, one that just counts and another that copies.
+// r3 = ptr to next word of source (word aligned)
+// r4 = ptr to next byte in buffer
+// r6 = original buffer length (adjusted to be word origin)
+// r10 = 0xFEFEFEFF
+// r11 = 0x80808080
+// r12 = ptr to 1st source byte (used to determine string length)
+
+ .align 5 // align inner loops for speed
+copyinstr5: // version that counts but does not copy
+ lwz r8,0(r3) // get next word of source
+ addi r3,r3,4 // advance past it
+copyinstr5enter: // entry from copyString: r8 already holds the (padded) 1st word
+ add r9,r10,r8 // r9 = data + 0xFEFEFEFF
+ andc r7,r11,r8 // r7 = ~data & 0x80808080
+ and. r7,r9,r7 // r7 = r9 & r7 ("." ok even in 64-bit mode)
+ bdnzt cr0_eq,copyinstr5 // if r7==0, then all bytes in r8 are nonzero
+
+ b copyinstr7 // 0 found or word count exhausted; cr0_eq tells which
+
+ .align 5 // align inner loops for speed
+copyinstr6: // version that counts and copies
+ lwz r8,0(r3) // get next word of source
+ addi r3,r3,4 // advance past it
+ addi r4,r4,4 // increment dest ptr while we wait for data
+ add r9,r10,r8 // r9 = data + 0xFEFEFEFF
+ andc r7,r11,r8 // r7 = ~data & 0x80808080
+ and. r7,r9,r7 // r7 = r9 & r7 ("." ok even in 64-bit mode)
+ stw r8,-4(r4) // pack all 4 bytes into buffer
+ bdnzt cr0_eq,copyinstr6 // if r7==0, then all bytes are nonzero
+ // loop exit falls through into copyinstr7
+
+
+// Either 0 found or buffer filled. The above algorithm has mapped nonzero bytes to 0
+// and 0 bytes to 0x80 with one exception: 0x01 bytes preceding the first 0 are also
+// mapped to 0x80. We must mask out these false hits before searching for an 0x80 byte.
+// r3 = word aligned ptr to next word of source (ie, r8==mem(r3-4))
+// r6 = original buffer length (adjusted to be word origin)
+// r7 = computed vector of 0x00 and 0x80 bytes
+// r8 = original source word, coming from -4(r3), possibly padded with 0xFFs on left if 1st word
+// r12 = ptr to 1st source byte (used to determine string length)
+// cr0 = beq set iff 0 not found
+
+copyinstr7:
+ rlwinm r2,r8,7,0,31 // move 0x01 bits to 0x80 position
+ rlwinm r6,r6,0,0x3 // mask down to partial byte count in last word
+ andc r7,r7,r2 // turn off false hits from 0x0100 worst case
+ crnot kkZero,cr0_eq // 0 found iff cr0_eq is off
+ srwi r7,r7,8 // we want to count the 0 as a byte xferred
+ cmpwi r6,0 // any bytes left over in last word?
+ cntlzw r7,r7 // now we can find the 0 byte (ie, the 0x80)
+ subi r3,r3,4 // back up r3 to point to 1st byte in r8
+ srwi r7,r7,3 // convert 8,16,24,32 to 1,2,3,4
+ add r3,r3,r7 // now r3 points one past 0 byte, or at 1st byte not xferred
+ bt++ kkZero,copyinstr10 // 0 found, so done
+
+// No 0 in the words scanned. cr0 now reflects the "cmpwi r6,0" above (kkZero was
+// captured before that compare), so beq means there are no leftover bytes.
+ beq copyinstr10 // r6==0, so buffer truly full
+ mtctr r6 // 0 not found, loop over r6 bytes
+ b copyinstr8 // enter byte loop for last 1-3 leftover bytes
+
+
+// Byte loop. This is used for very small buffers and for the odd bytes left over
+// after searching and copying words at a time.
+// r3 = ptr to next byte of source
+// r4 = ptr to next dest byte
+// r12 = ptr to first byte of source
+// ctr = count of bytes to check
+
+ .align 5 // align inner loops for speed
+copyinstr8: // loop over bytes of source
+ lbz r0,0(r3) // get next byte of source
+ addi r3,r3,1 // advance source ptr past it
+ addi r4,r4,1 // increment dest addr whether we store or not
+ cmpwi r0,0 // the 0?
+ bt-- kkNull,copyinstr9 // don't store if copyinstr with NULL ptr
+ stb r0,-1(r4) // store the byte (r4 was already advanced)
+copyinstr9:
+ bdnzf cr0_eq,copyinstr8 // loop if byte not 0 and more room in buffer
+
+ crmove kkZero,cr0_eq // remember if 0 found or buffer filled
+ // falls through into copyinstr10
+
+
+// Buffer filled or 0 found. Unwind and return.
+// r3 = ptr to 1st source byte not transferred
+// r12 = ptr to 1st source byte
+// r31 = mapped ptr returned by MapUserMemoryWindow
+// cr3 = kkZero set iff 0 found
+
+copyinstr10:
+// Store the transfer count through the saved count pointer and pick the return
+// status: 0 if the terminating 0 was seen (kkZero), else ENAMETOOLONG.
+ lwz r9,kkCountPtr(r1) // get ptr to place to store count of bytes moved
+ sub r2,r3,r12 // compute #bytes copied (including the 0)
+ li r3,0 // assume success return status
+ stw r2,0(r9) // store #bytes moved
+ bt++ kkZero,copyinx // we did find the 0 so return 0
+ li r3,ENAMETOOLONG // buffer filled
+ b copyinx // join main exit routine
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
/*
- * boolean_t
- * copyinstr(src, dst, count, maxcount)
- * vm_offset_t src;
- * vm_offset_t dst;
- * vm_size_t maxcount;
- * vm_size_t* count;
+ * int
+ * copypv(source, sink, size, which)
+ * addr64_t src; // r3 and r4
+ * addr64_t dst; // r5 and r6
+ * size_t size; // r7
+ * int which; // r8
*
- * Set *count to the number of bytes copied
- *
- * If dst == NULL, don't copy, just count bytes.
- * Only currently called from klcopyinstr.
+ * Operand size bytes are copied from operand src into operand dst. The source and
+ * destination operand addresses are given as addr64_t, and may designate starting
+ * locations in physical or virtual memory in any combination except where both are
+ * virtual. Virtual memory locations may be in either the kernel or the current thread's
+ * address space. Operand size may be up to 256MB.
+ *
+ * Operation is controlled by operand which, which offers these options:
+ * cppvPsrc : source operand is (1) physical or (0) virtual
+ * cppvPsnk : destination operand is (1) physical or (0) virtual
+ * cppvKmap : virtual operand is in (1) kernel or (0) current thread
+ * cppvFsnk : (1) flush destination before and after transfer
+ * cppvFsrc : (1) flush source before and after transfer
+ * cppvNoModSnk : (1) don't set destination operand's changed bit(s)
+ * cppvNoRefSrc : (1) don't set source operand's referenced bit(s)
+ *
+ * Implementation is now split into this new 64-bit path and the old path, hw_copypv_32().
+ * This section describes the operation of the new 64-bit path.
+ *
+ * The 64-bit path utilizes the more capacious 64-bit kernel address space to create a
+ * window in the kernel address space into all of physical RAM plus the I/O hole. Since
+ * the window's mappings specify the proper access policies for the underlying memory,
+ * the new path does not have to flush caches to avoid a cache paradox, so cppvFsnk
+ * and cppvFsrc are ignored. Physical operand addresses are relocated into the physical
+ * memory window, and are accessed with data relocation on. Virtual addresses are either
+ * within the kernel, or are mapped into the kernel address space through the user memory
+ * window. Because accesses to a virtual operand are performed with data relocation on,
+ * the new path does not have to translate the address, disable/enable interrupts, lock
+ * the mapping, or update referenced and changed bits.
+ *
+ * The IBM 970 (a.k.a. G5) processor treats real-mode accesses as guarded, so there is
+ * a substantial performance penalty for copypv operating in real mode. Utilizing the
+ * new 64-bit path, transfer performance increases >100% on the G5.
+ *
+ * The attentive reader may notice that mtmsrd ops are not followed by isync ops as
+ * might be expected. The 970 follows PowerPC architecture version 2.01, which defines
+ * mtmsrd with L=0 as a context synchronizing op, so a following isync is no longer
+ * required.
+ *
+ * To keep things exciting, we develop 64-bit values in non-volatiles, but we also need
+ * to call 32-bit functions, which would lead to the high-order 32 bits of our values
+ * getting clobbered unless we do something special. So, we preserve our 64-bit non-volatiles
+ * in our own stack frame across calls to 32-bit functions.
+ *
*/
-ENTRY(copyinstr, TAG_NO_FRAME_USED)
+// Map operand which bits into non-volatile CR2 and CR3 bits.
+// copypv rotates 'which' left by whichAlign and masks it with whichMask, then moves the
+// result into cr2/cr3 with mtcrf. The pv* names below are the resulting CR bit numbers:
+// each is the corresponding cppv*b bit number less whichAlign.
+// whichMask (0x007F0000) keeps seven bits and leaves the first bit of cr2 clear; that bit
+// (cr2_lt) is used as pvNoCache, a flag computed by copypv itself rather than passed in.
+#define whichAlign ((3+1)*4)
+#define whichMask 0x007F0000
+#define pvPsnk (cppvPsnkb - whichAlign)
+#define pvPsrc (cppvPsrcb - whichAlign)
+#define pvFsnk (cppvFsnkb - whichAlign)
+#define pvFsrc (cppvFsrcb - whichAlign)
+#define pvNoModSnk (cppvNoModSnkb - whichAlign)
+#define pvNoRefSrc (cppvNoRefSrcb - whichAlign)
+#define pvKmap (cppvKmapb - whichAlign)
+#define pvNoCache cr2_lt
-/* Preamble allowing us to call a sub-function */
- mflr r0
- stw r0,FM_LR_SAVE(r1)
- stwu r1,-(FM_SIZE+16)(r1)
+ .align 5
+ .globl EXT(copypv)
-#if 0
- stw r3,FM_SIZE+0(r1) /* (TEST/DEBUG) */
- stw r4,FM_SIZE+4(r1) /* (TEST/DEBUG) */
- stw r5,FM_SIZE+8(r1) /* (TEST/DEBUG) */
- stw r6,FM_SIZE+12(r1) /* (TEST/DEBUG) */
- mr r7,r0 /* (TEST/DEBUG) */
+LEXT(copypv)
+ mfsprg r10,2 // get feature flags
+ mtcrf 0x02,r10 // we need to test pf64Bit
+ bt++ pf64Bitb,copypv_64 // skip if 64-bit processor (only they take hint)
+
+ b EXT(hw_copypv_32) // carry on with 32-bit copypv
+
+// Push a 32-bit ABI-compliant stack frame and preserve all non-volatiles that we'll clobber.
+// Frame slots, as used throughout this routine (offsets from FM_ARG0):
+// 0x00-0x14 caller's r26-r31 (low words)
+// 0x18 scratch passed by address to mapping_find (nextva) and
+// mapping_phys_lookup (pindex)
+// 0x20 caller's THREAD_RECOVER value
+// 0x28-0x47 doubleword slots for keeping 64-bit non-volatiles across 32-bit calls
+copypv_64:
+ mfsprg r9,1 // get current thread
+ stwu r1,-(FM_ALIGN((31-26+11)*4)+FM_SIZE)(r1)
+ // allocate stack frame and link it
+ mflr r0 // get return address
+ mfcr r10 // get cr2 and cr3
+ lwz r12,THREAD_RECOVER(r9) // get error callback
+ stw r26,FM_ARG0+0x00(r1) // save non-volatile r26
+ stw r27,FM_ARG0+0x04(r1) // save non-volatile r27
+ stw r28,FM_ARG0+0x08(r1) // save non-volatile r28
+ stw r29,FM_ARG0+0x0C(r1) // save non-volatile r29
+ stw r30,FM_ARG0+0x10(r1) // save non-volatile r30
+ stw r31,FM_ARG0+0x14(r1) // save non-volatile r31
+ stw r12,FM_ARG0+0x20(r1) // save error callback
+ stw r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
+ // save return address
+ stw r10,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
+ // save non-volatile cr2 and cr3
+
+// Non-volatile register usage in this routine is:
+// r26: saved msr image
+// r27: current pmap_t / virtual source address
+// r28: destination virtual address
+// r29: source address
+// r30: destination address
+// r31: byte count to copy
+// cr2/3: parameter 'which' bits
+
+ rlwinm r8,r8,whichAlign,whichMask // align and mask which bits
+ mr r31,r7 // copy size to somewhere non-volatile
+ mtcrf 0x20,r8 // insert which bits into cr2 and cr3
+ mtcrf 0x10,r8 // insert which bits into cr2 and cr3
+ rlwinm r29,r3,0,1,0 // form source address high-order bits
+ rlwinm r30,r5,0,1,0 // form destination address high-order bits
+ rlwimi r29,r4,0,0,31 // form source address low-order bits
+ rlwimi r30,r6,0,0,31 // form destination address low-order bits
+ crand cr7_lt,pvPsnk,pvPsrc // are both operand addresses physical?
+ cntlzw r0,r31 // count leading zeroes in byte count
+ cror cr7_eq,pvPsnk,pvPsrc // cr7_eq <- source or destination is physical
+ bf-- cr7_eq,copypv_einval // both operands may not be virtual
+ cmplwi r0,4 // byte count greater than or equal 256M (2**28)?
+ blt-- copypv_einval // byte count too big, give EINVAL
+ cmplwi r31,0 // byte count zero?
+ beq-- copypv_zero // early out
+ bt cr7_lt,copypv_phys // both operand addresses are physical
+ mr r28,r30 // assume destination is virtual
+ bf pvPsnk,copypv_dv // is destination virtual?
+ mr r28,r29 // no, so source must be virtual
+copypv_dv:
+ lis r27,ha16(EXT(kernel_pmap)) // get kernel's pmap_t *, high-order
+ lwz r27,lo16(EXT(kernel_pmap))(r27) // get kernel's pmap_t
+ bt pvKmap,copypv_kern // virtual address in kernel map?
+ lwz r3,ACT_VMMAP(r9) // get user's vm_map *
+ rldicl r4,r28,32,32 // r4, r5 <- addr64_t virtual address
+ rldicl r5,r28,0,32
+ std r29,FM_ARG0+0x30(r1) // preserve 64-bit r29 across 32-bit call
+ std r30,FM_ARG0+0x38(r1) // preserve 64-bit r30 across 32-bit call
+ bl EXT(MapUserMemoryWindow) // map slice of user space into kernel space
+ ld r29,FM_ARG0+0x30(r1) // restore 64-bit r29
+ ld r30,FM_ARG0+0x38(r1) // restore 64-bit r30
+ rlwinm r28,r3,0,1,0 // convert relocated addr64_t virtual address
+ rlwimi r28,r4,0,0,31 // into a single 64-bit scalar
+copypv_kern:
+
+// Since we'll be accessing the virtual operand with data-relocation on, we won't need to
+// update the referenced and changed bits manually after the copy. So, force the appropriate
+// flag bit on for the virtual operand.
+ crorc pvNoModSnk,pvNoModSnk,pvPsnk // for virtual dest, let hardware do ref/chg bits
+ crorc pvNoRefSrc,pvNoRefSrc,pvPsrc // for virtual source, let hardware do ref bit
- bl EXT(tracecopystr) /* (TEST/DEBUG) */
+// We'll be finding a mapping and looking at it, so we need to disable interrupts.
+ lis r0,hi16(MASK(MSR_VEC)) // get vector mask
+ ori r0,r0,lo16(MASK(MSR_FP)) // insert fp mask
+ mfmsr r26 // save current msr
+ andc r26,r26,r0 // turn off VEC and FP in saved copy
+ ori r0,r0,lo16(MASK(MSR_EE)) // add EE to our mask
+ andc r0,r26,r0 // disable EE in our new msr image
+ mtmsrd r0 // introduce new msr image
+
+// We're now holding the virtual operand's pmap_t in r27 and its virtual address in r28. We now
+// try to find a mapping corresponding to this address in order to determine whether the address
+// is cacheable. If we don't find a mapping, we can safely assume that the operand is cacheable
+// (a non-cacheable operand must be a block mapping, which will always exist); otherwise, we
+// examine the mapping's caching-inhibited bit.
+ mr r3,r27 // r3 <- pmap_t pmap
+ rldicl r4,r28,32,32 // r4, r5 <- addr64_t va
+ rldicl r5,r28,0,32
+ la r6,FM_ARG0+0x18(r1) // r6 <- addr64_t *nextva
+ li r7,1 // r7 <- int full, search nested mappings
+ std r26,FM_ARG0+0x28(r1) // preserve 64-bit r26 across 32-bit calls
+ std r28,FM_ARG0+0x30(r1) // preserve 64-bit r28 across 32-bit calls
+ std r29,FM_ARG0+0x38(r1) // preserve 64-bit r29 across 32-bit calls
+ std r30,FM_ARG0+0x40(r1) // preserve 64-bit r30 across 32-bit calls
+ bl EXT(mapping_find) // find mapping for virtual operand
+ mr. r3,r3 // did we find it?
+ beq copypv_nomapping // nope, so we'll assume it's cacheable
+ lwz r4,mpVAddr+4(r3) // get low half of virtual addr for hw flags
+ rlwinm. r4,r4,0,mpIb-32,mpIb-32 // caching-inhibited bit set?
+ crnot pvNoCache,cr0_eq // if it is, use bcopy_nc
+ bl EXT(mapping_drop_busy) // drop busy on the mapping
+copypv_nomapping:
+ ld r26,FM_ARG0+0x28(r1) // restore 64-bit r26
+ ld r28,FM_ARG0+0x30(r1) // restore 64-bit r28
+ ld r29,FM_ARG0+0x38(r1) // restore 64-bit r29
+ ld r30,FM_ARG0+0x40(r1) // restore 64-bit r30
+ mtmsrd r26 // restore msr to its previous state
+
+// Set both the source and destination virtual addresses to the virtual operand's address --
+// we'll overlay one of them with the physical operand's address.
+ mr r27,r28 // make virtual operand BOTH source AND destination
+
+// Now we're ready to relocate the physical operand address(es) into the physical memory window.
+// Recall that we've mapped physical memory (including the I/O hole) into the kernel's address
+// space somewhere at or over the 2**32 line. If one or both of the operands are in the I/O hole,
+// we'll set the pvNoCache flag, forcing use of non-caching bcopy_nc() to do the copy.
+// (A 64-bit value has exactly 32 leading zeros iff it lies in 2**31 .. 2**32-1.)
+copypv_phys:
+ ld r6,lgPMWvaddr(0) // get physical memory window virtual address
+ bf pvPsnk,copypv_dstvirt // is destination address virtual?
+ cntlzd r4,r30 // count leading zeros in destination address
+ cmplwi r4,32 // if it's 32, then it's in the I/O hole (2**31 to 2**32-1)
+ cror pvNoCache,cr0_eq,pvNoCache // use bcopy_nc for I/O hole locations
+ add r28,r30,r6 // relocate physical destination into physical window
+copypv_dstvirt:
+ bf pvPsrc,copypv_srcvirt // is source address virtual?
+ cntlzd r4,r29 // count leading zeros in source address
+ cmplwi r4,32 // if it's 32, then it's in the I/O hole (2**31 to 2**32-1)
+ cror pvNoCache,cr0_eq,pvNoCache // use bcopy_nc for I/O hole locations
+ add r27,r29,r6 // relocate physical source into physical window
+copypv_srcvirt:
+
+// Once the copy is under way (bcopy or bcopy_nc), we will want to get control if anything
+// funny happens during the copy. So, we set a pointer to our error handler in the per-thread
+// control block.
+ mfsprg r8,1 // get current threads stuff
+ lis r3,hi16(copypv_error) // get our error callback's address, high
+ ori r3,r3,lo16(copypv_error) // get our error callback's address, low
+ stw r3,THREAD_RECOVER(r8) // set our error callback
- lwz r3,FM_SIZE+0(r1) /* (TEST/DEBUG) */
- lwz r4,FM_SIZE+4(r1) /* (TEST/DEBUG) */
- lwz r5,FM_SIZE+8(r1) /* (TEST/DEBUG) */
- stw r6,FM_SIZE+12(r1) /* (TEST/DEBUG) */
+// Since our physical operand(s) are relocated at or above the 2**32 line, we must enter
+// 64-bit mode. No isync follows the mtmsrd; see the routine header for why.
+ li r0,1 // get a handy one bit
+ mfmsr r3 // get current msr
+ rldimi r3,r0,63,MSR_SF_BIT // set SF bit on in our msr copy
+ mtmsrd r3 // enter 64-bit mode
+
+// If requested, flush data cache
+// Note that we don't flush, the code is being saved "just in case".
+// (Compiled out: on this path cppvFsrc and cppvFsnk are ignored.)
+#if 0
+ bf pvFsrc,copypv_nfs // do we flush the source?
+ rldicl r3,r27,32,32 // r3, r4 <- addr64_t source virtual address
+ rldicl r4,r27,0,32
+ mr r5,r31 // r5 <- count (in bytes)
+ li r6,0 // r6 <- boolean phys (false, not physical)
+ bl EXT(flush_dcache) // flush the source operand
+copypv_nfs:
+ bf pvFsnk,copypv_nfdx // do we flush the destination?
+ rldicl r3,r28,32,32 // r3, r4 <- addr64_t destination virtual address
+ rldicl r4,r28,0,32
+ mr r5,r31 // r5 <- count (in bytes)
+ li r6,0 // r6 <- boolean phys (false, not physical)
+ bl EXT(flush_dcache) // flush the destination operand
+copypv_nfdx:
#endif
-
- cmpli cr0,r5,0
- ble- cr0,.L_copyinout_trivial
-/* we know we have a valid copyin to do now */
-/* Set up thread_recover in case we hit an illegal address */
-
- li r0,0
- mfsprg r8,1 /* Get the current act */
- lwz r10,ACT_THREAD(r8)
- stw r0,0(r6) /* Clear result length */
- lis r11,HIGH_ADDR(.L_copyinout_error)
- lwz r8,ACT_VMMAP(r8) ; Get the map for this activation
- rlwinm r12,r3,6,26,29 ; Get index to the segment slot
- lwz r8,VMMAP_PMAP(r8)
- ori r11,r11,LOW_ADDR(.L_copyinout_error)
- add r8,r8,r12 ; Start indexing to the segment value
- stw r11,THREAD_RECOVER(r10)
- rlwinm r3,r3,0,4,31
- lwz r7,PMAP_SEGS(r8) ; Get the source SR value
- oris r3,r3,(SR_COPYIN_NUM << (28-16)) /* Set the copyin segment as the source */
-
-/* Copy byte by byte for now - TODO NMGS speed this up with
- * some clever (but fairly standard) logic for word copies.
- * We don't use a copyinstr_multiple since copyinstr is called
- * with INT_MAX in the linux server. Eugh.
- */
+// Call bcopy or bcopy_nc to perform the copy.
+// pvNoCache selects the non-caching variant; it is set earlier if the virtual operand's
+// mapping is caching-inhibited or a physical operand lies in the I/O hole.
+ mr r3,r27 // r3 <- source virtual address
+ mr r4,r28 // r4 <- destination virtual address
+ mr r5,r31 // r5 <- bytes to copy
+ bt pvNoCache,copypv_nc // take non-caching route
+ bl EXT(bcopy) // call bcopy to do the copying
+ b copypv_copydone
+copypv_nc:
+ bl EXT(bcopy_nc) // call bcopy_nc to do the copying
+copypv_copydone:
- li r9,0 /* Clear byte counter */
+// If requested, flush data cache
+// Note that we don't flush, the code is being saved "just in case".
+// (Compiled out: on this path cppvFsrc and cppvFsnk are ignored.)
+#if 0
+ bf pvFsrc,copypv_nfsx // do we flush the source?
+ rldicl r3,r27,32,32 // r3, r4 <- addr64_t source virtual address
+ rldicl r4,r27,0,32
+ mr r5,r31 // r5 <- count (in bytes)
+ li r6,0 // r6 <- boolean phys (false, not physical)
+ bl EXT(flush_dcache) // flush the source operand
+copypv_nfsx:
+ bf pvFsnk,copypv_nfd // do we flush the destination?
+ rldicl r3,r28,32,32 // r3, r4 <- addr64_t destination virtual address
+ rldicl r4,r28,0,32
+ mr r5,r31 // r5 <- count (in bytes)
+ li r6,0 // r6 <- boolean phys (false, not physical)
+ bl EXT(flush_dcache) // flush the destination operand
+copypv_nfd:
+#endif
-/* If the destination is NULL, don't do writes,
- * just count bytes. We set CR7 outside the loop to save time
- */
- cmpwi cr7,r4,0 /* Is the destination null? */
-
-nxtseg: mtsr SR_COPYIN,r7 /* Set the source SR */
- isync
+// Leave 64-bit mode. The same three instructions appear in copypv_error, which is
+// entered instead of this point when the copy faults.
+ mfmsr r3 // get current msr
+ rldicl r3,r3,0,MSR_SF_BIT+1 // clear SF bit in our copy
+ mtmsrd r3 // leave 64-bit mode
-.L_copyinstr_loop:
- lbz r0,0(r3) /* Get the source */
- addic. r5,r5,-1 /* Have we gone far enough? */
- addi r3,r3,1 /* Bump source pointer */
+// If requested, set ref/chg on source/dest physical operand(s). It is possible that copy is
+// from/to a RAM disk situated outside of mapped physical RAM, so we check each page by calling
+// mapping_phys_lookup() before we try to set its ref/chg bits; otherwise, we might panic.
+// Note that this code is page-size sensitive, so it should probably be a part of our low-level
+// code in hw_vm.s.
+//
+// For each operand the number of 4K pages touched is
+// ((address & 0xFFF) + byte count - 1) / 4096 + 1
+// The byte count in r31 was validated at entry to be in 1..2**28-1, so the sum fits in
+// 32 bits and is computed with word ops (only the low word of r31 is relied upon).
+// Within each loop:
+// r26 = pages still to be marked
+// r27 = current physical page number, advanced by one per iteration
+// Both are non-volatile, so their low words survive the 32-bit calls. A loop ends early
+// at the first page mapping_phys_lookup() does not know about.
+ bt pvNoModSnk,copypv_nomod // skip destination update if not requested
+ std r29,FM_ARG0+0x30(r1) // preserve 64-bit r29 across 32-bit calls
+ rlwinm r3,r30,0,20,31 // r3 <- destination offset within its first page
+ add r3,r3,r31 // r3 <- offset one past last destination byte
+ subi r3,r3,1 // r3 <- offset of last destination byte
+ srwi r26,r3,12 // r26 <- index of last page touched (first is 0)
+ addi r26,r26,1 // r26 <- number of 4K pages touched
+ srdi r27,r30,12 // r27 <- first destination page number
+copypv_modloop:
+ mr r3,r27 // r3 <- destination page number
+ la r4,FM_ARG0+0x18(r1) // r4 <- unsigned int *pindex
+ bl EXT(mapping_phys_lookup) // see if page is really there
+ mr. r3,r3 // is it?
+ beq-- copypv_modend // nope, break out of modify loop
+ mr r3,r27 // r3 <- destination page number
+ bl EXT(mapping_set_mod) // set page changed status
+ addi r27,r27,1 // step to next destination page
+ subi r26,r26,1 // decrement page count
+ cmpwi r26,0 // done yet?
+ bgt copypv_modloop // nope, iterate
+copypv_modend:
+ ld r29,FM_ARG0+0x30(r1) // restore 64-bit r29
+copypv_nomod:
+ bt pvNoRefSrc,copypv_done // skip source update if not requested
+copypv_debugref:
+ rlwinm r3,r29,0,20,31 // r3 <- source offset within its first page
+ add r3,r3,r31 // r3 <- offset one past last source byte
+ subi r3,r3,1 // r3 <- offset of last source byte
+ srwi r26,r3,12 // r26 <- index of last page touched (first is 0)
+ addi r26,r26,1 // r26 <- number of 4K pages touched
+ srdi r27,r29,12 // r27 <- first source page number
+copypv_refloop:
+ mr r3,r27 // r3 <- source page number
+ la r4,FM_ARG0+0x18(r1) // r4 <- unsigned int *pindex
+ bl EXT(mapping_phys_lookup) // see if page is really there
+ mr. r3,r3 // is it?
+ beq-- copypv_done // nope, break out of reference loop
+ mr r3,r27 // r3 <- source page number
+ bl EXT(mapping_set_ref) // set page referenced status
+ addi r27,r27,1 // step to next source page
+ subi r26,r26,1 // decrement page count
+ cmpwi r26,0 // done yet?
+ bgt copypv_refloop // nope, iterate
- cmpwi cr1,r0,0 /* Did we hit a null? */
+// Return, indicating success.
+// copypv_done is reached after the copy and any ref/chg updates; copypv_zero is the
+// early out taken at entry for a zero byte count.
+copypv_done:
+copypv_zero:
+ li r3,0 // our efforts were crowned with success
- beq cr7,.L_copyinstr_no_store /* If we are just counting, skip the store... */
-
- stb r0,0(r4) /* Move to sink */
- addi r4,r4,1 /* Advance sink pointer */
+// Pop frame, restore caller's non-volatiles, clear recovery routine pointer.
+// r3 (the return status) is not written here. The value put back into THREAD_RECOVER
+// is whatever was there when copypv_64 was entered.
+copypv_return:
+ mfsprg r9,1 // get current threads stuff
+ lwz r0,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_LR_SAVE)(r1)
+ // get return address
+ lwz r4,(FM_ALIGN((31-26+11)*4)+FM_SIZE+FM_CR_SAVE)(r1)
+ // get non-volatile cr2 and cr3
+ lwz r26,FM_ARG0+0x00(r1) // restore non-volatile r26
+ lwz r27,FM_ARG0+0x04(r1) // restore non-volatile r27
+ mtlr r0 // restore return address
+ lwz r28,FM_ARG0+0x08(r1) // restore non-volatile r28
+ mtcrf 0x20,r4 // restore non-volatile cr2
+ mtcrf 0x10,r4 // restore non-volatile cr3
+ lwz r11,FM_ARG0+0x20(r1) // get saved error callback
+ lwz r29,FM_ARG0+0x0C(r1) // restore non-volatile r29
+ lwz r30,FM_ARG0+0x10(r1) // restore non-volatile r30
+ lwz r31,FM_ARG0+0x14(r1) // restore non-volatile r31
+ stw r11,THREAD_RECOVER(r9) // restore caller's error callback
+ lwz r1,0(r1) // release stack frame
+
+ blr // y'all come back now
-.L_copyinstr_no_store:
+// Invalid argument handler.
+// Taken from the entry checks when both operands are virtual or the byte count is
+// 2**28 or more. The frame is already pushed at that point, so we leave through
+// copypv_return.
+copypv_einval:
+ li r3,EINVAL // invalid argument
+ b copypv_return // return
- addi r9,r9,1 /* Count the character */
- beq- cr1,.L_copyinstr_done /* We're done if we did a null... */
- beq- cr0,L_copyinstr_toobig /* Also if we maxed the count... */
-
-/* Check to see if the copyin pointer has moved out of the
- * copyin segment, if it has we must remap.
- */
-
- rlwinm. r0,r3,0,4,31 /* Did we wrap around to 0? */
- bne+ cr0,.L_copyinstr_loop /* Nope... */
-
- lwz r7,PMAP_SEGS+4(r8) ; Get the next source SR value
- addi r8,r8,4 ; Point to the next segment
- oris r3,r0,(SR_COPYIN_NUM << (28-16)) /* Reset the segment number */
- b nxtseg /* Keep going... */
-
-L_copyinstr_toobig:
- li r3,ENAMETOOLONG
- b L_copyinstr_return
-.L_copyinstr_done:
- li r3,0 /* Normal return */
-L_copyinstr_return:
- li r4,0 /* to clear thread_recover */
- stw r9,0(r6) /* Set how many bytes we did */
- stw r4,THREAD_RECOVER(r10) /* Clear recovery exit */
-
- addi r1, r1, FM_SIZE+16
- lwz r0, FM_LR_SAVE(r1)
- mtlr r0
- blr
+// Error encountered during bcopy or bcopy_nc.
+// This is the address copypv stores into THREAD_RECOVER before starting the copy.
+// The copy runs with SF on, so we leave 64-bit mode first, then return EFAULT
+// through the normal epilogue.
+copypv_error:
+ mfmsr r3 // get current msr
+ rldicl r3,r3,0,MSR_SF_BIT+1 // clear SF bit in our copy
+ mtmsrd r3 // leave 64-bit mode
+ li r3,EFAULT // it was all his fault
+ b copypv_return // return