X-Git-Url: https://git.saurik.com/apple/xnu.git/blobdiff_plain/9bccf70c0258c7cac2dcb80011b2a964d884c552..a3d08fcd5120d2aa8303b6349ca8b14e3f284af3:/osfmk/ppc/movc.s diff --git a/osfmk/ppc/movc.s b/osfmk/ppc/movc.s index cb3188f33..6ef231f25 100644 --- a/osfmk/ppc/movc.s +++ b/osfmk/ppc/movc.s @@ -1,5 +1,5 @@ /* - * Copyright (c) 2000 Apple Computer, Inc. All rights reserved. + * Copyright (c) 2000-2002 Apple Computer, Inc. All rights reserved. * * @APPLE_LICENSE_HEADER_START@ * @@ -29,558 +29,876 @@ #include #include +#define INSTRUMENT 0 + +//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> /* * void pmap_zero_page(vm_offset_t pa) * - * zero a page of physical memory. + * Zero a page of physical memory. This routine runs in 32 or 64-bit mode, + * and handles 32 and 128-byte cache lines. */ -#if DEBUG - /* C debug stub in pmap.c calls this */ -ENTRY(pmap_zero_page_assembler, TAG_NO_FRAME_USED) -#else -ENTRY(pmap_zero_page, TAG_NO_FRAME_USED) -#endif /* DEBUG */ - - mfmsr r6 /* Get the MSR */ - rlwinm r6,r6,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off - rlwinm r6,r6,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off - rlwinm r7, r6, 0, MSR_DR_BIT+1, MSR_DR_BIT-1 /* Turn off DR */ - rlwinm r7,r7,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; Disable interruptions - li r4,PPC_PGBYTES-CACHE_LINE_SIZE /* Point to the end of the page */ - mtmsr r7 /* Set MSR to DR off */ - isync /* Ensure data translations are off */ - - -.L_phys_zero_loop: - subic. r5,r4,CACHE_LINE_SIZE /* Point to the next one */ - dcbz r4, r3 /* Clear the whole thing to 0s */ - subi r4,r5,CACHE_LINE_SIZE /* Point to the next one */ - dcbz r5, r3 /* Clear the next to zeros */ - bgt+ .L_phys_zero_loop /* Keep going until we do the page... */ - - sync /* Make sure they're all done */ - li r4,PPC_PGBYTES-CACHE_LINE_SIZE /* Point to the end of the page */ - -.L_inst_inval_loop: - subic. r5,r4,CACHE_LINE_SIZE /* Point to the next one */ - icbi r4, r3 /* Clear the whole thing to 0s */ - subi r4,r5,CACHE_LINE_SIZE /* Point to the next one */ - icbi r5, r3 /* Clear the next to zeros */ - bgt+ .L_inst_inval_loop /* Keep going until we do the page... */ - - sync /* Make sure they're all done */ - - mtmsr r6 /* Restore original translations */ - isync /* Ensure data translations are on */ - blr + .align 5 + .globl EXT(pmap_zero_page) + +LEXT(pmap_zero_page) + + mflr r12 // save return address + bl EXT(ml_set_physical_disabled) // turn DR and EE off, SF on, get features in r10 + mtlr r12 // restore return address + andi. r9,r10,pf32Byte+pf128Byte // r9 <- cache line size + subfic r4,r9,PPC_PGBYTES // r4 <- starting offset in page + + bt++ pf64Bitb,page0S4 // Go do the big guys... + + slwi r3,r3,12 // get page address from page num + b page_zero_1 // Jump to line aligned loop... + + .align 5 + + nop + nop + nop + nop + nop + nop + nop + +page0S4: + sldi r3,r3,12 // get page address from page num + +page_zero_1: // loop zeroing cache lines + sub. r5,r4,r9 // more to go? + dcbz128 r3,r4 // zero either 32 or 128 bytes + sub r4,r5,r9 // generate next offset + dcbz128 r3,r5 + bne-- page_zero_1 + + b EXT(ml_restore) // restore MSR and do the isync + + +//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> /* void * phys_copy(src, dst, bytecount) - * vm_offset_t src; - * vm_offset_t dst; + * addr64_t src; + * addr64_t dst; * int bytecount * * This routine will copy bytecount bytes from physical address src to physical - * address dst. + * address dst. 
It runs in 64-bit mode if necessary, but does not handle + * overlap or make any attempt to be optimal. Length must be a signed word. + * Not performance critical. */ -ENTRY(phys_copy, TAG_NO_FRAME_USED) - - /* Switch off data translations */ - mfmsr r6 - rlwinm r6,r6,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off - rlwinm r6,r6,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off - rlwinm r7, r6, 0, MSR_DR_BIT+1, MSR_DR_BIT-1 - rlwinm r7, r7, 0, MSR_EE_BIT+1, MSR_EE_BIT-1 - mtmsr r7 - isync /* Ensure data translations are off */ - - subi r3, r3, 4 - subi r4, r4, 4 - - cmpwi r5, 3 - ble- .L_phys_copy_bytes -.L_phys_copy_loop: - lwz r0, 4(r3) - addi r3, r3, 4 - subi r5, r5, 4 - stw r0, 4(r4) - addi r4, r4, 4 - cmpwi r5, 3 - bgt+ .L_phys_copy_loop - - /* If no leftover bytes, we're done now */ - cmpwi r5, 0 - beq+ .L_phys_copy_done - -.L_phys_copy_bytes: - addi r3, r3, 3 - addi r4, r4, 3 -.L_phys_copy_byte_loop: - lbz r0, 1(r3) - addi r3, r3, 1 - subi r5, r5, 1 - stb r0, 1(r4) - addi r4, r4, 1 - cmpwi r5, 0 - bne+ .L_phys_copy_byte_loop - -.L_phys_copy_done: - mtmsr r6 /* Restore original translations */ - isync /* Ensure data translations are off */ - - blr + .align 5 + .globl EXT(phys_copy) + +LEXT(phys_copy) + + rlwinm r3,r3,0,1,0 ; Duplicate high half of long long paddr into top of reg + mflr r12 // get return address + rlwimi r3,r4,0,0,31 ; Combine bottom of long long to full 64-bits + rlwinm r4,r5,0,1,0 ; Duplicate high half of long long paddr into top of reg + bl EXT(ml_set_physical_disabled) // turn DR and EE off, SF on, get features in r10 + rlwimi r4,r6,0,0,31 ; Combine bottom of long long to full 64-bits + mtlr r12 // restore return address + subic. r5,r7,4 // a word to copy? + b phys_copy_2 + + .align 5 + +phys_copy_1: // loop copying words + subic. r5,r5,4 // more to go? + lwz r0,0(r3) + addi r3,r3,4 + stw r0,0(r4) + addi r4,r4,4 +phys_copy_2: + bge phys_copy_1 + addic. r5,r5,4 // restore count + ble phys_copy_4 // no more + + // Loop is aligned here + +phys_copy_3: // loop copying bytes + subic. r5,r5,1 // more to go? + lbz r0,0(r3) + addi r3,r3,1 + stb r0,0(r4) + addi r4,r4,1 + bgt phys_copy_3 +phys_copy_4: + b EXT(ml_restore) // restore MSR and do the isync + + +//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> /* void * pmap_copy_page(src, dst) - * vm_offset_t src; - * vm_offset_t dst; + * ppnum_t src; + * ppnum_t dst; * * This routine will copy the physical page src to physical page dst * - * This routine assumes that the src and dst are page aligned and that the - * destination is cached. - * - * We also must assume that noone will be executing within the destination - * page. We also assume that this will be used for paging + * This routine assumes that the src and dst are page numbers and that the + * destination is cached. It runs on 32 and 64 bit processors, with and + * without altivec, and with 32 and 128 byte cache lines. + * We also must assume that no-one will be executing within the destination + * page, and that this will be used for paging. Because this + * is a common routine, we have tuned loops for each processor class. 
* */ +#define kSFSize (FM_SIZE+160) -#if DEBUG - /* if debug, we have a little piece of C around this - * in pmap.c that gives some trace ability - */ -ENTRY(pmap_copy_page_assembler, TAG_NO_FRAME_USED) -#else ENTRY(pmap_copy_page, TAG_NO_FRAME_USED) -#endif /* DEBUG */ - -#if 0 - mfpvr r9 ; Get the PVR - rlwinm r9,r9,16,16,31 ; Isolate the PPC processor - cmplwi r9,PROCESSOR_VERSION_Max ; Do we have Altivec? - beq+ wegotaltivec ; Yeah... -#endif - - mfmsr r9 ; Get the MSR - rlwinm r9,r9,0,MSR_FP_BIT+1,MSR_FP_BIT-1 ; Force floating point off - rlwinm r9,r9,0,MSR_VEC_BIT+1,MSR_VEC_BIT-1 ; Force vectors off - stwu r1,-(FM_SIZE+32)(r1) ; Make a frame for us - rlwinm r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; Disable interruptions - ori r7,r7,lo16(MASK(MSR_FP)) ; Turn on the FPU - mtmsr r7 ; Disable rupts and enable FPU - isync - - stfd f0,FM_SIZE+0(r1) ; Save an FP register - rlwinm r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1 ; Clear the DDAT bit - stfd f1,FM_SIZE+8(r1) ; Save an FP register - addi r6,r3,PPC_PGBYTES ; Point to the start of the next page - stfd f2,FM_SIZE+16(r1) ; Save an FP register - mr r8,r4 ; Save the destination - stfd f3,FM_SIZE+24(r1) ; Save an FP register - - mtmsr r7 ; Set the new MSR - isync ; Ensure data translations are off - - dcbt br0, r3 /* Start in first input line */ - li r5, CACHE_LINE_SIZE /* Get the line size */ - -.L_pmap_copy_page_loop: - dcbz 0, r4 /* Allocate a line for the output */ - lfd f0, 0(r3) /* Get first 8 */ - lfd f1, 8(r3) /* Get second 8 */ - lfd f2, 16(r3) /* Get third 8 */ - stfd f0, 0(r4) /* Put first 8 */ - dcbt r5, r3 /* Start next line coming in */ - lfd f3, 24(r3) /* Get fourth 8 */ - stfd f1, 8(r4) /* Put second 8 */ - addi r3,r3,CACHE_LINE_SIZE /* Point to the next line in */ - stfd f2, 16(r4) /* Put third 8 */ - cmplw cr0,r3,r6 /* See if we're finished yet */ - stfd f3, 24(r4) /* Put fourth 8 */ - dcbst br0,r4 /* Force it out */ - addi r4,r4,CACHE_LINE_SIZE /* Point to the next line out */ - blt+ .L_pmap_copy_page_loop /* Copy the whole page */ - - sync /* Make sure they're all done */ - li r4,PPC_PGBYTES-CACHE_LINE_SIZE /* Point to the end of the page */ - -invalinst: - subic. r5,r4,CACHE_LINE_SIZE /* Point to the next one */ - icbi r4, r8 /* Trash the i-cache */ - subi r4,r5,CACHE_LINE_SIZE /* Point to the next one */ - icbi r5, r8 /* Trash the i-cache */ - bgt+ invalinst /* Keep going until we do the page... */ - - rlwimi r7,r9,0,MSR_DR_BIT,MSR_DR_BIT ; Set DDAT if on - sync ; Make sure all invalidates done - - mtmsr r7 ; Set DDAT correctly - isync - - lfd f0,FM_SIZE+0(r1) ; Restore an FP register - lfd f1,FM_SIZE+8(r1) ; Restore an FP register - lfd f2,FM_SIZE+16(r1) ; Restore an FP register - lfd f3,FM_SIZE+24(r1) ; Restore an FP register - - lwz r1,0(r1) ; Pop up the stack - - mtmsr r9 ; Turn off FPU now and maybe rupts back on - isync - blr - -#if 0 -; -; This is not very optimal. We just do it here for a test of -; Altivec in the kernel. 
-; -wegotaltivec: - mfmsr r9 ; Get the MSR - lis r8,hi16(0xC0000000) ; Make sure we keep the first 2 vector registers - rlwinm r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; Disable interruptions - lis r6,lo16(2*256+128) ; Specify 128 blocks of 2 vectors each - rlwinm r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1 ; Clear the DDAT bit - ori r6,r6,32 ; Set a 32-byte stride - mtsprg 256,r8 ; Set VRSave - mtmsr r7 ; Disable rupts and turn xlate off - isync - - addi r11,r3,4096 ; Point to the next page - li r10,16 ; Get vector size - -avmovepg: lvxl v0,br0,r3 ; Get first half of line - dcba br0,r4 ; Allocate output - lvxl v1,r10,r3 ; Get second half of line - stvxl v0,br0,r4 ; Save first half of line - addi r3,r3,32 ; Point to the next line - icbi br0,r4 ; Make the icache go away also - stvxl v1,r10,r4 ; Save second half of line - cmplw r3,r11 ; Have we reached the next page? - dcbst br0,r4 ; Make sure the line is on its way out - addi r4,r4,32 ; Point to the next line - blt+ avmovepg ; Move the next line... - - li r8,0 ; Clear this - sync ; Make sure all the memory stuff is done - mtsprg 256,r8 ; Show we are not using VRs any more - mtmsr r9 ; Translation and interruptions back on - isync - blr -#endif - - - - -/* - * int - * copyin(src, dst, count) - * vm_offset_t src; - * vm_offset_t dst; - * int count; - * - */ -ENTRY2(copyin, copyinmsg, TAG_NO_FRAME_USED) - -/* Preamble allowing us to call a sub-function */ - mflr r0 - stw r0,FM_LR_SAVE(r1) - stwu r1,-(FM_SIZE+16)(r1) - - cmpli cr0,r5,0 - ble- cr0,.L_copyinout_trivial - -/* we know we have a valid copyin to do now */ -/* Set up thread_recover in case we hit an illegal address */ - - mfsprg r8,1 /* Get the current act */ - lwz r10,ACT_THREAD(r8) - lis r11,hi16(.L_copyinout_error) - lwz r8,ACT_VMMAP(r8) - ori r11,r11,lo16(.L_copyinout_error) - add r9,r3,r5 /* Get the end of the source */ - lwz r8,VMMAP_PMAP(r8) ; Get the pmap - rlwinm r12,r3,6,26,29 ; Get index to the segment slot - subi r9,r9,1 /* Make sure we don't go too far */ - add r8,r8,r12 ; Start indexing to the segment value - stw r11,THREAD_RECOVER(r10) - xor r9,r9,r3 /* Smoosh 'em together */ - lwz r8,PMAP_SEGS(r8) ; Get the source SR value - rlwinm. r9,r9,0,1,3 /* Top nybble equal? */ - mtsr SR_COPYIN,r8 ; Set the SR - isync -#if 0 - lis r0,HIGH_ADDR(EXT(dbgRegsCall)) /* (TEST/DEBUG) */ - ori r0,r0,LOW_ADDR(EXT(dbgRegsCall)) /* (TEST/DEBUG) */ - sc /* (TEST/DEBUG) */ -#endif - -/* For optimization, we check if the copyin lies on a segment - * boundary. If it doesn't, we can use a simple copy. If it - * does, we split it into two separate copies in some C code. - */ - - bne- .L_call_copyin_multiple /* Nope, we went past the segment boundary... */ - - rlwinm r3,r3,0,4,31 - oris r3,r3,(SR_COPYIN_NUM << (28-16)) /* Set the copyin segment as the source */ - - bl EXT(bcopy) + lis r2,hi16(MASK(MSR_VEC)) ; Get the vector flag + mflr r0 // get return + ori r2,r2,lo16(MASK(MSR_FP)) ; Add the FP flag + stw r0,8(r1) // save + stwu r1,-kSFSize(r1) // set up a stack frame for VRs or FPRs + mfmsr r11 // save MSR at entry + mfsprg r10,2 // get feature flags + andc r11,r11,r2 // Clear out vec and fp + ori r2,r2,lo16(MASK(MSR_EE)) // Get EE on also + andc r2,r11,r2 // Clear out EE as well + mtcrf 0x02,r10 // we need to test pf64Bit + ori r2,r2,MASK(MSR_FP) // must enable FP for G3... 
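// Note on the MSR images being assembled here: r11 is the caller's MSR with FP and VEC
// stripped, r2 becomes that MSR with EE cleared and FP/VEC enabled for the copy loops,
// and a third image with DR cleared (r12) is built further down and installed only after
// the FPRs or VRs have been saved on the stack (see the later mtmsr of r12).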
+ mtcrf 0x80,r10 // we need to test pfAltivec too + oris r2,r2,hi16(MASK(MSR_VEC)) // enable altivec for G4 (ignored if G3) + mtmsr r2 // turn EE off, FP and VEC on + isync + bt++ pf64Bitb,pmap_copy_64 // skip if 64-bit processor (only they take hint) + slwi r3,r3,12 // get page address from page num + slwi r4,r4,12 // get page address from page num + rlwinm r12,r2,0,MSR_DR_BIT+1,MSR_DR_BIT-1 // get ready to turn off DR + bt pfAltivecb,pmap_copy_g4 // altivec but not 64-bit means G4 + + + // G3 -- copy using FPRs + + stfd f0,FM_SIZE+0(r1) // save the 4 FPRs we use to copy + stfd f1,FM_SIZE+8(r1) + li r5,PPC_PGBYTES/32 // count of cache lines in a page + stfd f2,FM_SIZE+16(r1) + mtctr r5 + stfd f3,FM_SIZE+24(r1) + mtmsr r12 // turn off DR after saving FPRs on stack + isync + +pmap_g3_copy_loop: // loop over 32-byte cache lines + dcbz 0,r4 // avoid read of dest line + lfd f0,0(r3) + lfd f1,8(r3) + lfd f2,16(r3) + lfd f3,24(r3) + addi r3,r3,32 + stfd f0,0(r4) + stfd f1,8(r4) + stfd f2,16(r4) + stfd f3,24(r4) + dcbst 0,r4 // flush dest line to RAM + addi r4,r4,32 + bdnz pmap_g3_copy_loop + + sync // wait for stores to take + subi r4,r4,PPC_PGBYTES // restore ptr to destintation page + li r6,PPC_PGBYTES-32 // point to last line in page +pmap_g3_icache_flush: + subic. r5,r6,32 // more to go? + icbi r4,r6 // flush another line in icache + subi r6,r5,32 // get offset to next line + icbi r4,r5 + bne pmap_g3_icache_flush + + sync + mtmsr r2 // turn DR back on + isync + lfd f0,FM_SIZE+0(r1) // restore the FPRs + lfd f1,FM_SIZE+8(r1) + lfd f2,FM_SIZE+16(r1) + lfd f3,FM_SIZE+24(r1) + + b pmap_g4_restore // restore MSR and done + + + // G4 -- copy using VRs + +pmap_copy_g4: // r2=(MSR-EE), r12=(r2-DR), r10=features, r11=old MSR + la r9,FM_SIZE+16(r1) // place where we save VRs to r9 + li r5,16 // load x-form offsets into r5-r9 + li r6,32 // another offset + stvx v0,0,r9 // save some VRs so we can use to copy + li r7,48 // another offset + stvx v1,r5,r9 + li r0,PPC_PGBYTES/64 // we loop over 64-byte chunks + stvx v2,r6,r9 + mtctr r0 + li r8,96 // get look-ahead for touch + stvx v3,r7,r9 + li r9,128 + mtmsr r12 // now we've saved VRs on stack, turn off DR + isync // wait for it to happen + b pmap_g4_copy_loop + + .align 5 // align inner loops +pmap_g4_copy_loop: // loop over 64-byte chunks + dcbt r3,r8 // touch 3 lines ahead + nop // avoid a 17-word loop... + dcbt r3,r9 // touch 4 lines ahead + nop // more padding + dcba 0,r4 // avoid pre-fetch of 1st dest line + lvx v0,0,r3 // offset 0 + lvx v1,r5,r3 // offset 16 + lvx v2,r6,r3 // offset 32 + lvx v3,r7,r3 // offset 48 + addi r3,r3,64 + dcba r6,r4 // avoid pre-fetch of 2nd line + stvx v0,0,r4 // offset 0 + stvx v1,r5,r4 // offset 16 + stvx v2,r6,r4 // offset 32 + stvx v3,r7,r4 // offset 48 + dcbf 0,r4 // push line 1 + dcbf r6,r4 // and line 2 + addi r4,r4,64 + bdnz pmap_g4_copy_loop + + sync // wait for stores to take + subi r4,r4,PPC_PGBYTES // restore ptr to destintation page + li r8,PPC_PGBYTES-32 // point to last line in page +pmap_g4_icache_flush: + subic. r9,r8,32 // more to go? 
+ icbi r4,r8 // flush from icache + subi r8,r9,32 // get offset to next line + icbi r4,r9 + bne pmap_g4_icache_flush + + sync + mtmsr r2 // turn DR back on + isync + la r9,FM_SIZE+16(r1) // get base of VR save area + lvx v0,0,r9 // restore the VRs + lvx v1,r5,r9 + lvx v2,r6,r9 + lvx v3,r7,r9 + +pmap_g4_restore: // r11=MSR + mtmsr r11 // turn EE on, VEC and FR off + isync // wait for it to happen + addi r1,r1,kSFSize // pop off our stack frame + lwz r0,8(r1) // restore return address + mtlr r0 + blr + + + // 64-bit/128-byte processor: copy using VRs + +pmap_copy_64: // r10=features, r11=old MSR + sldi r3,r3,12 // get page address from page num + sldi r4,r4,12 // get page address from page num + la r9,FM_SIZE+16(r1) // get base of VR save area + li r5,16 // load x-form offsets into r5-r9 + li r6,32 // another offset + bf pfAltivecb,pmap_novmx_copy // altivec suppressed... + stvx v0,0,r9 // save 8 VRs so we can copy wo bubbles + stvx v1,r5,r9 + li r7,48 // another offset + li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks + stvx v2,r6,r9 + stvx v3,r7,r9 + addi r9,r9,64 // advance base ptr so we can store another 4 + mtctr r0 + li r0,MASK(MSR_DR) // get DR bit + stvx v4,0,r9 + stvx v5,r5,r9 + andc r12,r2,r0 // turn off DR bit + li r0,1 // get a 1 to slam into SF + stvx v6,r6,r9 + stvx v7,r7,r9 + rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0) + li r8,-128 // offset so we can reach back one line + mtmsrd r12 // now we've saved VRs, turn DR off and SF on + isync // wait for it to happen + dcbt128 0,r3,1 // start a forward stream + b pmap_64_copy_loop + + .align 5 // align inner loops +pmap_64_copy_loop: // loop over 128-byte chunks + dcbz128 0,r4 // avoid read of destination line + lvx v0,0,r3 // offset 0 + lvx v1,r5,r3 // offset 16 + lvx v2,r6,r3 // offset 32 + lvx v3,r7,r3 // offset 48 + addi r3,r3,64 // don't have enough GPRs so add 64 2x + lvx v4,0,r3 // offset 64 + lvx v5,r5,r3 // offset 80 + lvx v6,r6,r3 // offset 96 + lvx v7,r7,r3 // offset 112 + addi r3,r3,64 + stvx v0,0,r4 // offset 0 + stvx v1,r5,r4 // offset 16 + stvx v2,r6,r4 // offset 32 + stvx v3,r7,r4 // offset 48 + addi r4,r4,64 + stvx v4,0,r4 // offset 64 + stvx v5,r5,r4 // offset 80 + stvx v6,r6,r4 // offset 96 + stvx v7,r7,r4 // offset 112 + addi r4,r4,64 + dcbf r8,r4 // flush the line we just wrote + bdnz pmap_64_copy_loop + + sync // wait for stores to take + subi r4,r4,PPC_PGBYTES // restore ptr to destintation page + li r8,PPC_PGBYTES-128 // point to last line in page +pmap_64_icache_flush: + subic. r9,r8,128 // more to go? 
+ icbi r4,r8 // flush from icache + subi r8,r9,128 // get offset to next line + icbi r4,r9 + bne pmap_64_icache_flush + + sync + mtmsrd r2 // turn DR back on, SF off + isync + la r9,FM_SIZE+16(r1) // get base address of VR save area on stack + lvx v0,0,r9 // restore the VRs + lvx v1,r5,r9 + lvx v2,r6,r9 + lvx v3,r7,r9 + addi r9,r9,64 + lvx v4,0,r9 + lvx v5,r5,r9 + lvx v6,r6,r9 + lvx v7,r7,r9 + + b pmap_g4_restore // restore lower half of MSR and return + + // + // Copy on 64-bit without VMX + // + +pmap_novmx_copy: + li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks + mtctr r0 + li r0,MASK(MSR_DR) // get DR bit + andc r12,r2,r0 // turn off DR bit + li r0,1 // get a 1 to slam into SF + rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0) + mtmsrd r12 // now we've saved VRs, turn DR off and SF on + isync // wait for it to happen + dcbt128 0,r3,1 // start a forward stream + +pmap_novmx_copy_loop: // loop over 128-byte cache lines + dcbz128 0,r4 // avoid read of dest line + + ld r0,0(r3) // Load half a line + ld r12,8(r3) + ld r5,16(r3) + ld r6,24(r3) + ld r7,32(r3) + ld r8,40(r3) + ld r9,48(r3) + ld r10,56(r3) + + std r0,0(r4) // Store half a line + std r12,8(r4) + std r5,16(r4) + std r6,24(r4) + std r7,32(r4) + std r8,40(r4) + std r9,48(r4) + std r10,56(r4) + + ld r0,64(r3) // Load half a line + ld r12,72(r3) + ld r5,80(r3) + ld r6,88(r3) + ld r7,96(r3) + ld r8,104(r3) + ld r9,112(r3) + ld r10,120(r3) + + addi r3,r3,128 + + std r0,64(r4) // Store half a line + std r12,72(r4) + std r5,80(r4) + std r6,88(r4) + std r7,96(r4) + std r8,104(r4) + std r9,112(r4) + std r10,120(r4) + + dcbf 0,r4 // flush the line we just wrote + addi r4,r4,128 + bdnz pmap_novmx_copy_loop + + sync // wait for stores to take + subi r4,r4,PPC_PGBYTES // restore ptr to destintation page + li r8,PPC_PGBYTES-128 // point to last line in page + +pmap_novmx_icache_flush: + subic. r9,r8,128 // more to go? + icbi r4,r8 // flush from icache + subi r8,r9,128 // get offset to next line + icbi r4,r9 + bne pmap_novmx_icache_flush + + sync + mtmsrd r2 // turn DR back on, SF off + isync + + b pmap_g4_restore // restore lower half of MSR and return + + + +//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> -/* Now that copyin is done, we don't need a recovery point */ - - addi r1,r1,FM_SIZE+16 - mfsprg r6,1 /* Get the current act */ - lwz r10,ACT_THREAD(r6) - li r3,0 - lwz r0,FM_LR_SAVE(r1) - stw r3,THREAD_RECOVER(r10) /* Clear recovery */ - mtlr r0 - blr - -/* we get here via the exception handler if an illegal - * user memory reference was made. - */ -.L_copyinout_error: - -/* Now that copyin is done, we don't need a recovery point */ - - mfsprg r6,1 /* Get the current act */ - addi r1,r1,FM_SIZE+16 - lwz r10,ACT_THREAD(r6) - li r4,0 - lwz r0,FM_LR_SAVE(r1) - stw r4,THREAD_RECOVER(r10) /* Clear recovery */ - mtlr r0 - li r3,EFAULT ; Indicate error (EFAULT) - blr - -.L_copyinout_trivial: - /* The copyin/out was for either 0 bytes or a negative - * number of bytes, return an appropriate value (0 == SUCCESS). - * cr0 still contains result of comparison of len with 0. - */ - li r3, 0 - beq+ cr0, .L_copyinout_negative - li r3, 1 -.L_copyinout_negative: - - /* unwind the stack */ - addi r1, r1, FM_SIZE+16 - lwz r0, FM_LR_SAVE(r1) - mtlr r0 - - blr - -.L_call_copyin_multiple: - - /* unwind the stack */ - addi r1, r1, FM_SIZE+16 - lwz r0, FM_LR_SAVE(r1) - mtlr r0 - - b EXT(copyin_multiple) /* not a call - a jump! */ - +// Stack frame format used by copyin, copyout, copyinstr and copyoutstr. 
+// These routines all run both on 32 and 64-bit machines, though because they are called +// by the BSD kernel they are always in 32-bit mode when entered. The mapped ptr returned +// by MapUserAddressSpace will be 64 bits however on 64-bit machines. Beware to avoid +// using compare instructions on this ptr. This mapped ptr is kept globally in r31, so there +// is no need to store or load it, which are mode-dependent operations since it could be +// 32 or 64 bits. + +#define kkFrameSize (FM_SIZE+32) + +#define kkBufSize (FM_SIZE+0) +#define kkCR (FM_SIZE+4) +#define kkSource (FM_SIZE+8) +#define kkDest (FM_SIZE+12) +#define kkCountPtr (FM_SIZE+16) +#define kkR31Save (FM_SIZE+20) + + +// nonvolatile CR bits we use as flags in cr3 + +#define kk64bit 12 +#define kkNull 13 +#define kkIn 14 +#define kkString 15 +#define kkZero 15 + + +//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> /* * int - * copyout(src, dst, count) + * copyoutstr(src, dst, maxcount, count) * vm_offset_t src; * vm_offset_t dst; - * int count; + * vm_size_t maxcount; + * vm_size_t* count; * + * Set *count to the number of bytes copied. */ -ENTRY2(copyout, copyoutmsg, TAG_NO_FRAME_USED) +ENTRY(copyoutstr, TAG_NO_FRAME_USED) + mfcr r2 // we use nonvolatile cr3 + li r0,0 + crset kkString // flag as a string op + mr r10,r4 // for copyout, dest ptr (r4) is in user space + stw r0,0(r6) // initialize #bytes moved + crclr kkIn // flag as copyout + b copyJoin -/* Preamble allowing us to call a sub-function */ - - mflr r0 - stw r0,FM_LR_SAVE(r1) - stwu r1,-(FM_SIZE+16)(r1) - -#if 0 - stw r3,FM_SIZE+0(r1) /* (TEST/DEBUG) */ - stw r4,FM_SIZE+4(r1) /* (TEST/DEBUG) */ - stw r5,FM_SIZE+8(r1) /* (TEST/DEBUG) */ - mr r6,r0 /* (TEST/DEBUG) */ - - bl EXT(tracecopyout) /* (TEST/DEBUG) */ - - lwz r3,FM_SIZE+0(r1) /* (TEST/DEBUG) */ - lwz r4,FM_SIZE+4(r1) /* (TEST/DEBUG) */ - lwz r5,FM_SIZE+8(r1) /* (TEST/DEBUG) */ -#endif - - cmpli cr0,r5,0 - ble- cr0,.L_copyinout_trivial -/* we know we have a valid copyout to do now */ -/* Set up thread_recover in case we hit an illegal address */ - - - mfsprg r8,1 /* Get the current act */ - lwz r10,ACT_THREAD(r8) - lis r11,HIGH_ADDR(.L_copyinout_error) - lwz r8,ACT_VMMAP(r8) - rlwinm r12,r4,6,26,29 ; Get index to the segment slot - ori r11,r11,LOW_ADDR(.L_copyinout_error) - add r9,r4,r5 /* Get the end of the destination */ - lwz r8,VMMAP_PMAP(r8) - subi r9,r9,1 /* Make sure we don't go too far */ - add r8,r8,r12 ; Start indexing to the segment value - stw r11,THREAD_RECOVER(r10) - xor r9,r9,r4 /* Smoosh 'em together */ - lwz r8,PMAP_SEGS(r8) ; Get the source SR value - rlwinm. r9,r9,0,1,3 /* Top nybble equal? */ - mtsr SR_COPYIN,r8 - isync - - -/* For optimisation, we check if the copyout lies on a segment - * boundary. If it doesn't, we can use a simple copy. If it - * does, we split it into two separate copies in some C code. - */ - - bne- .L_call_copyout_multiple /* Nope, we went past the segment boundary... */ - - rlwinm r4,r4,0,4,31 - oris r4,r4,(SR_COPYIN_NUM << (28-16)) /* Set the copyin segment as the source */ - - bl EXT(bcopy) - -/* Now that copyout is done, we don't need a recovery point */ - mfsprg r6,1 /* Get the current act */ - addi r1,r1,FM_SIZE+16 - lwz r10,ACT_THREAD(r6) - li r3,0 - lwz r0,FM_LR_SAVE(r1) - stw r3,THREAD_RECOVER(r10) /* Clear recovery */ - mtlr r0 - blr - -.L_call_copyout_multiple: - /* unwind the stack */ - addi r1, r1, FM_SIZE+16 - lwz r0, FM_LR_SAVE(r1) - mtlr r0 - - b EXT(copyout_multiple) /* not a call - a jump! 
*/ +//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> /* - * boolean_t - * copyinstr(src, dst, count, maxcount) + * int + * copyinstr(src, dst, maxcount, count) * vm_offset_t src; * vm_offset_t dst; * vm_size_t maxcount; * vm_size_t* count; * * Set *count to the number of bytes copied - * * If dst == NULL, don't copy, just count bytes. * Only currently called from klcopyinstr. */ ENTRY(copyinstr, TAG_NO_FRAME_USED) - -/* Preamble allowing us to call a sub-function */ - mflr r0 - stw r0,FM_LR_SAVE(r1) - stwu r1,-(FM_SIZE+16)(r1) - -#if 0 - stw r3,FM_SIZE+0(r1) /* (TEST/DEBUG) */ - stw r4,FM_SIZE+4(r1) /* (TEST/DEBUG) */ - stw r5,FM_SIZE+8(r1) /* (TEST/DEBUG) */ - stw r6,FM_SIZE+12(r1) /* (TEST/DEBUG) */ - mr r7,r0 /* (TEST/DEBUG) */ - - bl EXT(tracecopystr) /* (TEST/DEBUG) */ - - lwz r3,FM_SIZE+0(r1) /* (TEST/DEBUG) */ - lwz r4,FM_SIZE+4(r1) /* (TEST/DEBUG) */ - lwz r5,FM_SIZE+8(r1) /* (TEST/DEBUG) */ - stw r6,FM_SIZE+12(r1) /* (TEST/DEBUG) */ -#endif - - cmpli cr0,r5,0 - ble- cr0,.L_copyinout_trivial - -/* we know we have a valid copyin to do now */ -/* Set up thread_recover in case we hit an illegal address */ - - li r0,0 - mfsprg r8,1 /* Get the current act */ - lwz r10,ACT_THREAD(r8) - stw r0,0(r6) /* Clear result length */ - lis r11,HIGH_ADDR(.L_copyinout_error) - lwz r8,ACT_VMMAP(r8) ; Get the map for this activation - rlwinm r12,r3,6,26,29 ; Get index to the segment slot - lwz r8,VMMAP_PMAP(r8) - ori r11,r11,LOW_ADDR(.L_copyinout_error) - add r8,r8,r12 ; Start indexing to the segment value - stw r11,THREAD_RECOVER(r10) - rlwinm r3,r3,0,4,31 - lwz r7,PMAP_SEGS(r8) ; Get the source SR value - oris r3,r3,(SR_COPYIN_NUM << (28-16)) /* Set the copyin segment as the source */ - -/* Copy byte by byte for now - TODO NMGS speed this up with - * some clever (but fairly standard) logic for word copies. - * We don't use a copyinstr_multiple since copyinstr is called - * with INT_MAX in the linux server. Eugh. + mfcr r2 // we use nonvolatile cr3 + cmplwi r4,0 // dst==NULL? + li r0,0 + crset kkString // flag as a string op + mr r10,r3 // for copyin, source ptr (r3) is in user space + crmove kkNull,cr0_eq // remember if (dst==NULL) + stw r0,0(r6) // initialize #bytes moved + crset kkIn // flag as copyin (rather than copyout) + b copyJoin1 // skip over the "crclr kkNull" + + +//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +/* + * int + * copyout(src, dst, count) + * vm_offset_t src; + * vm_offset_t dst; + * size_t count; */ - li r9,0 /* Clear byte counter */ - -/* If the destination is NULL, don't do writes, - * just count bytes. 
We set CR7 outside the loop to save time + .align 5 + .globl EXT(copyout) + .globl EXT(copyoutmsg) + +LEXT(copyout) +LEXT(copyoutmsg) + +#if INSTRUMENT + mfspr r12,pmc1 ; INSTRUMENT - saveinstr[12] - Take stamp at copyout + stw r12,0x6100+(12*16)+0x0(0) ; INSTRUMENT - Save it + mfspr r12,pmc2 ; INSTRUMENT - Get stamp + stw r12,0x6100+(12*16)+0x4(0) ; INSTRUMENT - Save it + mfspr r12,pmc3 ; INSTRUMENT - Get stamp + stw r12,0x6100+(12*16)+0x8(0) ; INSTRUMENT - Save it + mfspr r12,pmc4 ; INSTRUMENT - Get stamp + stw r12,0x6100+(12*16)+0xC(0) ; INSTRUMENT - Save it +#endif + mfcr r2 // save caller's CR + crclr kkString // not a string version + mr r10,r4 // dest (r4) is user-space ptr + crclr kkIn // flag as copyout + b copyJoin + + +//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +/* + * int + * copyin(src, dst, count) + * vm_offset_t src; + * vm_offset_t dst; + * size_t count; */ - cmpwi cr7,r4,0 /* Is the destination null? */ - -nxtseg: mtsr SR_COPYIN,r7 /* Set the source SR */ - isync -.L_copyinstr_loop: - lbz r0,0(r3) /* Get the source */ - addic. r5,r5,-1 /* Have we gone far enough? */ - addi r3,r3,1 /* Bump source pointer */ - - cmpwi cr1,r0,0 /* Did we hit a null? */ - beq cr7,.L_copyinstr_no_store /* If we are just counting, skip the store... */ - - stb r0,0(r4) /* Move to sink */ - addi r4,r4,1 /* Advance sink pointer */ + .align 5 + .globl EXT(copyin) + .globl EXT(copyinmsg) + +LEXT(copyin) +LEXT(copyinmsg) + + mfcr r2 // save caller's CR + crclr kkString // not a string version + mr r10,r3 // source (r3) is user-space ptr in copyin + crset kkIn // flag as copyin + + +// Common code to handle setup for all the copy variants: +// r2 = caller's CR, since we use cr3 +// r3-r6 = parameters +// r10 = user-space ptr (r3 if copyin, r4 if copyout) +// cr3 = kkIn, kkString, kkNull flags + +copyJoin: + crclr kkNull // (dst==NULL) convention not used with this call +copyJoin1: // enter from copyinstr with kkNull set + mflr r0 // get return address + cmplwi r5,0 // buffer length 0? + lis r9,0x1000 // r9 <- 0x10000000 (256MB) + stw r0,FM_LR_SAVE(r1) // save return + cmplw cr1,r5,r9 // buffer length > 256MB ? + mfsprg r8,2 // get the features + beq-- copyinout_0 // 0 length is degenerate case + stwu r1,-kkFrameSize(r1) // set up stack frame + stw r2,kkCR(r1) // save caller's CR since we use cr3 + mtcrf 0x02,r8 // move pf64Bit to cr6 + stw r3,kkSource(r1) // save args across MapUserAddressSpace + stw r4,kkDest(r1) + stw r5,kkBufSize(r1) + crmove kk64bit,pf64Bitb // remember if this is a 64-bit processor + stw r6,kkCountPtr(r1) + stw r31,kkR31Save(r1) // we use r31 globally for mapped user ptr + li r31,0 // no mapped ptr yet + + +// Handle buffer length > 256MB. This is an error (ENAMETOOLONG) on copyin and copyout. +// The string ops are passed -1 lengths by some BSD callers, so for them we silently clamp +// the buffer length to 256MB. This isn't an issue if the string is less than 256MB +// (as most are!), but if they are >256MB we eventually return ENAMETOOLONG. This restriction +// is due to MapUserAddressSpace; we don't want to consume more than two segments for +// the mapping. + + ble++ cr1,copyin0 // skip if buffer length <= 256MB + bf kkString,copyinout_too_big // error if not string op + mr r5,r9 // silently clamp buffer length to 256MB + stw r9,kkBufSize(r1) // update saved copy too + + +// Set up thread_recover in case we hit an illegal address. 
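As background for the recovery protocol armed below: the copy engines store the address of
copyinout_error in the thread's THREAD_RECOVER field before touching user memory, and the
exception handler resumes there if a DSI is taken on the user access. The effect is roughly
the following C sketch, which uses setjmp/longjmp as a stand-in for the trap-handler
redirection; every name in the sketch is illustrative, not the kernel's.

    #include <errno.h>
    #include <setjmp.h>
    #include <string.h>

    /* Illustrative stand-ins for the per-thread recovery slot; the kernel keeps
     * this in THREAD_RECOVER and the exception handler consults it on a DSI. */
    static jmp_buf recover_point;           /* per-thread in the real kernel */
    static int     recover_armed;

    /* What the exception handler conceptually does for a faulting user access. */
    static void fault_redirect(void)
    {
        if (recover_armed)
            longjmp(recover_point, 1);      /* resume at the armed recovery point */
        /* otherwise a genuine kernel fault: panic */
    }

    /* Shape of copyin with recovery armed around the user-memory access. */
    static int copyin_sketch(const void *user_src, void *kern_dst, size_t len)
    {
        if (setjmp(recover_point) != 0) {   /* reached only via fault_redirect()  */
            recover_armed = 0;
            return EFAULT;
        }
        recover_armed = 1;
        memcpy(kern_dst, user_src, len);    /* may fault on a bad user address    */
        recover_armed = 0;                  /* clear recovery on the way out      */
        return 0;
    }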
+ +copyin0: + mfsprg r8,1 /* Get the current act */ + lis r2,hi16(copyinout_error) + lwz r7,ACT_THREAD(r8) + ori r2,r2,lo16(copyinout_error) + lwz r3,ACT_VMMAP(r8) // r3 <- vm_map virtual address + stw r2,THREAD_RECOVER(r7) + + +// Map user segment into kernel map, turn on 64-bit mode. +// r3 = vm map +// r5 = buffer length +// r10 = user space ptr (r3 if copyin, r4 if copyout) + + mr r6,r5 // Set length to map + li r4,0 // Note: we only do this 32-bit for now + mr r5,r10 // arg2 <- user space ptr +#if INSTRUMENT + mfspr r12,pmc1 ; INSTRUMENT - saveinstr[13] - Take stamp before mapuseraddressspace + stw r12,0x6100+(13*16)+0x0(0) ; INSTRUMENT - Save it + mfspr r12,pmc2 ; INSTRUMENT - Get stamp + stw r12,0x6100+(13*16)+0x4(0) ; INSTRUMENT - Save it + mfspr r12,pmc3 ; INSTRUMENT - Get stamp + stw r12,0x6100+(13*16)+0x8(0) ; INSTRUMENT - Save it + mfspr r12,pmc4 ; INSTRUMENT - Get stamp + stw r12,0x6100+(13*16)+0xC(0) ; INSTRUMENT - Save it +#endif + bl EXT(MapUserAddressSpace) // set r3 <- address in kernel map of user operand +#if INSTRUMENT + mfspr r12,pmc1 ; INSTRUMENT - saveinstr[14] - Take stamp after mapuseraddressspace + stw r12,0x6100+(14*16)+0x0(0) ; INSTRUMENT - Save it + mfspr r12,pmc2 ; INSTRUMENT - Get stamp + stw r12,0x6100+(14*16)+0x4(0) ; INSTRUMENT - Save it + mfspr r12,pmc3 ; INSTRUMENT - Get stamp + stw r12,0x6100+(14*16)+0x8(0) ; INSTRUMENT - Save it + mfspr r12,pmc4 ; INSTRUMENT - Get stamp + stw r12,0x6100+(14*16)+0xC(0) ; INSTRUMENT - Save it +#endif + or. r0,r3,r4 // Did we fail the mapping? + mr r31,r4 // r31 <- mapped ptr into user space (may be 64-bit) + beq-- copyinout_error // was 0, so there was an error making the mapping + bf-- kk64bit,copyin1 // skip if a 32-bit processor + + rldimi r31,r3,32,0 // slam high-order bits into mapped ptr + mfmsr r4 // if 64-bit, turn on SF so we can use returned ptr + li r0,1 + rldimi r4,r0,63,MSR_SF_BIT // light bit 0 + mtmsrd r4 // turn on 64-bit mode + isync // wait for mode to change + + +// Load r3-r5, substituting mapped ptr as appropriate. + +copyin1: + lwz r5,kkBufSize(r1) // restore length to copy + bf kkIn,copyin2 // skip if copyout + lwz r4,kkDest(r1) // copyin: source is mapped, dest is r4 at entry + mr r3,r31 // source is mapped ptr + b copyin3 +copyin2: // handle copyout + lwz r3,kkSource(r1) // source is kernel buffer (r3 at entry) + mr r4,r31 // dest is mapped ptr into user space + + +// Finally, all set up to copy: +// r3 = source ptr (mapped if copyin) +// r4 = dest ptr (mapped if copyout) +// r5 = length +// r31 = mapped ptr returned by MapUserAddressSpace +// cr3 = kkIn, kkString, kk64bit, and kkNull flags + +copyin3: + bt kkString,copyString // handle copyinstr and copyoutstr + bl EXT(bcopy) // copyin and copyout: let bcopy do the work + li r3,0 // return success + + +// Main exit point for copyin, copyout, copyinstr, and copyoutstr. Also reached +// from error recovery if we get a DSI accessing user space. Clear recovery ptr, +// and pop off frame. Note that we have kept +// the mapped ptr into user space in r31, as a reg64_t type (ie, a 64-bit ptr on +// 64-bit machines.) We must unpack r31 into an addr64_t in (r3,r4) before passing +// it to ReleaseUserAddressSpace. 
+// r3 = 0, EFAULT, or ENAMETOOLONG + +copyinx: + lwz r2,kkCR(r1) // get callers cr3 + mfsprg r6,1 // Get the current act + lwz r10,ACT_THREAD(r6) + + bf-- kk64bit,copyinx1 // skip if 32-bit processor + mfmsr r12 + rldicl r12,r12,0,MSR_SF_BIT+1 // if 64-bit processor, turn 64-bit mode off + mtmsrd r12 // turn SF off and EE back on + isync // wait for the mode to change +copyinx1: + lwz r31,kkR31Save(r1) // restore callers r31 + addi r1,r1,kkFrameSize // pop off our stack frame + lwz r0,FM_LR_SAVE(r1) + li r4,0 + stw r4,THREAD_RECOVER(r10) // Clear recovery + mtlr r0 + mtcrf 0x10,r2 // restore cr3 + blr -.L_copyinstr_no_store: - addi r9,r9,1 /* Count the character */ - beq- cr1,.L_copyinstr_done /* We're done if we did a null... */ - beq- cr0,L_copyinstr_toobig /* Also if we maxed the count... */ - -/* Check to see if the copyin pointer has moved out of the - * copyin segment, if it has we must remap. +/* We get here via the exception handler if an illegal + * user memory reference was made. This error handler is used by + * copyin, copyout, copyinstr, and copyoutstr. Registers are as + * they were at point of fault, so for example cr3 flags are valid. */ - rlwinm. r0,r3,0,4,31 /* Did we wrap around to 0? */ - bne+ cr0,.L_copyinstr_loop /* Nope... */ - - lwz r7,PMAP_SEGS+4(r8) ; Get the next source SR value - addi r8,r8,4 ; Point to the next segment - oris r3,r0,(SR_COPYIN_NUM << (28-16)) /* Reset the segment number */ - b nxtseg /* Keep going... */ - -L_copyinstr_toobig: - li r3,ENAMETOOLONG - b L_copyinstr_return -.L_copyinstr_done: - li r3,0 /* Normal return */ -L_copyinstr_return: - li r4,0 /* to clear thread_recover */ - stw r9,0(r6) /* Set how many bytes we did */ - stw r4,THREAD_RECOVER(r10) /* Clear recovery exit */ - - addi r1, r1, FM_SIZE+16 - lwz r0, FM_LR_SAVE(r1) - mtlr r0 - blr +copyinout_error: + li r3,EFAULT // return error + b copyinx + +copyinout_0: // degenerate case: 0-length copy + mtcrf 0x10,r2 // restore cr3 + li r3,0 // return success + blr + +copyinout_too_big: // degenerate case + mtcrf 0x10,r2 // restore cr3 + lwz r1,0(r1) // pop off stack frame + li r3,ENAMETOOLONG + blr + + +//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><> +// Handle copyinstr and copyoutstr. At this point the stack frame is set up, +// the recovery ptr is set, the user's buffer is mapped, we're in 64-bit mode +// if necessary, and: +// r3 = source ptr, mapped if copyinstr +// r4 = dest ptr, mapped if copyoutstr +// r5 = buffer length +// r31 = mapped ptr returned by MapUserAddressSpace +// cr3 = kkIn, kkString, kkNull, and kk64bit flags +// We do word copies unless the buffer is very short, then use a byte copy loop +// for the leftovers if necessary. + +copyString: + li r12,0 // Set header bytes count to zero + cmplwi cr1,r5,20 // is buffer very short? + mtctr r5 // assuming short, set up loop count for bytes + blt cr1,copyinstr8 // too short for word loop + andi. r12,r3,0x3 // is source ptr word aligned? + bne copyinstr11 // bytes loop +copyinstr1: + srwi r6,r5,2 // get #words in buffer + mtctr r6 // set up word loop count + lis r10,hi16(0xFEFEFEFF) // load magic constants into r10 and r11 + lis r11,hi16(0x80808080) + ori r10,r10,lo16(0xFEFEFEFF) + ori r11,r11,lo16(0x80808080) + bf kkNull,copyinstr6 // enter loop that copies + b copyinstr5 // use loop that just counts + + +// Word loop(s). 
They do a word-parallel search for 0s, using the following
+// non-obvious but very efficient test (a C sketch of it appears after copyinstr10 below):
+//      y =  data + 0xFEFEFEFF
+//      z = ~data & 0x80808080
+// If (y & z)==0, then all bytes in the data word are nonzero.  We need two copies of
+// this loop, since if we test kkNull in the loop then it becomes 9 words long.
+
+        .align  5                           // align inner loops for speed
+copyinstr5:                                 // version that counts but does not copy
+        lwz     r8,0(r3)                    // get next word of source
+        addi    r3,r3,4                     // increment source ptr
+        add     r9,r10,r8                   // r9 =  data + 0xFEFEFEFF
+        andc    r7,r11,r8                   // r7 = ~data & 0x80808080
+        and.    r7,r9,r7                    // r7 = r9 & r7
+        bdnzt   cr0_eq,copyinstr5           // if r7==0, then all bytes are nonzero
+
+        b       copyinstr7
+
+        .align  5                           // align inner loops for speed
+copyinstr6:                                 // version that counts and copies
+        lwz     r8,0(r3)                    // get next word of source
+        addi    r3,r3,4                     // increment source ptr
+        addi    r4,r4,4                     // increment dest ptr while we wait for data
+        add     r9,r10,r8                   // r9 =  data + 0xFEFEFEFF
+        andc    r7,r11,r8                   // r7 = ~data & 0x80808080
+        and.    r7,r9,r7                    // r7 = r9 & r7
+        stw     r8,-4(r4)                   // pack all 4 bytes into buffer
+        bdnzt   cr0_eq,copyinstr6           // if r7==0, then all bytes are nonzero
+
+
+// Either 0 found or buffer filled.  The above algorithm has mapped nonzero bytes to 0
+// and 0 bytes to 0x80, with one exception: 0x01 bytes preceding the first 0 are also
+// mapped to 0x80.  We must mask out these false hits before searching for a 0x80 byte.
+
+copyinstr7:
+        crnot   kkZero,cr0_eq               // 0 found iff cr0_eq is off
+        mfctr   r6                          // get #words remaining in buffer
+        rlwinm  r2,r8,7,0,31                // move 0x01 bits to 0x80 position
+        slwi    r6,r6,2                     // convert to #bytes remaining
+        andc    r7,r7,r2                    // turn off false hits from 0x0100 worst case
+        rlwimi  r6,r5,0,30,31               // add in odd bytes leftover in buffer
+        srwi    r7,r7,8                     // we want to count the 0 as a byte xferred
+        addi    r6,r6,4                     // don't count last word xferred (yet)
+        cntlzw  r7,r7                       // now we can find the 0 byte (i.e., the 0x80)
+        srwi    r7,r7,3                     // convert 8,16,24,32 to 1,2,3,4
+        sub.    r6,r6,r7                    // account for nonzero bytes in last word
+        bt++    kkZero,copyinstr10          // 0 found, so done
+
+        beq     copyinstr10                 // r6==0, so buffer truly full
+        mtctr   r6                          // 0 not found, loop over r6 bytes
+        b       copyinstr8                  // enter byte loop for last 1-3 leftover bytes
+
+
+// Byte loop.  This is used for very small buffers and for the odd bytes left over
+// after searching and copying words at a time.
+
+        .align  5                           // align inner loops for speed
+copyinstr8:                                 // loop over bytes of source
+        lbz     r0,0(r3)                    // get next byte of source
+        addi    r3,r3,1
+        addi    r4,r4,1                     // increment dest addr whether we store or not
+        cmpwi   r0,0                        // is it the terminating 0?
+        bt--    kkNull,copyinstr9           // don't store (was copyinstr with NULL ptr)
+        stb     r0,-1(r4)
+copyinstr9:
+        bdnzf   cr0_eq,copyinstr8           // loop if byte not 0 and more room in buffer
+
+        mfctr   r6                          // get #bytes left in buffer
+        crmove  kkZero,cr0_eq               // remember if 0 found or buffer filled
+
+
+// Buffer filled or 0 found.  Unwind and return.
+//      r5 = kkBufSize, i.e., buffer length
+//      r6 = untransferred bytes remaining in buffer
+//      r31 = mapped ptr returned by MapUserAddressSpace
+//      cr3 = kkZero set iff 0 found
+
+copyinstr10:
+        lwz     r9,kkCountPtr(r1)           // get ptr to place to store count of bytes moved
+        sub     r2,r5,r6                    // get #bytes we moved, counting the 0 iff any
+        add     r2,r2,r12                   // add the header bytes count
+        li      r3,0                        // assume 0 return status
+        stw     r2,0(r9)                    // store #bytes moved
+        bt++    kkZero,copyinx              // we did find the 0 so return 0
+        li      r3,ENAMETOOLONG             // buffer filled
+        b       copyinx                     // join main exit routine
+
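For reference, the word-parallel test used in copyinstr5/6 and the false-hit masking done in
copyinstr7 correspond roughly to the following C, written for the big-endian 32-bit words the
PPC code operates on. This is a sketch only; the function names are illustrative and are not
part of the kernel.

    #include <stdint.h>

    /* Nonzero iff the 32-bit word contains at least one zero byte
     * (the y/z test described above copyinstr5). */
    static int word_has_zero_byte(uint32_t w)
    {
        uint32_t y = w + 0xFEFEFEFFu;           /* same as w - 0x01010101 */
        uint32_t z = ~w & 0x80808080u;
        return (y & z) != 0;
    }

    /* Big-endian byte index (0..3) of the first zero byte in w, assuming one
     * exists.  Mirrors copyinstr7: 0x01 bytes preceding the first zero can leave
     * false 0x80 hits, so they are masked off before the leading-zero count. */
    static int first_zero_byte(uint32_t w)
    {
        uint32_t hits = (w + 0xFEFEFEFFu) & (~w & 0x80808080u);
        int lz = 0;

        hits &= ~(w << 7);                      /* clear hits where the byte is 0x01 */
        while (lz < 32 && (hits & (0x80000000u >> lz)) == 0)
            lz++;                               /* portable stand-in for cntlzw      */
        return lz >> 3;                         /* bit 0,8,16,24 -> byte 0,1,2,3     */
    }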
+// Byte loop.  This is used on the header bytes when the source is not word aligned.
+
+        .align  5                           // align inner loops for speed
+copyinstr11:
+        li      r10,4                       // load word size
+        sub     r12,r10,r12                 // set the header bytes count
+        mtctr   r12                         // set up bytes loop count
+copyinstr12:                                // loop over bytes of source
+        lbz     r0,0(r3)                    // get next byte of source
+        addi    r3,r3,1
+        addi    r4,r4,1                     // increment dest addr whether we store or not
+        cmpwi   r0,0                        // is it the terminating 0?
+        bt--    kkNull,copyinstr13          // don't store (was copyinstr with NULL ptr)
+        stb     r0,-1(r4)
+copyinstr13:
+        bdnzf   cr0_eq,copyinstr12          // loop if byte not 0 and more room in buffer
+        sub     r5,r5,r12                   // subtract the bytes copied
+        bne     cr0_eq,copyinstr1           // branch to word loop
+
+        mr      r5,r12                      // Get the header bytes count
+        li      r12,0                       // Clear the header bytes count
+        mfctr   r6                          // get #bytes left in buffer
+        crmove  kkZero,cr0_eq               // remember if 0 found or buffer filled
+        b       copyinstr10
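Taken together, copyinstr11/12 (unaligned header bytes), copyinstr1/5/6 (word loop), and
copyinstr8 (tail bytes) behave like the following C, ignoring the user-address mapping and
fault recovery. This is a behavioral sketch only: the assembly packs whole words into the
destination and fixes the byte count afterwards in copyinstr7, and string lengths are clamped
to 256MB earlier in the common setup code. Function and variable names here are illustrative.

    #include <errno.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    static int copystr_sketch(const char *src, char *dst /* NULL = just count */,
                              size_t maxlen, size_t *lencopied)
    {
        size_t moved = 0;

        /* byte loop until src is word aligned (copyinstr11/12) */
        while (maxlen != 0 && ((uintptr_t)src & 3) != 0) {
            char c = *src++;
            if (dst != NULL) *dst++ = c;
            moved++; maxlen--;
            if (c == 0) { *lencopied = moved; return 0; }   /* count includes the 0 */
        }

        /* word loop: stop at the first word containing a zero byte (copyinstr5/6) */
        while (maxlen >= 4) {
            uint32_t w;
            memcpy(&w, src, 4);
            if ((w + 0xFEFEFEFFu) & (~w & 0x80808080u))
                break;                          /* the terminating 0 is in this word */
            if (dst != NULL) { memcpy(dst, &w, 4); dst += 4; }
            src += 4; moved += 4; maxlen -= 4;
        }

        /* byte loop for the final word and any odd leftover bytes (copyinstr8) */
        while (maxlen != 0) {
            char c = *src++;
            if (dst != NULL) *dst++ = c;
            moved++; maxlen--;
            if (c == 0) { *lencopied = moved; return 0; }
        }

        *lencopied = moved;                     /* buffer filled without finding a 0 */
        return ENAMETOOLONG;
    }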