-#if 0
- mfpvr r9 ; Read the processor version register
- rlwinm r9,r9,16,16,31 ; Rotate the upper halfword down and isolate it
- cmplwi r9,PROCESSOR_VERSION_Max ; Do we have Altivec?
- beq+ wegotaltivec ; Yes, use the vector copy instead
-#endif
-
- mfmsr r9 ; Get the entry MSR; r9 keeps it until the final mtmsr
- stwu r1,-(FM_SIZE+32)(r1) ; Make a frame with 32 extra bytes for f0-f3
- rlwinm r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; r7 = MSR with EE clear (interruptions disabled)
- ori r7,r7,lo16(MASK(MSR_FP)) ; Turn on the FPU
- mtmsr r7 ; Disable rupts and enable FPU
- isync
-
- stfd f0,FM_SIZE+0(r1) ; Save f0, which the copy loop overwrites
- rlwinm r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1 ; Clear the DR bit in the working MSR
- stfd f1,FM_SIZE+8(r1) ; Save f1
- addi r6,r3,PPC_PGBYTES ; r6 = one page past the source; the loop ends there
- stfd f2,FM_SIZE+16(r1) ; Save f2
- mr r8,r4 ; Keep the destination base for the icbi loop
- stfd f3,FM_SIZE+24(r1) ; Save f3
-
- mtmsr r7 ; Set the new MSR (DR clear), after the FPR saves above
- isync ; Ensure data translations are off
-
- dcbt br0, r3 /* Start in first input line */
- li r5, CACHE_LINE_SIZE /* Line size, used as the dcbt look-ahead offset */
-
-.L_pmap_copy_page_loop:
- dcbz 0, r4 /* Allocate a line for the output */
- lfd f0, 0(r3) /* Get first 8 */
- lfd f1, 8(r3) /* Get second 8 */
- lfd f2, 16(r3) /* Get third 8 */
- stfd f0, 0(r4) /* Put first 8 */
- dcbt r5, r3 /* Start next line coming in */
- lfd f3, 24(r3) /* Get fourth 8 */
- stfd f1, 8(r4) /* Put second 8 */
- addi r3,r3,CACHE_LINE_SIZE /* Point to the next line in (32 bytes are copied per pass; assumes CACHE_LINE_SIZE is 32 - confirm) */
- stfd f2, 16(r4) /* Put third 8 */
- cmplw cr0,r3,r6 /* Compare the source pointer with the end address in r6 */
- stfd f3, 24(r4) /* Put fourth 8 */
- dcbst br0,r4 /* Force it out */
- addi r4,r4,CACHE_LINE_SIZE /* Point to the next line out */
- blt+ .L_pmap_copy_page_loop /* Copy the whole page */
-
- sync /* Make sure they're all done */
- li r4,PPC_PGBYTES-CACHE_LINE_SIZE /* Offset of the last line in the page */
-
-invalinst:
- subic. r5,r4,CACHE_LINE_SIZE /* r5 = offset of the line below r4; sets CR0 for the bgt */
- icbi r4, r8 /* Invalidate the i-cache line at dest + r4 */
- subi r4,r5,CACHE_LINE_SIZE /* r4 = offset for the next pass */
- icbi r5, r8 /* Invalidate the i-cache line at dest + r5 */
- bgt+ invalinst /* Loop while r5 was > 0 (two lines per pass) */
-
- rlwimi r7,r9,0,MSR_DR_BIT,MSR_DR_BIT ; Copy the entry DR bit back into the working MSR
- sync ; Make sure all invalidates done
-
- mtmsr r7 ; Set DR as it was at entry (FPU still on, EE still off)
- isync
-
- lfd f0,FM_SIZE+0(r1) ; Restore f0
- lfd f1,FM_SIZE+8(r1) ; Restore f1
- lfd f2,FM_SIZE+16(r1) ; Restore f2
- lfd f3,FM_SIZE+24(r1) ; Restore f3
-
- lwz r1,0(r1) ; Pop the frame via the back chain stored by stwu
-
- mtmsr r9 ; Restore the entry MSR: FPU and rupts go back to their entry state
- isync
- blr
-
-#if 0
-;
-; This is not optimal. It is here only as a test of
-; Altivec in the kernel.
-;
-wegotaltivec:
- mfmsr r9 ; Get the MSR; r9 keeps it for the restore at the end
- lis r8,hi16(0xC0000000) ; Make sure we keep the first 2 vector registers
- rlwinm r7,r9,0,MSR_EE_BIT+1,MSR_EE_BIT-1 ; Disable interruptions
- lis r6,lo16(2*256+128) ; Specify 128 blocks of 2 vectors each
- rlwinm r7,r7,0,MSR_DR_BIT+1,MSR_DR_BIT-1 ; Clear the DDAT bit
- ori r6,r6,32 ; Set a 32-byte stride (r6 is not referenced again in this block)
- mtsprg 256,r8 ; Set VRSave -- NOTE(review): written with mtsprg rather than mtspr; confirm intent
- mtmsr r7 ; Disable rupts and turn xlate off
- isync
-
- addi r11,r3,4096 ; Point to the next page; the loop ends when the source reaches it
- li r10,16 ; Get vector size, used as the offset of the second half-line
-
-avmovepg: lvxl v0,br0,r3 ; Get first half of line
- dcba br0,r4 ; Allocate output
- lvxl v1,r10,r3 ; Get second half of line
- stvxl v0,br0,r4 ; Save first half of line
- addi r3,r3,32 ; Point to the next line
- icbi br0,r4 ; Make the icache go away also
- stvxl v1,r10,r4 ; Save second half of line
- cmplw r3,r11 ; Have we reached the next page?
- dcbst br0,r4 ; Make sure the line is on its way out
- addi r4,r4,32 ; Point to the next line
- blt+ avmovepg ; Move the next line...
-
- li r8,0 ; Clear this
- sync ; Make sure all the memory stuff is done
- mtsprg 256,r8 ; Show we are not using VRs any more
- mtmsr r9 ; Restore the entry MSR: translation and interruptions as they were
- isync
- blr
-#endif
+ lis r2,hi16(MASK(MSR_VEC)) ; r2 = vector-enable flag (upper half)
+ mflr r0 // get return address
+ ori r2,r2,lo16(MASK(MSR_FP)) ; r2 = VEC | FP
+ stw r0,8(r1) // save return address at 8(r1), before the frame is pushed
+ stwu r1,-kSFSize(r1) // set up a stack frame for VRs or FPRs
+ mfmsr r11 // save MSR at entry
+ mfsprg r10,2 // get feature flags
+ andc r11,r11,r2 // r11 = entry MSR with VEC and FP cleared (restored on exit)
+ ori r2,r2,lo16(MASK(MSR_EE)) // r2 = VEC | FP | EE
+ andc r2,r11,r2 // r2 = r11 with EE cleared as well
+ mtcrf 0x02,r10 // load CR field 6 from the flags; we need to test pf64Bit
+ ori r2,r2,MASK(MSR_FP) // must enable FP for G3...
+ mtcrf 0x80,r10 // load CR field 0 from the flags; we need to test pfAltivec too
+ oris r2,r2,hi16(MASK(MSR_VEC)) // enable altivec for G4 (ignored if G3)
+ mtmsr r2 // turn EE off, FP and VEC on
+ isync
+ bt++ pf64Bitb,pmap_copy_64 // skip if 64-bit processor (only they take hint)
+ slwi r3,r3,12 // source: get page address from page num (num << 12)
+ slwi r4,r4,12 // destination: get page address from page num
+ rlwinm r12,r2,0,MSR_DR_BIT+1,MSR_DR_BIT-1 // r12 = r2 with DR cleared; get ready to turn off DR
+ bt pfAltivecb,pmap_copy_g4 // altivec but not 64-bit means G4
+
+
+ // G3 -- copy using FPRs (32 bytes per loop pass, through f0-f3)
+
+ stfd f0,FM_SIZE+0(r1) // save the 4 FPRs we use to copy
+ stfd f1,FM_SIZE+8(r1) // save f1
+ li r5,PPC_PGBYTES/32 // count of cache lines in a page
+ stfd f2,FM_SIZE+16(r1) // save f2
+ mtctr r5 // CTR = number of loop passes
+ stfd f3,FM_SIZE+24(r1) // save f3
+ mtmsr r12 // turn off DR after saving FPRs on stack
+ isync
+
+pmap_g3_copy_loop: // loop over 32-byte cache lines
+ dcbz 0,r4 // avoid read of dest line
+ lfd f0,0(r3) // load all 32 source bytes first...
+ lfd f1,8(r3)
+ lfd f2,16(r3)
+ lfd f3,24(r3)
+ addi r3,r3,32 // advance source to the next line
+ stfd f0,0(r4) // ...then store them to the dest line
+ stfd f1,8(r4)
+ stfd f2,16(r4)
+ stfd f3,24(r4)
+ dcbst 0,r4 // flush dest line to RAM
+ addi r4,r4,32 // advance dest to the next line
+ bdnz pmap_g3_copy_loop // decrement CTR, loop until the page is done
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r6,PPC_PGBYTES-32 // point to last line in page
+pmap_g3_icache_flush: // two icbi per pass, walking offsets downward
+ subic. r5,r6,32 // more to go? (sets CR0 from r5)
+ icbi r4,r6 // flush another line in icache
+ subi r6,r5,32 // get offset to next line
+ icbi r4,r5 // flush the line at offset r5
+ bne pmap_g3_icache_flush // done once r5 has reached 0
+
+ sync // wait for the icbi's to complete
+ mtmsr r2 // turn DR back on
+ isync
+ lfd f0,FM_SIZE+0(r1) // restore the FPRs
+ lfd f1,FM_SIZE+8(r1)
+ lfd f2,FM_SIZE+16(r1)
+ lfd f3,FM_SIZE+24(r1)
+
+ b pmap_g4_restore // restore MSR and done
+
+
+ // G4 -- copy using VRs
+
+pmap_copy_g4: // r2=(MSR-EE), r12=(r2-DR), r10=features, r11=old MSR
+ la r9,FM_SIZE+16(r1) // r9 = place in our frame where we save VRs
+ li r5,16 // r5-r7 = x-form offsets 16/32/48 for lvx/stvx
+ li r6,32 // another offset
+ stvx v0,0,r9 // save some VRs so we can use to copy
+ li r7,48 // another offset
+ stvx v1,r5,r9 // save v1
+ li r0,PPC_PGBYTES/64 // we loop over 64-byte chunks
+ stvx v2,r6,r9 // save v2
+ mtctr r0 // CTR = number of loop passes
+ li r8,96 // get look-ahead for touch
+ stvx v3,r7,r9 // save v3 (last use of r9 as the save base)
+ li r9,128 // second look-ahead distance; r9 is reused now the saves are issued
+ mtmsr r12 // now we've saved VRs on stack, turn off DR
+ isync // wait for it to happen
+ b pmap_g4_copy_loop
+
+ .align 5 // align inner loops
+pmap_g4_copy_loop: // loop over 64-byte chunks
+ dcbt r3,r8 // touch 3 lines ahead
+ nop // avoid a 17-word loop...
+ dcbt r3,r9 // touch 4 lines ahead
+ nop // more padding
+ dcba 0,r4 // avoid pre-fetch of 1st dest line
+ lvx v0,0,r3 // offset 0
+ lvx v1,r5,r3 // offset 16
+ lvx v2,r6,r3 // offset 32
+ lvx v3,r7,r3 // offset 48
+ addi r3,r3,64 // advance source by one chunk
+ dcba r6,r4 // avoid pre-fetch of 2nd line
+ stvx v0,0,r4 // offset 0
+ stvx v1,r5,r4 // offset 16
+ stvx v2,r6,r4 // offset 32
+ stvx v3,r7,r4 // offset 48
+ dcbf 0,r4 // push line 1
+ dcbf r6,r4 // and line 2
+ addi r4,r4,64 // advance dest by one chunk
+ bdnz pmap_g4_copy_loop // decrement CTR, loop until the page is done
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r8,PPC_PGBYTES-32 // point to last line in page
+pmap_g4_icache_flush: // two icbi per pass, walking offsets downward
+ subic. r9,r8,32 // more to go? (sets CR0 from r9)
+ icbi r4,r8 // flush from icache
+ subi r8,r9,32 // get offset to next line
+ icbi r4,r9 // flush the line at offset r9
+ bne pmap_g4_icache_flush // done once r9 has reached 0
+
+ sync // wait for the icbi's to complete
+ mtmsr r2 // turn DR back on
+ isync
+ la r9,FM_SIZE+16(r1) // get base of VR save area
+ lvx v0,0,r9 // restore the VRs
+ lvx v1,r5,r9 // r5-r7 still hold 16/32/48 from the setup above
+ lvx v2,r6,r9
+ lvx v3,r7,r9
+
+pmap_g4_restore: // r11=MSR; shared exit for every copy path
+ mtmsr r11 // restore entry MSR: EE as at entry, VEC and FP off
+ isync // wait for it to happen
+ addi r1,r1,kSFSize // pop off our stack frame
+ lwz r0,8(r1) // restore return address
+ mtlr r0 // put it back in LR
+ blr
+
+
+ // 64-bit/128-byte processor: copy using VRs
+
+pmap_copy_64: // r10=features, r11=old MSR
+ sldi r3,r3,12 // source: get page address from page num (64-bit shift)
+ sldi r4,r4,12 // destination: get page address from page num
+ la r9,FM_SIZE+16(r1) // get base of VR save area
+ li r5,16 // r5-r7 = x-form offsets 16/32/48 for lvx/stvx
+ li r6,32 // another offset
+ bf pfAltivecb,pmap_novmx_copy // altivec suppressed... (taken before any VR is saved)
+ stvx v0,0,r9 // save 8 VRs so we can copy wo bubbles
+ stvx v1,r5,r9 // save v1
+ li r7,48 // another offset
+ li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks
+ stvx v2,r6,r9 // save v2
+ stvx v3,r7,r9 // save v3
+ addi r9,r9,64 // advance base ptr so we can store another 4
+ mtctr r0 // CTR = number of loop passes
+ li r0,MASK(MSR_DR) // get DR bit
+ stvx v4,0,r9 // save v4
+ stvx v5,r5,r9 // save v5
+ andc r12,r2,r0 // turn off DR bit
+ li r0,1 // get a 1 to slam into SF
+ stvx v6,r6,r9 // save v6
+ stvx v7,r7,r9 // save v7
+ rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0)
+ li r8,-128 // offset so we can reach back one line
+ mtmsrd r12 // now we've saved VRs, turn DR off and SF on
+ isync // wait for it to happen
+ dcbt128 0,r3,1 // start a forward stream
+ b pmap_64_copy_loop
+
+ .align 5 // align inner loops
+pmap_64_copy_loop: // loop over 128-byte chunks
+ dcbz128 0,r4 // avoid read of destination line
+ lvx v0,0,r3 // offset 0
+ lvx v1,r5,r3 // offset 16
+ lvx v2,r6,r3 // offset 32
+ lvx v3,r7,r3 // offset 48
+ addi r3,r3,64 // don't have enough GPRs so add 64 2x
+ lvx v4,0,r3 // offset 64
+ lvx v5,r5,r3 // offset 80
+ lvx v6,r6,r3 // offset 96
+ lvx v7,r7,r3 // offset 112
+ addi r3,r3,64 // source now points at the next 128-byte chunk
+ stvx v0,0,r4 // offset 0
+ stvx v1,r5,r4 // offset 16
+ stvx v2,r6,r4 // offset 32
+ stvx v3,r7,r4 // offset 48
+ addi r4,r4,64 // step dest to the second half of the chunk
+ stvx v4,0,r4 // offset 64
+ stvx v5,r5,r4 // offset 80
+ stvx v6,r6,r4 // offset 96
+ stvx v7,r7,r4 // offset 112
+ addi r4,r4,64 // dest now points at the next 128-byte chunk
+ dcbf r8,r4 // flush the line we just wrote (r8 = -128 reaches back to it)
+ bdnz pmap_64_copy_loop // decrement CTR, loop until the page is done
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r8,PPC_PGBYTES-128 // point to last line in page
+pmap_64_icache_flush: // two icbi per pass, walking offsets downward
+ subic. r9,r8,128 // more to go? (sets CR0 from r9)
+ icbi r4,r8 // flush from icache
+ subi r8,r9,128 // get offset to next line
+ icbi r4,r9 // flush the line at offset r9
+ bne pmap_64_icache_flush // done once r9 has reached 0
+
+ sync // wait for the icbi's to complete
+ mtmsrd r2 // turn DR back on, SF off
+ isync
+ la r9,FM_SIZE+16(r1) // get base address of VR save area on stack
+ lvx v0,0,r9 // restore the VRs
+ lvx v1,r5,r9 // r5-r7 still hold 16/32/48 from the setup above
+ lvx v2,r6,r9
+ lvx v3,r7,r9
+ addi r9,r9,64 // step to the second group of four saved VRs
+ lvx v4,0,r9
+ lvx v5,r5,r9
+ lvx v6,r6,r9
+ lvx v7,r7,r9
+
+ b pmap_g4_restore // restore lower half of MSR and return
+
+ //
+ // Copy on 64-bit without VMX
+ //
+
+pmap_novmx_copy: // reached from pmap_copy_64 when pfAltivecb is clear
+ li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks
+ mtctr r0 // CTR = number of loop passes
+ li r0,MASK(MSR_DR) // get DR bit
+ andc r12,r2,r0 // turn off DR bit
+ li r0,1 // get a 1 to slam into SF
+ rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0)
+ mtmsrd r12 // turn DR off and SF on (no VRs are saved on this path)
+ isync // wait for it to happen
+ dcbt128 0,r3,1 // start a forward stream
+
+pmap_novmx_copy_loop: // loop over 128-byte cache lines
+ dcbz128 0,r4 // avoid read of dest line
+
+ ld r0,0(r3) // Load half a line
+ ld r12,8(r3) // r12 is free as a data temporary now that the MSR is set
+ ld r5,16(r3)
+ ld r6,24(r3)
+ ld r7,32(r3)
+ ld r8,40(r3)
+ ld r9,48(r3)
+ ld r10,56(r3) // overwrites the feature flags in r10; they are not read again on this path
+
+ std r0,0(r4) // Store half a line
+ std r12,8(r4)
+ std r5,16(r4)
+ std r6,24(r4)
+ std r7,32(r4)
+ std r8,40(r4)
+ std r9,48(r4)
+ std r10,56(r4)
+
+ ld r0,64(r3) // Load half a line
+ ld r12,72(r3)
+ ld r5,80(r3)
+ ld r6,88(r3)
+ ld r7,96(r3)
+ ld r8,104(r3)
+ ld r9,112(r3)
+ ld r10,120(r3)
+
+ addi r3,r3,128 // advance source to the next line
+
+ std r0,64(r4) // Store half a line
+ std r12,72(r4)
+ std r5,80(r4)
+ std r6,88(r4)
+ std r7,96(r4)
+ std r8,104(r4)
+ std r9,112(r4)
+ std r10,120(r4)
+
+ dcbf 0,r4 // flush the line we just wrote
+ addi r4,r4,128 // advance dest to the next line
+ bdnz pmap_novmx_copy_loop // decrement CTR, loop until the page is done
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r8,PPC_PGBYTES-128 // point to last line in page
+
+pmap_novmx_icache_flush: // two icbi per pass, walking offsets downward
+ subic. r9,r8,128 // more to go? (sets CR0 from r9)
+ icbi r4,r8 // flush from icache
+ subi r8,r9,128 // get offset to next line
+ icbi r4,r9 // flush the line at offset r9
+ bne pmap_novmx_icache_flush // done once r9 has reached 0
+
+ sync // wait for the icbi's to complete
+ mtmsrd r2 // turn DR back on, SF off
+ isync
+
+ b pmap_g4_restore // restore lower half of MSR and return
+
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>