+ lis r2,hi16(MASK(MSR_VEC)) // get the vector flag
+ mflr r0 // get return address
+ ori r2,r2,lo16(MASK(MSR_FP)) // add the FP flag
+ stw r0,8(r1) // save return address
+ stwu r1,-kSFSize(r1) // set up a stack frame for VRs or FPRs
+ mfmsr r11 // save MSR at entry
+ mfsprg r10,2 // get feature flags
+ andc r11,r11,r2 // Clear out vec and fp
+ ori r2,r2,lo16(MASK(MSR_EE)) // Get EE on also
+ andc r2,r11,r2 // Clear out EE as well
+ mtcrf 0x02,r10 // we need to test pf64Bit
+ ori r2,r2,MASK(MSR_FP) // must enable FP for G3...
+ mtcrf 0x80,r10 // we need to test pfAltivec too
+ oris r2,r2,hi16(MASK(MSR_VEC)) // enable altivec for G4 (ignored if G3)
+ mtmsr r2 // turn EE off, FP and VEC on
+ isync
+ bt++ pf64Bitb,pmap_copy_64 // skip if 64-bit processor (only they take hint)
+ slwi r3,r3,12 // get page address from page num
+ slwi r4,r4,12 // get page address from page num
+ rlwinm r12,r2,0,MSR_DR_BIT+1,MSR_DR_BIT-1 // get ready to turn off DR
+ bt pfAltivecb,pmap_copy_g4 // altivec but not 64-bit means G4
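+ // neither 64-bit nor Altivec: fall through to the G3 (FPR) copy below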
+
+
+ // G3 -- copy using FPRs
+
+ stfd f0,FM_SIZE+0(r1) // save the 4 FPRs we use to copy
+ stfd f1,FM_SIZE+8(r1)
+ li r5,PPC_PGBYTES/32 // count of cache lines in a page
+ stfd f2,FM_SIZE+16(r1)
+ mtctr r5
+ stfd f3,FM_SIZE+24(r1)
+ mtmsr r12 // turn off DR after saving FPRs on stack
+ isync
+
+pmap_g3_copy_loop: // loop over 32-byte cache lines
+ dcbz 0,r4 // avoid read of dest line
+ lfd f0,0(r3)
+ lfd f1,8(r3)
+ lfd f2,16(r3)
+ lfd f3,24(r3)
+ addi r3,r3,32
+ stfd f0,0(r4)
+ stfd f1,8(r4)
+ stfd f2,16(r4)
+ stfd f3,24(r4)
+ dcbst 0,r4 // flush dest line to RAM
+ addi r4,r4,32
+ bdnz pmap_g3_copy_loop
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r6,PPC_PGBYTES-32 // point to last line in page
+pmap_g3_icache_flush:
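+ // invalidate the page from the icache, two 32-byte lines per pass,
+ // walking backward from the last line to offset 0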
+ subic. r5,r6,32 // more to go?
+ icbi r4,r6 // flush another line in icache
+ subi r6,r5,32 // get offset to next line
+ icbi r4,r5
+ bne pmap_g3_icache_flush
+
+ sync
+ mtmsr r2 // turn DR back on
+ isync
+ lfd f0,FM_SIZE+0(r1) // restore the FPRs
+ lfd f1,FM_SIZE+8(r1)
+ lfd f2,FM_SIZE+16(r1)
+ lfd f3,FM_SIZE+24(r1)
+
+ b pmap_g4_restore // restore MSR and done
+
+
+ // G4 -- copy using VRs
+
+pmap_copy_g4: // r2=(MSR-EE), r12=(r2-DR), r10=features, r11=old MSR
+ la r9,FM_SIZE+16(r1) // get base of VR save area
+ li r5,16 // load x-form offsets into r5-r9
+ li r6,32 // another offset
+ stvx v0,0,r9 // save some VRs so we can use to copy
+ li r7,48 // another offset
+ stvx v1,r5,r9
+ li r0,PPC_PGBYTES/64 // we loop over 64-byte chunks
+ stvx v2,r6,r9
+ mtctr r0
+ li r8,96 // get look-ahead for touch
+ stvx v3,r7,r9
+ li r9,128 // another look-ahead offset for touch
+ mtmsr r12 // now we've saved VRs on stack, turn off DR
+ isync // wait for it to happen
+ b pmap_g4_copy_loop
+
+ .align 5 // align inner loops
+pmap_g4_copy_loop: // loop over 64-byte chunks
+ dcbt r3,r8 // touch 3 lines ahead
+ nop // avoid a 17-word loop...
+ dcbt r3,r9 // touch 4 lines ahead
+ nop // more padding
+ dcba 0,r4 // avoid pre-fetch of 1st dest line
+ lvx v0,0,r3 // offset 0
+ lvx v1,r5,r3 // offset 16
+ lvx v2,r6,r3 // offset 32
+ lvx v3,r7,r3 // offset 48
+ addi r3,r3,64
+ dcba r6,r4 // avoid pre-fetch of 2nd line
+ stvx v0,0,r4 // offset 0
+ stvx v1,r5,r4 // offset 16
+ stvx v2,r6,r4 // offset 32
+ stvx v3,r7,r4 // offset 48
+ dcbf 0,r4 // push line 1
+ dcbf r6,r4 // and line 2
+ addi r4,r4,64
+ bdnz pmap_g4_copy_loop
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r8,PPC_PGBYTES-32 // point to last line in page
+pmap_g4_icache_flush:
+ subic. r9,r8,32 // more to go?
+ icbi r4,r8 // flush from icache
+ subi r8,r9,32 // get offset to next line
+ icbi r4,r9
+ bne pmap_g4_icache_flush
+
+ sync
+ mtmsr r2 // turn DR back on
+ isync
+ la r9,FM_SIZE+16(r1) // get base of VR save area
+ lvx v0,0,r9 // restore the VRs
+ lvx v1,r5,r9
+ lvx v2,r6,r9
+ lvx v3,r7,r9
+
+pmap_g4_restore: // r11=MSR
+ mtmsr r11 // turn EE on, VEC and FP off
+ isync // wait for it to happen
+ addi r1,r1,kSFSize // pop off our stack frame
+ lwz r0,8(r1) // restore return address
+ mtlr r0
+ blr
+
+
+ // 64-bit processor (128-byte cache lines): copy using VRs
+
+pmap_copy_64: // r10=features, r11=old MSR
+ sldi r3,r3,12 // get page address from page num
+ sldi r4,r4,12 // get page address from page num
+ la r9,FM_SIZE+16(r1) // get base of VR save area
+ li r5,16 // load x-form offsets into r5-r9
+ li r6,32 // another offset
+ bf pfAltivecb,pmap_novmx_copy // altivec suppressed...
+ stvx v0,0,r9 // save 8 VRs so we can copy without bubbles
+ stvx v1,r5,r9
+ li r7,48 // another offset
+ li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks
+ stvx v2,r6,r9
+ stvx v3,r7,r9
+ addi r9,r9,64 // advance base ptr so we can store another 4
+ mtctr r0
+ li r0,MASK(MSR_DR) // get DR bit
+ stvx v4,0,r9
+ stvx v5,r5,r9
+ andc r12,r2,r0 // turn off DR bit
+ li r0,1 // get a 1 to slam into SF
+ stvx v6,r6,r9
+ stvx v7,r7,r9
+ rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0)
+ li r8,-128 // offset so we can reach back one line
+ mtmsrd r12 // now we've saved VRs, turn DR off and SF on
+ isync // wait for it to happen
+ dcbt128 0,r3,1 // start a forward stream
+ b pmap_64_copy_loop
+
+ .align 5 // align inner loops
+pmap_64_copy_loop: // loop over 128-byte chunks
+ dcbz128 0,r4 // avoid read of destination line
+ lvx v0,0,r3 // offset 0
+ lvx v1,r5,r3 // offset 16
+ lvx v2,r6,r3 // offset 32
+ lvx v3,r7,r3 // offset 48
+ addi r3,r3,64 // don't have enough GPRs so add 64 2x
+ lvx v4,0,r3 // offset 64
+ lvx v5,r5,r3 // offset 80
+ lvx v6,r6,r3 // offset 96
+ lvx v7,r7,r3 // offset 112
+ addi r3,r3,64
+ stvx v0,0,r4 // offset 0
+ stvx v1,r5,r4 // offset 16
+ stvx v2,r6,r4 // offset 32
+ stvx v3,r7,r4 // offset 48
+ addi r4,r4,64
+ stvx v4,0,r4 // offset 64
+ stvx v5,r5,r4 // offset 80
+ stvx v6,r6,r4 // offset 96
+ stvx v7,r7,r4 // offset 112
+ addi r4,r4,64
+ dcbf r8,r4 // flush the line we just wrote
+ bdnz pmap_64_copy_loop
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r8,PPC_PGBYTES-128 // point to last line in page
+pmap_64_icache_flush:
+ subic. r9,r8,128 // more to go?
+ icbi r4,r8 // flush from icache
+ subi r8,r9,128 // get offset to next line
+ icbi r4,r9
+ bne pmap_64_icache_flush
+
+ sync
+ mtmsrd r2 // turn DR back on, SF off
+ isync
+ la r9,FM_SIZE+16(r1) // get base address of VR save area on stack
+ lvx v0,0,r9 // restore the VRs
+ lvx v1,r5,r9
+ lvx v2,r6,r9
+ lvx v3,r7,r9
+ addi r9,r9,64
+ lvx v4,0,r9
+ lvx v5,r5,r9
+ lvx v6,r6,r9
+ lvx v7,r7,r9
+
+ b pmap_g4_restore // restore lower half of MSR and return
+
+ //
+ // Copy on 64-bit without VMX
+ //
+
+pmap_novmx_copy:
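+ // r2=(MSR-EE), r11=old MSR; Altivec unavailable, so copy 128-byte lines with GPRs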
+ li r0,PPC_PGBYTES/128 // we loop over 128-byte chunks
+ mtctr r0
+ li r0,MASK(MSR_DR) // get DR bit
+ andc r12,r2,r0 // turn off DR bit
+ li r0,1 // get a 1 to slam into SF
+ rldimi r12,r0,63,MSR_SF_BIT // set SF bit (bit 0)
+ mtmsrd r12 // turn DR off and SF on
+ isync // wait for it to happen
+ dcbt128 0,r3,1 // start a forward stream
+
+pmap_novmx_copy_loop: // loop over 128-byte cache lines
+ dcbz128 0,r4 // avoid read of dest line
+
+ ld r0,0(r3) // Load half a line
+ ld r12,8(r3)
+ ld r5,16(r3)
+ ld r6,24(r3)
+ ld r7,32(r3)
+ ld r8,40(r3)
+ ld r9,48(r3)
+ ld r10,56(r3)
+
+ std r0,0(r4) // Store half a line
+ std r12,8(r4)
+ std r5,16(r4)
+ std r6,24(r4)
+ std r7,32(r4)
+ std r8,40(r4)
+ std r9,48(r4)
+ std r10,56(r4)
+
+ ld r0,64(r3) // Load half a line
+ ld r12,72(r3)
+ ld r5,80(r3)
+ ld r6,88(r3)
+ ld r7,96(r3)
+ ld r8,104(r3)
+ ld r9,112(r3)
+ ld r10,120(r3)
+
+ addi r3,r3,128
+
+ std r0,64(r4) // Store half a line
+ std r12,72(r4)
+ std r5,80(r4)
+ std r6,88(r4)
+ std r7,96(r4)
+ std r8,104(r4)
+ std r9,112(r4)
+ std r10,120(r4)
+
+ dcbf 0,r4 // flush the line we just wrote
+ addi r4,r4,128
+ bdnz pmap_novmx_copy_loop
+
+ sync // wait for stores to take
+ subi r4,r4,PPC_PGBYTES // restore ptr to destination page
+ li r8,PPC_PGBYTES-128 // point to last line in page
+
+pmap_novmx_icache_flush:
+ subic. r9,r8,128 // more to go?
+ icbi r4,r8 // flush from icache
+ subi r8,r9,128 // get offset to next line
+ icbi r4,r9
+ bne pmap_novmx_icache_flush
+
+ sync
+ mtmsrd r2 // turn DR back on, SF off
+ isync
+
+ b pmap_g4_restore // restore lower half of MSR and return
+
+
+
+//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>