+ stfd f28,savefp28(r3)
+ stfd f29,savefp29(r3)
+ stfd f30,savefp30(r3)
+ stfd f31,savefp31(r3)
+ blr
+
+
+// *******************
+// * v r _ s t o r e *
+// *******************
+//
+// Store VRs into the savearea, according to bits set in the passed vrsave bitfield. This routine
+// is used both by vec_save and vec_switch. To minimize conditional branches and avoid touching
+// unnecessary cache blocks, we either save all or none of the VRs in a block. We have separate
+// paths for each cache block size.
+//
+// When called:
+// interrupts are off, vectors are enabled
+// r3 = ptr to save area
+// r10 = vrsave (not 0)
+//
+// We destroy:
+// r4 - r11, all CRs.
+
+vr_store:
+ mfsprg r9,2 ; get feature flags
+ stw r10,savevrvalid(r3) ; Save the validity information in savearea
+ slwi r8,r10,1 ; Shift over 1
+ mtcrf 0x02,r9 ; put cache line size bits in cr6 where we can test
+ or r8,r10,r8 ; r8 <- even bits show which pairs are in use
+ bt-- pf32Byteb,vr_st32 ; skip if 32-byte cacheline processor
+
+
+; Save vectors on a 128-byte linesize processor. We save all or none of the 8 registers in each of
+; the four cache lines. This minimizes mispredicted branches yet handles cache lines optimally.
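+;
+; As an illustrative sketch (not from the kernel source), the pair fold already done in r8 and
+; the folding below amount to the following, in C terms (variable names are ours):
+;     pairs  = vrsave | (vrsave << 1);   // even bit 2k set if v(2k) or v(2k+1) is live
+;     quads  = pairs  | (pairs  << 2);   // bit 4k set if any of v(4k)..v(4k+3) is live
+;     octets = quads  | (quads  << 4);   // bit 8k set if any of v(8k)..v(8k+7) is live
+; PPC numbers bit 0 at the MSB, so "<<" moves toward lower bit numbers. For example, with
+; vrsave = 0x00400000 (only v9 live), octets comes out as 0x3FC00000; of the bits tested
+; below (0, 8, 16, 24) only bit 8 is set, so just the savevr8 line is zeroed and stored.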
+
+ slwi r7,r8,2 ; shift groups-of-2 over by 2
+ li r4,16 ; load offsets for X-form stores
+ or r8,r7,r8 ; show if any in group of 4 are in use
+ li r5,32
+ slwi r7,r8,4 ; shift groups-of-4 over by 4
+ li r6,48
+ or r11,r7,r8 ; show if any in group of 8 are in use
+ li r7,64
+ mtcrf 0x80,r11 ; set CRs one at a time (faster)
+ li r8,80
+ mtcrf 0x20,r11
+ li r9,96
+ mtcrf 0x08,r11
+ li r10,112
+ mtcrf 0x02,r11
+
+ bf 0,vr_st64b ; skip if none of vr0-vr7 are in use
+ la r11,savevr0(r3) ; get address of this group of registers in save area
+ dcbz128 0,r11 ; zero the line
+ stvxl v0,0,r11 ; save 8 VRs in the line
+ stvxl v1,r4,r11
+ stvxl v2,r5,r11
+ stvxl v3,r6,r11
+ stvxl v4,r7,r11
+ stvxl v5,r8,r11
+ stvxl v6,r9,r11
+ stvxl v7,r10,r11
+
+vr_st64b:
+ bf 8,vr_st64c ; skip if none of vr8-vr15 are in use
+ la r11,savevr8(r3) ; get address of this group of registers in save area
+ dcbz128 0,r11 ; zero the line
+ stvxl v8,0,r11 ; save 8 VRs in the line
+ stvxl v9,r4,r11
+ stvxl v10,r5,r11
+ stvxl v11,r6,r11
+ stvxl v12,r7,r11
+ stvxl v13,r8,r11
+ stvxl v14,r9,r11
+ stvxl v15,r10,r11
+
+vr_st64c:
+ bf 16,vr_st64d ; skip if none of vr16-vr23 are in use
+ la r11,savevr16(r3) ; get address of this group of registers in save area
+ dcbz128 0,r11 ; zero the line
+ stvxl v16,0,r11 ; save 8 VRs in the line
+ stvxl v17,r4,r11
+ stvxl v18,r5,r11
+ stvxl v19,r6,r11
+ stvxl v20,r7,r11
+ stvxl v21,r8,r11
+ stvxl v22,r9,r11
+ stvxl v23,r10,r11
+
+vr_st64d:
+ bflr 24 ; done if none of vr24-vr31 are in use
+ la r11,savevr24(r3) ; get address of this group of registers in save area
+ dcbz128 0,r11 ; zero the line
+ stvxl v24,0,r11 ; save 8 VRs in the line
+ stvxl v25,r4,r11
+ stvxl v26,r5,r11
+ stvxl v27,r6,r11
+ stvxl v28,r7,r11
+ stvxl v29,r8,r11
+ stvxl v30,r9,r11
+ stvxl v31,r10,r11
+ blr
+
+; Save vectors on a 32-byte linesize processor. We save in 16 groups of 2: we either save both
+; or neither in each group. This cuts down on conditional branches.
+; r8 = bitmask with bit n set (for even n) if either of that pair of VRs is in use
+; r3 = savearea
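+;
+; Illustrative example (not from the source): if vrsave = 0x00000003 (only v30 and v31 live),
+; the pair fold leaves r8 = 0x00000007, whose only even bit is bit 30; every bf on bits 0-28
+; branches over its store block, and bflr 30 falls through to store the v30/v31 pair.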
+
+vr_st32:
+ mtcrf 0xFF,r8 ; set CR bits so we can branch on them
+ li r4,16 ; load offset for X-form stores
+
+ bf 0,vr_st32b ; skip if neither VR in this pair is in use
+ la r11,savevr0(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v0,0,r11 ; save the two VRs in the line
+ stvxl v1,r4,r11
+
+vr_st32b:
+ bf 2,vr_st32c ; skip if neither VR in this pair is in use
+ la r11,savevr2(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v2,0,r11 ; save the two VRs in the line
+ stvxl v3,r4,r11
+
+vr_st32c:
+ bf 4,vr_st32d ; skip if neither VR in this pair is in use
+ la r11,savevr4(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v4,0,r11 ; save the two VRs in the line
+ stvxl v5,r4,r11
+
+vr_st32d:
+ bf 6,vr_st32e ; skip if neither VR in this pair is in use
+ la r11,savevr6(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v6,0,r11 ; save the two VRs in the line
+ stvxl v7,r4,r11
+
+vr_st32e:
+ bf 8,vr_st32f ; skip if neither VR in this pair is in use
+ la r11,savevr8(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v8,0,r11 ; save the two VRs in the line
+ stvxl v9,r4,r11
+
+vr_st32f:
+ bf 10,vr_st32g ; skip if neither VR in this pair is in use
+ la r11,savevr10(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v10,0,r11 ; save the two VRs in the line
+ stvxl v11,r4,r11
+
+vr_st32g:
+ bf 12,vr_st32h ; skip if neither VR in this pair is in use
+ la r11,savevr12(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v12,0,r11 ; save the two VRs in the line
+ stvxl v13,r4,r11
+
+vr_st32h:
+ bf 14,vr_st32i ; skip if neither VR in this pair is in use
+ la r11,savevr14(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v14,0,r11 ; save the two VRs in the line
+ stvxl v15,r4,r11
+
+vr_st32i:
+ bf 16,vr_st32j ; skip if neither VR in this pair is in use
+ la r11,savevr16(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v16,0,r11 ; save the two VRs in the line
+ stvxl v17,r4,r11
+
+vr_st32j:
+ bf 18,vr_st32k ; skip if neither VR in this pair is in use
+ la r11,savevr18(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v18,0,r11 ; save the two VRs in the line
+ stvxl v19,r4,r11
+
+vr_st32k:
+ bf 20,vr_st32l ; skip if neither VR in this pair is in use
+ la r11,savevr20(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v20,0,r11 ; save the two VRs in the line
+ stvxl v21,r4,r11
+
+vr_st32l:
+ bf 22,vr_st32m ; skip if neither VR in this pair is in use
+ la r11,savevr22(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v22,0,r11 ; save the two VRs in the line
+ stvxl v23,r4,r11
+
+vr_st32m:
+ bf 24,vr_st32n ; skip if neither VR in this pair is in use
+ la r11,savevr24(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v24,0,r11 ; save the two VRs in the line
+ stvxl v25,r4,r11
+
+vr_st32n:
+ bf 26,vr_st32o ; skip if neither VR in this pair is in use
+ la r11,savevr26(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v26,0,r11 ; save the two VRs in the line
+ stvxl v27,r4,r11
+
+vr_st32o:
+ bf 28,vr_st32p ; skip if neither VR in this pair is in use
+ la r11,savevr28(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v28,0,r11 ; save the two VRs in the line
+ stvxl v29,r4,r11
+
+vr_st32p:
+ bflr 30 ; done if neither VR in this pair is in use
+ la r11,savevr30(r3) ; get address of this group of registers in save area
+ dcba 0,r11 ; establish the line without reading it
+ stvxl v30,0,r11 ; save the two VRs in the line
+ stvxl v31,r4,r11
+ blr
+
+
+// *****************
+// * v r _ l o a d *
+// *****************
+//
+// Load live VRs from a savearea, according to bits set in a passed vector. This is the reverse
+// of "vr_store". Like it, we avoid touching unnecessary cache blocks and minimize conditional
+// branches by loading all VRs in a cache line if we have to load any of them. VRs in cache
+// lines we do not load are "bugged", ie filled with the bugbug constant. Note that this
+// behavior differs slightly from earlier kernels, which bugged every VR that was not live.
+//
+// When called:
+// interrupts are off, vectors are enabled
+// r3 = ptr to save area
+// r10 = vector of live regs to load (ie, savevrsave & savevrvalid, may be 0)
+// v31 = bugbug constant (0x7FFFDEAD7FFFDEAD7FFFDEAD7FFFDEAD)
+//
+// We destroy:
+// r4 - r11, all CRs.
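+//
+// Illustrative note (our assumption, not stated in this file): the bugbug pattern 0x7FFFDEAD
+// exists to make stale vector state easy to recognize; a thread that reads a VR it never
+// wrote will carry 7FFFDEAD words into its results and register dumps.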
+
+vr_load:
+ mfsprg r9,2 ; get feature flags
+ li r6,1 ; assuming 32-byte, get (#VRs)-1 in a cacheline
+ mtcrf 0x02,r9 ; set cache line size bits in cr6
+ lis r7,0xC000 ; assuming 32-byte, set bits 0-1
+ bt-- pf32Byteb,vr_ld0 ; skip if 32-byte cacheline processor
+ li r6,7 ; 128-byte machines have 8 VRs in a cacheline
+ lis r7,0xFF00 ; so set bits 0-7
+
+// Loop touching in cache blocks we will load from.
+// r3 = savearea ptr
+// r5 = accumulates the set of VRs we will be loading
+// r6 = 1 if 32-byte, 7 if 128-byte
+// r7 = 0xC0000000 if 32-byte, 0xFF000000 if 128-byte
+// r10 = live VR bits
+// v31 = bugbug constant
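+//
+// As an illustrative walkthrough (example values, not from the source): on a 128-byte machine
+// with r10 = 0x00410000 (v9 and v15 live), cntlzw returns 9, andc aligns it down to 8, srw.
+// yields r9 = 0x00FF0000 (bits 8-15), and slwi forms the byte offset 8*16 = 128. The loop body
+// prefetches the savevr8 line, clears bits 8-15 from r10 (leaving 0), and ORs them into r5.
+// On the next pass cntlzw of 0 returns 32, srw. produces 0, and we fall out of the loop with
+// r5 = 0x00FF0000: only the savevr8 line is loaded below; every other line is bugged.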
+
+vr_ld0:
+ li r5,0 ; initialize set of VRs to load
+ la r11,savevr0(r3) ; get address of register file
+ b vr_ld2 ; enter loop in middle
+
+ .align 5
+vr_ld1: ; loop over each cache line we will load
+ dcbt r4,r11 ; start prefetch of the line
+ andc r10,r10,r9 ; turn off the bits in this line
+ or r5,r5,r9 ; we will load all these
+vr_ld2: ; initial entry point
+ cntlzw r4,r10 ; get offset to next live VR
+ andc r4,r4,r6 ; cacheline align it
+ srw. r9,r7,r4 ; position bits for VRs in that cache line
+ slwi r4,r4,4 ; get byte offset within register file to that line
+ bne vr_ld1 ; loop if more bits in r10
+
+ bf-- pf128Byteb,vr_ld32 ; skip if not 128-byte lines
+
+// Handle a processor with 128-byte cache lines. Four groups of 8 VRs.
+// r3 = savearea ptr
+// r5 = bit for the 1st VR of each cacheline is 1 iff any VR in that line must be loaded
+// r11 = addr(savevr0)
+// v31 = bugbug constant
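+//
+// Continuing the illustrative example above: with r5 = 0x00FF0000 only cr bit 8 is set, so
+// v0-v7, v16-v23, and v24-v30 take the vor bug-fill paths while v8-v15 are loaded from the
+// savevr8 line (v31 is never bugged, since it holds the bugbug constant itself).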
+
+ mtcrf 0x80,r5 ; set up bits for conditional branches
+ li r4,16 ; load offsets for X-form loads
+ li r6,48
+ mtcrf 0x20,r5 ; load CRs one at a time, which is faster
+ li r7,64
+ li r8,80
+ mtcrf 0x08,r5
+ li r9,96
+ li r10,112
+ mtcrf 0x02,r5
+ li r5,32
+
+ bt 0,vr_ld128a ; skip if this line must be loaded
+ vor v0,v31,v31 ; no VR must be loaded, so bug them all
+ vor v1,v31,v31
+ vor v2,v31,v31
+ vor v3,v31,v31
+ vor v4,v31,v31
+ vor v5,v31,v31
+ vor v6,v31,v31
+ vor v7,v31,v31
+ b vr_ld128b
+vr_ld128a: ; must load from this line
+ lvxl v0,0,r11
+ lvxl v1,r4,r11
+ lvxl v2,r5,r11
+ lvxl v3,r6,r11
+ lvxl v4,r7,r11
+ lvxl v5,r8,r11
+ lvxl v6,r9,r11
+ lvxl v7,r10,r11
+
+vr_ld128b: ; here to handle next cache line
+ la r11,savevr8(r3) ; get address of next line in save area
+ bt 8,vr_ld128c ; skip if this line must be loaded
+ vor v8,v31,v31 ; no VR must be loaded, so bug them all
+ vor v9,v31,v31
+ vor v10,v31,v31
+ vor v11,v31,v31
+ vor v12,v31,v31
+ vor v13,v31,v31
+ vor v14,v31,v31
+ vor v15,v31,v31
+ b vr_ld128d
+vr_ld128c: ; must load from this line
+ lvxl v8,0,r11
+ lvxl v9,r4,r11
+ lvxl v10,r5,r11
+ lvxl v11,r6,r11
+ lvxl v12,r7,r11
+ lvxl v13,r8,r11
+ lvxl v14,r9,r11
+ lvxl v15,r10,r11
+
+vr_ld128d: ; here to handle next cache line
+ la r11,savevr16(r3) ; get address of next line in save area
+ bt 16,vr_ld128e ; skip if this line must be loaded
+ vor v16,v31,v31 ; no VR must be loaded, so bug them all
+ vor v17,v31,v31
+ vor v18,v31,v31
+ vor v19,v31,v31
+ vor v20,v31,v31
+ vor v21,v31,v31
+ vor v22,v31,v31
+ vor v23,v31,v31
+ b vr_ld128f
+vr_ld128e: ; must load from this line
+ lvxl v16,0,r11
+ lvxl v17,r4,r11
+ lvxl v18,r5,r11
+ lvxl v19,r6,r11
+ lvxl v20,r7,r11
+ lvxl v21,r8,r11
+ lvxl v22,r9,r11
+ lvxl v23,r10,r11
+
+vr_ld128f: ; here to handle next cache line
+ la r11,savevr24(r3) ; get address of next line in save area
+ bt 24,vr_ld128g ; skip if this line must be loaded
+ vor v24,v31,v31 ; no VR must be loaded, so bug them all
+ vor v25,v31,v31
+ vor v26,v31,v31
+ vor v27,v31,v31
+ vor v28,v31,v31
+ vor v29,v31,v31
+ vor v30,v31,v31 ; (v31 is not bugged; it already holds the bugbug constant)
+ blr
+vr_ld128g: ; must load from this line
+ lvxl v24,0,r11
+ lvxl v25,r4,r11
+ lvxl v26,r5,r11
+ lvxl v27,r6,r11
+ lvxl v28,r7,r11
+ lvxl v29,r8,r11
+ lvxl v30,r9,r11
+ lvxl v31,r10,r11
+ blr
+
+// Handle a processor with 32-byte cache lines. Sixteen groups of two VRs.
+// r5 = bit for the 1st VR of each cacheline is 1 iff any VR in that line must be loaded
+// r11 = addr(savevr0)
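+//
+// Illustrative example (not from the source): if the touch loop left r5 = 0xC0000000 (only
+// the v0/v1 line live), the bt on cr bit 0 takes the load path for v0 and v1, while every
+// other pair is bugged (v31 keeps the bugbug constant).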
+
+vr_ld32:
+ mtcrf 0xFF,r5 ; set up bits for conditional branches
+ li r4,16 ; load offset for X-form loads
+
+ bt 0,vr_ld32load0 ; skip if we must load this line
+ vor v0,v31,v31 ; neither VR is live, so bug them both
+ vor v1,v31,v31
+ b vr_ld32test2
+vr_ld32load0: ; must load VRs in this line
+ lvxl v0,0,r11
+ lvxl v1,r4,r11
+
+vr_ld32test2: ; here to handle next cache line
+ la r11,savevr2(r3) ; get offset to next cache line
+ bt 2,vr_ld32load2 ; skip if we must load this line
+ vor v2,v31,v31 ; neither VR is live, so bug them both
+ vor v3,v31,v31
+ b vr_ld32test4
+vr_ld32load2: ; must load VRs in this line
+ lvxl v2,0,r11
+ lvxl v3,r4,r11
+
+vr_ld32test4: ; here to handle next cache line
+ la r11,savevr4(r3) ; get offset to next cache line
+ bt 4,vr_ld32load4 ; skip if we must load this line
+ vor v4,v31,v31 ; neither VR is live, so bug them both
+ vor v5,v31,v31
+ b vr_ld32test6
+vr_ld32load4: ; must load VRs in this line
+ lvxl v4,0,r11
+ lvxl v5,r4,r11
+
+vr_ld32test6: ; here to handle next cache line
+ la r11,savevr6(r3) ; get offset to next cache line
+ bt 6,vr_ld32load6 ; skip if we must load this line
+ vor v6,v31,v31 ; neither VR is live, so bug them both
+ vor v7,v31,v31
+ b vr_ld32test8
+vr_ld32load6: ; must load VRs in this line
+ lvxl v6,0,r11
+ lvxl v7,r4,r11
+
+vr_ld32test8: ; here to handle next cache line
+ la r11,savevr8(r3) ; get offset to next cache line
+ bt 8,vr_ld32load8 ; skip if we must load this line
+ vor v8,v31,v31 ; neither VR is live, so bug them both
+ vor v9,v31,v31
+ b vr_ld32test10
+vr_ld32load8: ; must load VRs in this line
+ lvxl v8,0,r11
+ lvxl v9,r4,r11
+
+vr_ld32test10: ; here to handle next cache line
+ la r11,savevr10(r3) ; get offset to next cache line
+ bt 10,vr_ld32load10 ; skip if we must load this line
+ vor v10,v31,v31 ; neither VR is live, so bug them both
+ vor v11,v31,v31
+ b vr_ld32test12
+vr_ld32load10: ; must load VRs in this line
+ lvxl v10,0,r11
+ lvxl v11,r4,r11
+
+vr_ld32test12: ; here to handle next cache line
+ la r11,savevr12(r3) ; get offset to next cache line
+ bt 12,vr_ld32load12 ; skip if we must load this line
+ vor v12,v31,v31 ; neither VR is live, so bug them both
+ vor v13,v31,v31
+ b vr_ld32test14
+vr_ld32load12: ; must load VRs in this line
+ lvxl v12,0,r11
+ lvxl v13,r4,r11
+
+vr_ld32test14: ; here to handle next cache line
+ la r11,savevr14(r3) ; get offset to next cache line
+ bt 14,vr_ld32load14 ; skip if we must load this line
+ vor v14,v31,v31 ; neither VR is live, so bug them both
+ vor v15,v31,v31
+ b vr_ld32test16
+vr_ld32load14: ; must load VRs in this line
+ lvxl v14,0,r11
+ lvxl v15,r4,r11
+
+vr_ld32test16: ; here to handle next cache line
+ la r11,savevr16(r3) ; get offset to next cache line
+ bt 16,vr_ld32load16 ; skip if we must load this line
+ vor v16,v31,v31 ; neither VR is live, so bug them both
+ vor v17,v31,v31
+ b vr_ld32test18
+vr_ld32load16: ; must load VRs in this line
+ lvxl v16,0,r11
+ lvxl v17,r4,r11
+
+vr_ld32test18: ; here to handle next cache line
+ la r11,savevr18(r3) ; get offset to next cache line
+ bt 18,vr_ld32load18 ; skip if we must load this line
+ vor v18,v31,v31 ; neither VR is live, so bug them both
+ vor v19,v31,v31
+ b vr_ld32test20
+vr_ld32load18: ; must load VRs in this line
+ lvxl v18,0,r11
+ lvxl v19,r4,r11
+
+vr_ld32test20: ; here to handle next cache line
+ la r11,savevr20(r3) ; get offset to next cache line
+ bt 20,vr_ld32load20 ; skip if we must load this line
+ vor v20,v31,v31 ; neither VR is live, so bug them both
+ vor v21,v31,v31
+ b vr_ld32test22
+vr_ld32load20: ; must load VRs in this line
+ lvxl v20,0,r11
+ lvxl v21,r4,r11
+
+vr_ld32test22: ; here to handle next cache line
+ la r11,savevr22(r3) ; get offset to next cache line
+ bt 22,vr_ld32load22 ; skip if we must load this line
+ vor v22,v31,v31 ; neither VR is live, so bug them both
+ vor v23,v31,v31
+ b vr_ld32test24
+vr_ld32load22: ; must load VRs in this line
+ lvxl v22,0,r11
+ lvxl v23,r4,r11
+
+vr_ld32test24: ; here to handle next cache line
+ la r11,savevr24(r3) ; get offset to next cache line
+ bt 24,vr_ld32load24 ; skip if we must load this line
+ vor v24,v31,v31 ; neither VR is live, so bug them both
+ vor v25,v31,v31
+ b vr_ld32test26
+vr_ld32load24: ; must load VRs in this line
+ lvxl v24,0,r11
+ lvxl v25,r4,r11
+
+vr_ld32test26: ; here to handle next cache line
+ la r11,savevr26(r3) ; get offset to next cache line
+ bt 26,vr_ld32load26 ; skip if we must load this line
+ vor v26,v31,v31 ; neither VR is live, so bug them both
+ vor v27,v31,v31
+ b vr_ld32test28
+vr_ld32load26: ; must load VRs in this line
+ lvxl v26,0,r11
+ lvxl v27,r4,r11
+
+vr_ld32test28: ; here to handle next cache line
+ la r11,savevr28(r3) ; get offset to next cache line
+ bt 28,vr_ld32load28 ; skip if we must load this line
+ vor v28,v31,v31 ; neither VR is live, so bug them both
+ vor v29,v31,v31
+ b vr_ld32test30
+vr_ld32load28: ; must load VRs in this line
+ lvxl v28,0,r11
+ lvxl v29,r4,r11
+
+vr_ld32test30: ; here to handle next cache line
+ la r11,savevr30(r3) ; get offset to next cache line
+ bt 30,vr_ld32load30 ; skip if we must load this line
+ vor v30,v31,v31 ; bug v30 (v31 already holds the bugbug constant)
+ blr
+vr_ld32load30: ; must load VRs in this line
+ lvxl v30,0,r11
+ lvxl v31,r4,r11
+ blr