/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * The contents of this file constitute Original Code as defined in and
 * are subject to the Apple Public Source License Version 1.1 (the
 * "License").  You may not use this file except in compliance with the
 * License.  Please obtain a copy of the License at
 * http://www.apple.com/publicsource and read it before using this file.
 *
 * This Original Code and all software distributed under the License are
 * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
 * License for the specific language governing rights and limitations
 * under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */
91447636 A |
22 | |
23 | #define kShort 11 | |
24 | #define cr1_gt 5 // bit 1 of cr1 | |
1c79356b A |
25 | |
/*
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 *	r3 - Pointer to data
 *	r4 - Length of data
 *	r5 - Accumulated sum value
 *	r6 - "Starting on odd address" flag (relative to byte 0 of the checksumed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even.  Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers.  1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC.  Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits.  On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit.  On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64-bits into a 16-bit sum at the end.  We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set on the low 32-bits of the sum.)
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably make it impractical
 * for now.
 */
52 | .globl _xsum_assym | |
53 | .globl _xsum_nop_if_32bit | |
54 | .text | |
55 | .align 5 | |
56 | _xsum_assym: | |
57 | cmplwi cr0,r4,kShort ; too short to word align? | |
58 | rlwinm r2,r3,0,0x3 ; get byte offset in word | |
59 | dcbt 0,r3 ; touch in 1st cache line | |
60 | cmpwi cr6,r2,0 ; is address word aligned? | |
61 | ble cr0,Lshort ; skip if too short to bother aligning | |
62 | ||
63 | subfic r0,r2,4 ; get #bytes in partial word | |
64 | cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set | |
65 | addic r0,r0,0 ; turn off carry | |
66 | beq cr6,Laligned ; skip if already word aligned (r2==0 if aligned) | |
67 | ||
68 | ; Partial word at start: zero filled on left, it becomes initial checksum. | |
69 | ||
70 | rlwinm r3,r3,0,0,29 ; word align address | |
71 | mtcrf 0x01,r2 ; move byte offset to cr7 | |
72 | lwz r6,0(r3) ; get partial word | |
73 | li r7,-1 ; start of mask for partial fill | |
74 | slwi r8,r2,3 ; multiply byte offset by 8 | |
75 | sub r4,r4,r0 ; adjust length for bytes in partial word | |
76 | crxor cr1_gt,31,cr1_gt; set flag if byte-lane swap will be necessary | |
77 | srw r7,r7,r8 ; get mask for bytes to keep in partial word | |
78 | addi r3,r3,4 ; point to next word of input | |
79 | and r2,r6,r7 ; zero fill on left | |
80 | ||
81 | ; Address is now word aligned. Prepare for inner loop over 32-byte chunks. | |
82 | ; r2 = initial checksum | |
83 | ; r3 = word aligned address | |
84 | ; r4 = length remaining | |
85 | ; r5 = accumulated sum parameter | |
86 | ; carry = off | |
87 | ; cr1_gt = "starting on odd address" flag | |
88 | ||
89 | Laligned: | |
90 | srwi. r0,r4,5 ; get count of 32-byte chunks | |
91 | mtcrf 0x02,r4 ; move residual length to cr6 and cr7 | |
92 | mtcrf 0x01,r4 | |
93 | beq cr0,Lleftovers ; no chunks | |
94 | ||
95 | mtctr r0 ; set up loop count | |
96 | li r4,32 ; offset to next chunk | |
97 | _xsum_nop_if_32bit: | |
98 | b L64BitPath ; use the 64-bit path (patched to nop on 32-bit machine) | |
99 | dcbt r4,r3 ; touch in 2nd cache line | |
100 | li r0,96 ; get touch offset | |
101 | b LInnerLoop32 ; enter 32-bit loop | |
102 | ||
103 | ; Inner loop for 32-bit machines. | |
104 | ||
105 | .align 4 | |
106 | LInnerLoop32: | |
107 | lwz r4,0(r3) | |
108 | lwz r6,4(r3) | |
109 | lwz r7,8(r3) | |
110 | lwz r8,12(r3) | |
111 | adde r2,r2,r4 | |
112 | lwz r9,16(r3) | |
113 | adde r2,r2,r6 | |
114 | lwz r10,20(r3) | |
115 | adde r2,r2,r7 | |
116 | lwz r11,24(r3) | |
117 | adde r2,r2,r8 | |
118 | lwz r12,28(r3) | |
119 | adde r2,r2,r9 | |
120 | dcbt r3,r0 | |
121 | adde r2,r2,r10 | |
122 | addi r3,r3,32 | |
123 | adde r2,r2,r11 | |
124 | adde r2,r2,r12 | |
125 | bdnz+ LInnerLoop32 | |
126 | ||
127 | ; Handle leftover bytes. | |
128 | ; r2 = checksum so far | |
129 | ; r3 = word aligned address | |
130 | ; r5 = accumulated sum parameter | |
131 | ; carry = live | |
132 | ; cr1_gt = "starting on odd address" flag | |
133 | ; cr6,cr7 = residual length | |
134 | ||
135 | Lleftovers: | |
136 | bf 27,Lleftover8 ; test 0x10 bit of residual length | |
137 | lwz r4,0(r3) | |
138 | lwz r6,4(r3) | |
139 | lwz r7,8(r3) | |
140 | lwz r8,12(r3) | |
141 | addi r3,r3,16 | |
142 | adde r2,r2,r4 | |
143 | adde r2,r2,r6 | |
144 | adde r2,r2,r7 | |
145 | adde r2,r2,r8 | |
146 | Lleftover8: | |
147 | bf 28,Lleftover4 | |
148 | lwz r4,0(r3) | |
149 | lwz r6,4(r3) | |
150 | addi r3,r3,8 | |
151 | adde r2,r2,r4 | |
152 | adde r2,r2,r6 | |
153 | Lleftover4: | |
154 | bf 29,Lleftover2 | |
155 | lwz r4,0(r3) | |
156 | addi r3,r3,4 | |
157 | adde r2,r2,r4 | |
158 | Lleftover2: | |
159 | bf 30,Lleftover1 | |
160 | lhz r4,0(r3) | |
161 | addi r3,r3,2 | |
162 | adde r2,r2,r4 | |
163 | Lleftover1: | |
164 | bf 31,Lwrapup | |
165 | lbz r4,0(r3) | |
166 | slwi r4,r4,8 ; shift last byte into proper lane | |
167 | adde r2,r2,r4 | |
168 | ||
169 | ; All data bytes checksummed. Wrap up. | |
170 | ; r2 = checksum so far (word parallel) | |
171 | ; r5 = accumulated sum parameter | |
172 | ; carry = live | |
173 | ; cr1_gt = "starting on odd address" flag | |
174 | ||
175 | Lwrapup: | |
176 | addze r2,r2 ; add in last carry | |
177 | addze r2,r2 ; in case the "addze" carries | |
178 | Lwrapupx: ; here from short-operand case, with xer(ca) undefined | |
179 | srwi r6,r2,16 ; top half of 32-bit checksum | |
180 | rlwinm r7,r2,0,0xFFFF ; lower half | |
181 | add r2,r6,r7 ; add them together | |
182 | srwi r6,r2,16 ; then do it again, in case first carried | |
183 | rlwinm r7,r2,0,0xFFFF | |
184 | add r2,r6,r7 | |
185 | bf cr1_gt,Lswapped ; test "starting on odd address" flag | |
186 | ||
187 | ; The checksum began on an odd address, so swap bytes. | |
188 | ||
189 | rlwinm r6,r2,24,0x00FF ; move top byte to bottom | |
190 | rlwinm r7,r2,8,0xFF00 ; bottom to top | |
191 | or r2,r6,r7 ; rejoin | |
192 | ||
193 | ; Finally, add in checksum passed in as a parameter. | |
194 | ||
195 | Lswapped: | |
196 | add r2,r2,r5 ; add passed-in checksum | |
197 | srwi r6,r2,16 ; top half of 32-bit checksum | |
198 | rlwinm r7,r2,0,0xFFFF ; lower half | |
199 | add r2,r6,r7 ; add them together | |
200 | srwi r6,r2,16 ; then do it again, in case first carried | |
201 | rlwinm r7,r2,0,0xFFFF | |
202 | add r3,r6,r7 ; steer result into r3 | |
203 | blr | |
204 | ||
205 | ; Handle short operands. Do a halfword at a time. | |
206 | ; r3 = address | |
207 | ; r4 = length (<= kShort) | |
208 | ; r5 = accumulated sum parameter | |
209 | ; r6 = "starting on odd byte" flag | |
210 | ||
211 | Lshort: | |
212 | cmpwi cr6,r4,2 ; at least two bytes? | |
213 | andi. r0,r4,1 ; odd length? | |
214 | li r2,0 ; initialize checksum | |
215 | cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set | |
216 | blt cr6,Lshort2 ; fewer than two bytes, so skip | |
217 | Lshort1: | |
218 | cmpwi cr6,r4,4 ; two more bytes (after we decrement)? | |
219 | lhz r7,0(r3) | |
220 | subi r4,r4,2 | |
221 | addi r3,r3,2 | |
222 | add r2,r2,r7 ; note no need for "adde" | |
223 | bge cr6,Lshort1 ; loop for 2 more bytes | |
224 | Lshort2: | |
225 | beq Lwrapupx ; no byte at end, proceed to checkout with carry undefined | |
226 | lbz r7,0(r3) | |
227 | slwi r7,r7,8 ; shift last byte into proper lane | |
228 | add r2,r2,r7 | |
229 | b Lwrapupx | |
230 | ||
231 | ; Handle 64-bit machine. The major improvement over the 32-bit path is that we use | |
232 | ; four parallel 32-bit accumulators, which carry into the upper half naturally so we | |
233 | ; do not have to use "adde", which serializes on the carry bit. Note that we cannot | |
234 | ; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly. | |
235 | ; r2 = checksum so far (ie, the zero-filled partial first word) | |
236 | ; r3 = word aligned address | |
237 | ; r5 = accumulated sum parameter | |
238 | ; ctr = number of 32-byte chunks of input | |
239 | ; carry = unused in this code | |
240 | ; cr1_gt = "starting on odd address" flag | |
241 | ; cr6,cr7 = residual length | |
242 | ||
243 | L64BitPath: | |
244 | stw r13,-4(r1) ; save a few nonvolatile regs in red zone so we can use them | |
245 | stw r14,-8(r1) | |
246 | stw r15,-12(r1) | |
247 | stw r16,-16(r1) | |
248 | li r0,128 ; to touch next line | |
249 | li r13,0 ; r13-r15 are the accumulators, so initialize them | |
250 | dcbt r3,r0 ; touch in next cache line, and keep loads away from the above stores | |
251 | lwz r4,0(r3) ; start pipeline by loading first 32 bytes into r4, r6-r12 | |
252 | lwz r6,4(r3) | |
253 | lwz r7,8(r3) | |
254 | mr r14,r2 ; just copy incoming partial word into one of the accumulators | |
255 | li r15,0 | |
256 | lwz r8,12(r3) | |
257 | lwz r9,16(r3) | |
258 | li r16,0 | |
259 | li r0,256 ; get touch offset | |
260 | lwz r10,20(r3) | |
261 | lwz r11,24(r3) | |
262 | lwz r12,28(r3) ; load last word of previous chunk | |
263 | addi r3,r3,32 ; skip past the chunk | |
264 | bdnz++ LInnerLoop64 ; enter loop if another chunk to go | |
265 | ||
266 | b LAddLastChunk ; only one chunk | |
267 | ||
268 | ; Inner loop for 64-bit processors. This loop is scheduled for the 970. | |
269 | ; It is pipelined (loads are one iteration ahead of adds), and unrolled. | |
270 | ; It should take 9-10 cycles per iteration, which consumes 64 bytes of input. | |
271 | ||
272 | .align 5 | |
273 | LInnerLoop64: ; 64 bytes/iteration | |
274 | add r13,r13,r4 ; cycle 1 | |
275 | add r14,r14,r6 | |
276 | dcbt r3,r0 ; touch in 2 lines ahead | |
277 | lwz r4,0(r3) | |
278 | ||
279 | add r15,r15,r7 ; cycle 2, etc | |
280 | lwz r6,4(r3) | |
281 | lwz r7,8(r3) | |
282 | add r16,r16,r8 | |
283 | ||
284 | lwz r8,12(r3) | |
285 | add r13,r13,r9 | |
286 | add r14,r14,r10 | |
287 | lwz r9,16(r3) | |
288 | ||
289 | add r15,r15,r11 | |
290 | lwz r10,20(r3) | |
291 | lwz r11,24(r3) | |
292 | add r16,r16,r12 | |
293 | bdz-- LEarlyExit ; early exit if no more chunks | |
294 | ||
295 | lwz r12,28(r3) | |
296 | add r13,r13,r4 | |
297 | add r14,r14,r6 | |
298 | lwz r4,32(r3) | |
299 | ||
300 | add r15,r15,r7 | |
301 | lwz r6,36(r3) | |
302 | lwz r7,40(r3) | |
303 | add r16,r16,r8 | |
304 | ||
305 | lwz r8,44(r3) | |
306 | add r13,r13,r9 | |
307 | add r14,r14,r10 | |
308 | lwz r9,48(r3) | |
309 | ||
310 | add r15,r15,r11 | |
311 | lwz r10,52(r3) | |
312 | lwz r11,56(r3) | |
313 | add r16,r16,r12 | |
314 | ||
315 | nop ; position last load in 2nd dispatch slot | |
316 | lwz r12,60(r3) | |
317 | addi r3,r3,64 | |
318 | bdnz++ LInnerLoop64 | |
319 | ||
320 | b LAddLastChunk | |
321 | ||
322 | ; Add in the last 32-byte chunk, and any leftover bytes. | |
323 | ; r3 = word aligned address of next byte of data | |
324 | ; r5 = accumulated sum parameter | |
325 | ; r13-r16 = the four accumulators | |
326 | ; cr1_gt = "starting on odd address" flag | |
327 | ; cr6,cr7 = residual length | |
328 | ||
329 | LEarlyExit: ; here from middle of inner loop | |
330 | lwz r12,28(r3) ; load last word of last chunk | |
331 | addi r3,r3,32 | |
332 | LAddLastChunk: ; last 32-byte chunk of input is in r4,r6-r12 | |
333 | add r13,r13,r4 ; add in last chunk | |
334 | add r14,r14,r6 ; these are 64-bit adds | |
335 | add r15,r15,r7 | |
336 | add r16,r16,r8 | |
337 | add r13,r13,r9 | |
338 | add r14,r14,r10 | |
339 | add r15,r15,r11 | |
340 | add r16,r16,r12 | |
341 | ||
342 | ; Handle leftover bytes, if any. | |
343 | ||
344 | bf 27,Lleft1 ; test 0x10 bit of residual length | |
345 | lwz r4,0(r3) | |
346 | lwz r6,4(r3) | |
347 | lwz r7,8(r3) | |
348 | lwz r8,12(r3) | |
349 | addi r3,r3,16 | |
350 | add r13,r13,r4 | |
351 | add r14,r14,r6 | |
352 | add r15,r15,r7 | |
353 | add r16,r16,r8 | |
354 | Lleft1: | |
355 | bf 28,Lleft2 | |
356 | lwz r4,0(r3) | |
357 | lwz r6,4(r3) | |
358 | addi r3,r3,8 | |
359 | add r13,r13,r4 | |
360 | add r14,r14,r6 | |
361 | Lleft2: | |
362 | bf 29,Lleft3 | |
363 | lwz r4,0(r3) | |
364 | addi r3,r3,4 | |
365 | add r14,r14,r4 | |
366 | Lleft3: | |
367 | bf 30,Lleft4 | |
368 | lhz r4,0(r3) | |
369 | addi r3,r3,2 | |
370 | add r15,r15,r4 | |
371 | Lleft4: | |
372 | bf 31,Lleft5 | |
373 | lbz r4,0(r3) | |
374 | slwi r4,r4,8 ; shift last byte into proper lane | |
375 | add r16,r16,r4 | |
376 | ||
377 | ; All data bytes have been checksummed. Now we must add together the four | |
378 | ; accumulators and restore the regs from the red zone. | |
379 | ; r3 = word aligned address of next byte of data | |
380 | ; r5 = accumulated sum parameter | |
381 | ; r13-r16 = the four accumulators | |
382 | ; carry = not used so far | |
383 | ; cr1_gt = "starting on odd address" flag | |
384 | ||
385 | Lleft5: | |
386 | add r8,r13,r14 ; add the four accumulators together | |
387 | add r9,r15,r16 | |
388 | lwz r13,-4(r1) ; start to restore nonvolatiles from red zone | |
389 | lwz r14,-8(r1) | |
390 | add r8,r8,r9 ; now r8 is 64-bit sum of the four accumulators | |
391 | lwz r15,-12(r1) | |
392 | lwz r16,-16(r1) | |
393 | srdi r7,r8,32 ; get upper half of 64-bit sum | |
394 | addc r2,r7,r8 ; finally, do a 32-bit add of the two halves of r8 (setting carry) | |
395 | b Lwrapup ; merge r2, r5, and carry into a 16-bit checksum |