/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#define kShort  11
#define cr1_gt  5                   // bit 1 of cr1

/*
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 *      r3 - Pointer to data
 *      r4 - Length of data
 *      r5 - Accumulated sum value
 *      r6 - "Starting on odd address" flag (relative to byte 0 of the checksummed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
 * correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers. 1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC. Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit. On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64 bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set on the low 32 bits of the sum).
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably makes it
 * impractical for now.
 */
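/*
 * For reference, a portable C sketch of the same computation (an illustration
 * added here, not part of the original source; the name "xsum_ref" and its
 * types are hypothetical, and it ignores the parallel-accumulation tricks the
 * assembly uses for speed):
 *
 *      static unsigned short
 *      xsum_ref(const unsigned char *p, int len, unsigned short xsum, int odd)
 *      {
 *          unsigned long sum = 0;
 *          int i;
 *
 *          for (i = 0; i + 1 < len; i += 2)        // big-endian 16-bit words
 *              sum += ((unsigned long)p[i] << 8) | p[i + 1];
 *          if (len & 1)                            // lone last byte fills the high lane
 *              sum += (unsigned long)p[len - 1] << 8;
 *          while (sum >> 16)                       // fold carries back in (1s-complement)
 *              sum = (sum & 0xFFFF) + (sum >> 16);
 *          if (odd)                                // started on odd address: swap byte lanes
 *              sum = ((sum >> 8) & 0xFF) | ((sum & 0xFF) << 8);
 *          sum += xsum;                            // add caller's accumulated sum
 *          while (sum >> 16)                       // and fold once more
 *              sum = (sum & 0xFFFF) + (sum >> 16);
 *          return (unsigned short)sum;
 *      }
 */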
        .globl  _xsum_assym
        .globl  _xsum_nop_if_32bit
        .text
        .align  5
_xsum_assym:
        cmplwi  cr0,r4,kShort       ; too short to word align?
        rlwinm  r2,r3,0,0x3         ; get byte offset in word
        dcbt    0,r3                ; touch in 1st cache line
        cmpwi   cr6,r2,0            ; is address word aligned?
        ble     cr0,Lshort          ; skip if too short to bother aligning

        subfic  r0,r2,4             ; get #bytes in partial word
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0             ; turn off carry
        beq     cr6,Laligned        ; skip if already word aligned (r2==0 if aligned)

; Partial word at start: zero filled on left, it becomes initial checksum.

        rlwinm  r3,r3,0,0,29        ; word align address
        mtcrf   0x01,r2             ; move byte offset to cr7
        lwz     r6,0(r3)            ; get partial word
        li      r7,-1               ; start of mask for partial fill
        slwi    r8,r2,3             ; multiply byte offset by 8
        sub     r4,r4,r0            ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt    ; set flag if byte-lane swap will be necessary
        srw     r7,r7,r8            ; get mask for bytes to keep in partial word
        addi    r3,r3,4             ; point to next word of input
        and     r2,r6,r7            ; zero fill on left

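; In C terms, the partial first word is masked on the left, and the byte-lane
; swap flag is flipped when the byte offset is odd (a sketch for illustration
; only; the variable names are hypothetical):
;
;       unsigned offset = addr & 3;                     // byte offset in word
;       unsigned mask   = 0xFFFFFFFFu >> (offset * 8);  // keep right-hand bytes
;       unsigned sum    = *(const uint32_t *)(addr & ~3u) & mask;
;       odd ^= offset & 1;                              // odd offset swaps 16-bit lanes
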
; Address is now word aligned. Prepare for inner loop over 32-byte chunks.
;   r2 = initial checksum
;   r3 = word aligned address
;   r4 = length remaining
;   r5 = accumulated sum parameter
;   carry = off
;   cr1_gt = "starting on odd address" flag

Laligned:
        srwi.   r0,r4,5             ; get count of 32-byte chunks
        mtcrf   0x02,r4             ; move residual length to cr6 and cr7
        mtcrf   0x01,r4
        beq     cr0,Lleftovers      ; no chunks

        mtctr   r0                  ; set up loop count
        li      r4,32               ; offset to next chunk
_xsum_nop_if_32bit:
        b       L64BitPath          ; use the 64-bit path (patched to nop on 32-bit machine)
        dcbt    r4,r3               ; touch in 2nd cache line
        li      r0,96               ; get touch offset
        b       LInnerLoop32        ; enter 32-bit loop

; Inner loop for 32-bit machines.

        .align  4
LInnerLoop32:
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        adde    r2,r2,r4
        lwz     r9,16(r3)
        adde    r2,r2,r6
        lwz     r10,20(r3)
        adde    r2,r2,r7
        lwz     r11,24(r3)
        adde    r2,r2,r8
        lwz     r12,28(r3)
        adde    r2,r2,r9
        dcbt    r3,r0
        adde    r2,r2,r10
        addi    r3,r3,32
        adde    r2,r2,r11
        adde    r2,r2,r12
        bdnz+   LInnerLoop32

; Handle leftover bytes.
;   r2 = checksum so far
;   r3 = word aligned address
;   r5 = accumulated sum parameter
;   carry = live
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length

Lleftovers:
        bf      27,Lleftover8       ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        adde    r2,r2,r4
        adde    r2,r2,r6
        adde    r2,r2,r7
        adde    r2,r2,r8
Lleftover8:
        bf      28,Lleftover4
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        adde    r2,r2,r4
        adde    r2,r2,r6
Lleftover4:
        bf      29,Lleftover2
        lwz     r4,0(r3)
        addi    r3,r3,4
        adde    r2,r2,r4
Lleftover2:
        bf      30,Lleftover1
        lhz     r4,0(r3)
        addi    r3,r3,2
        adde    r2,r2,r4
Lleftover1:
        bf      31,Lwrapup
        lbz     r4,0(r3)
        slwi    r4,r4,8             ; shift last byte into proper lane
        adde    r2,r2,r4

; All data bytes checksummed. Wrap up.
;   r2 = checksum so far (word parallel)
;   r5 = accumulated sum parameter
;   carry = live
;   cr1_gt = "starting on odd address" flag

Lwrapup:
        addze   r2,r2               ; add in last carry
        addze   r2,r2               ; in case the "addze" carries
Lwrapupx:                           ; here from short-operand case, with xer(ca) undefined
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7
        bf      cr1_gt,Lswapped     ; test "starting on odd address" flag

; The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF     ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00      ; bottom to top
        or      r2,r6,r7            ; rejoin

; Finally, add in checksum passed in as a parameter.

Lswapped:
        add     r2,r2,r5            ; add passed-in checksum
        srwi    r6,r2,16            ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF      ; lower half
        add     r2,r6,r7            ; add them together
        srwi    r6,r2,16            ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7            ; steer result into r3
        blr

; Handle short operands. Do a halfword at a time.
;   r3 = address
;   r4 = length (<= kShort)
;   r5 = accumulated sum parameter
;   r6 = "starting on odd byte" flag

Lshort:
        cmpwi   cr6,r4,2            ; at least two bytes?
        andi.   r0,r4,1             ; odd length?
        li      r2,0                ; initialize checksum
        cmplwi  cr1,r6,0            ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2         ; fewer than two bytes, so skip
Lshort1:
        cmpwi   cr6,r4,4            ; two more bytes (after we decrement)?
        lhz     r7,0(r3)
        subi    r4,r4,2
        addi    r3,r3,2
        add     r2,r2,r7            ; no need for "adde": so few 16-bit adds cannot carry out of 32 bits
        bge     cr6,Lshort1         ; loop for 2 more bytes
Lshort2:
        beq     Lwrapupx            ; no byte at end, proceed to wrap-up with carry undefined
        lbz     r7,0(r3)
        slwi    r7,r7,8             ; shift last byte into proper lane
        add     r2,r2,r7
        b       Lwrapupx

; Handle 64-bit machine. The major improvement over the 32-bit path is that we use
; four parallel 32-bit accumulators, which carry into the upper half naturally so we
; do not have to use "adde", which serializes on the carry bit. Note that we cannot
; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
;   r2 = checksum so far (ie, the zero-filled partial first word)
;   r3 = word aligned address
;   r5 = accumulated sum parameter
;   ctr = number of 32-byte chunks of input
;   carry = unused in this code
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length
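/*
 * The four-accumulator scheme, sketched in C (an illustration added here, not
 * part of the original source; "sum_chunks64" and its types are hypothetical,
 * assuming 64-bit accumulators wide enough that 32-bit adds never lose carries):
 *
 *      #include <stdint.h>
 *
 *      static uint64_t
 *      sum_chunks64(const uint32_t *p, unsigned nchunks)
 *      {
 *          uint64_t a0 = 0, a1 = 0, a2 = 0, a3 = 0;    // four independent dependency chains
 *
 *          while (nchunks--) {                         // 32 bytes per chunk
 *              a0 += p[0]; a1 += p[1]; a2 += p[2]; a3 += p[3];
 *              a0 += p[4]; a1 += p[5]; a2 += p[6]; a3 += p[7];
 *              p += 8;
 *          }
 *          uint64_t s = a0 + a1 + a2 + a3;             // carries accumulate in the high half
 *          while (s >> 32)                             // fold the 64-bit sum into 32 bits
 *              s = (s >> 32) + (s & 0xFFFFFFFFu);
 *          return s;                                   // Lwrapup folds this to 16 bits
 *      }
 */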

L64BitPath:
        stw     r13,-4(r1)          ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128              ; to touch next line
        li      r13,0               ; r13-r16 are the accumulators, so initialize them
        dcbt    r3,r0               ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)            ; start pipeline by loading first 32 bytes into r4, r6-r12
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        mr      r14,r2              ; just copy incoming partial word into one of the accumulators
        li      r15,0
        lwz     r8,12(r3)
        lwz     r9,16(r3)
        li      r16,0
        li      r0,256              ; get touch offset
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        lwz     r12,28(r3)          ; load last word of previous chunk
        addi    r3,r3,32            ; skip past the chunk
        bdnz++  LInnerLoop64        ; enter loop if another chunk to go

        b       LAddLastChunk       ; only one chunk

; Inner loop for 64-bit processors. This loop is scheduled for the 970.
; It is pipelined (loads are one iteration ahead of adds), and unrolled.
; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.

        .align  5
LInnerLoop64:                       ; 64 bytes/iteration
        add     r13,r13,r4          ; cycle 1
        add     r14,r14,r6
        dcbt    r3,r0               ; touch in 2 lines ahead
        lwz     r4,0(r3)

        add     r15,r15,r7          ; cycle 2, etc
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        add     r16,r16,r8

        lwz     r8,12(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,16(r3)

        add     r15,r15,r11
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        add     r16,r16,r12
        bdz--   LEarlyExit          ; early exit if no more chunks

        lwz     r12,28(r3)
        add     r13,r13,r4
        add     r14,r14,r6
        lwz     r4,32(r3)

        add     r15,r15,r7
        lwz     r6,36(r3)
        lwz     r7,40(r3)
        add     r16,r16,r8

        lwz     r8,44(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,48(r3)

        add     r15,r15,r11
        lwz     r10,52(r3)
        lwz     r11,56(r3)
        add     r16,r16,r12

        nop                         ; position last load in 2nd dispatch slot
        lwz     r12,60(r3)
        addi    r3,r3,64
        bdnz++  LInnerLoop64

        b       LAddLastChunk

; Add in the last 32-byte chunk, and any leftover bytes.
;   r3 = word aligned address of next byte of data
;   r5 = accumulated sum parameter
;   r13-r16 = the four accumulators
;   cr1_gt = "starting on odd address" flag
;   cr6,cr7 = residual length

LEarlyExit:                         ; here from middle of inner loop
        lwz     r12,28(r3)          ; load last word of last chunk
        addi    r3,r3,32
LAddLastChunk:                      ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4          ; add in last chunk
        add     r14,r14,r6          ; these are 64-bit adds
        add     r15,r15,r7
        add     r16,r16,r8
        add     r13,r13,r9
        add     r14,r14,r10
        add     r15,r15,r11
        add     r16,r16,r12

; Handle leftover bytes, if any.

        bf      27,Lleft1           ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        add     r13,r13,r4
        add     r14,r14,r6
        add     r15,r15,r7
        add     r16,r16,r8
Lleft1:
        bf      28,Lleft2
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        add     r13,r13,r4
        add     r14,r14,r6
Lleft2:
        bf      29,Lleft3
        lwz     r4,0(r3)
        addi    r3,r3,4
        add     r14,r14,r4
Lleft3:
        bf      30,Lleft4
        lhz     r4,0(r3)
        addi    r3,r3,2
        add     r15,r15,r4
Lleft4:
        bf      31,Lleft5
        lbz     r4,0(r3)
        slwi    r4,r4,8             ; shift last byte into proper lane
        add     r16,r16,r4

; All data bytes have been checksummed. Now we must add together the four
; accumulators and restore the regs from the red zone.
;   r3 = word aligned address of next byte of data
;   r5 = accumulated sum parameter
;   r13-r16 = the four accumulators
;   carry = not used so far
;   cr1_gt = "starting on odd address" flag

Lleft5:
        add     r8,r13,r14          ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)          ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9            ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32            ; get upper half of 64-bit sum
        addc    r2,r7,r8            ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup             ; merge r2, r5, and carry into a 16-bit checksum
392 | add r8,r13,r14 ; add the four accumulators together | |
393 | add r9,r15,r16 | |
394 | lwz r13,-4(r1) ; start to restore nonvolatiles from red zone | |
395 | lwz r14,-8(r1) | |
396 | add r8,r8,r9 ; now r8 is 64-bit sum of the four accumulators | |
397 | lwz r15,-12(r1) | |
398 | lwz r16,-16(r1) | |
399 | srdi r7,r8,32 ; get upper half of 64-bit sum | |
400 | addc r2,r7,r8 ; finally, do a 32-bit add of the two halves of r8 (setting carry) | |
401 | b Lwrapup ; merge r2, r5, and carry into a 16-bit checksum |