[apple/xnu.git] / bsd / dev / ppc / xsumas.s

/*
 * Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_LICENSE_HEADER_END@
 */

#define kShort  11
#define cr1_gt  5       // bit 1 of cr1

/*
 * short xsum_assym( short *p, int len, short xsum, boolean odd);
 *
 *  r3 - Pointer to data
 *  r4 - Length of data
 *  r5 - Accumulated sum value
 *  r6 -"Starting on odd address" flag (relative to byte 0 of the checksumed data)
 *
 * Note: If the "odd" flag is set, the address in r3 will be even.  Nonetheless, we
 *       correctly handle the case where the flag is set and the address is odd.
 *
 * This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
 * of the data, treated as an array of 16-bit integers.  1s-complement sums are done
 * via "add with carry" operations on a 2s-complement machine like PPC.  Note that
 * the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
 * final sum is folded down to 16 bits.  On 32-bit machines we use "adde", which is
 * perfect except that it serializes the adds on the carry bit.  On 64-bit machines
 * we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
 * all 64-bits into a 16-bit sum at the end.  We cannot use "adde" on 64-bit sums,
 * because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
 * is set on the low 32-bits of the sum.)
 *
 * Using Altivec is tempting, but the performance impact of the greatly increased
 * number of exceptions and register save/restore traffic probably make it impractical
 * for now.
 */        
        .globl  _xsum_assym
        .globl  _xsum_nop_if_32bit
        .text
        .align  5
_xsum_assym:
        cmplwi  cr0,r4,kShort   ; too short to word align?
        rlwinm  r2,r3,0,0x3     ; get byte offset in word
        dcbt    0,r3            ; touch in 1st cache line
        cmpwi   cr6,r2,0        ; is address word aligned?
        ble     cr0,Lshort      ; skip if too short to bother aligning
        
        subfic  r0,r2,4         ; get #bytes in partial word
        cmplwi  cr1,r6,0        ; set cr1_gt if "starting on odd address" flag is set
        addic   r0,r0,0         ; turn off carry
        beq     cr6,Laligned    ; skip if already word aligned (r2==0 if aligned)
        
;       Partial word at start: zero filled on left, it becomes initial checksum.
        
        rlwinm  r3,r3,0,0,29    ; word align address
        mtcrf   0x01,r2         ; move byte offset to cr7
        lwz     r6,0(r3)        ; get partial word
        li      r7,-1           ; start of mask for partial fill
        slwi    r8,r2,3         ; multiply byte offset by 8
        sub     r4,r4,r0        ; adjust length for bytes in partial word
        crxor   cr1_gt,31,cr1_gt; set flag if byte-lane swap will be necessary
        srw     r7,r7,r8        ; get mask for bytes to keep in partial word
        addi    r3,r3,4         ; point to next word of input
        and     r2,r6,r7        ; zero fill on left
        
;       Address is now word aligned.  Prepare for inner loop over 32-byte chunks.
;           r2 = initial checksum
;           r3 = word aligned address
;           r4 = length remaining
;           r5 = accumulated sum parameter
;        carry = off
;       cr1_gt = "starting on odd address" flag

Laligned:
        srwi.   r0,r4,5         ; get count of 32-byte chunks
        mtcrf   0x02,r4         ; move residual length to cr6 and cr7
        mtcrf   0x01,r4
        beq     cr0,Lleftovers  ; no chunks
        
        mtctr   r0              ; set up loop count
        li      r4,32           ; offset to next chunk
_xsum_nop_if_32bit:
        b       L64BitPath      ; use the 64-bit path (patched to nop on 32-bit machine)
        dcbt    r4,r3           ; touch in 2nd cache line
        li      r0,96           ; get touch offset
        b       LInnerLoop32    ; enter 32-bit loop
        
;       Inner loop for 32-bit machines.

        .align  4
LInnerLoop32:
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        adde    r2,r2,r4
        lwz     r9,16(r3)
        adde    r2,r2,r6
        lwz     r10,20(r3)
        adde    r2,r2,r7
        lwz     r11,24(r3)
        adde    r2,r2,r8
        lwz     r12,28(r3)
        adde    r2,r2,r9
        dcbt    r3,r0
        adde    r2,r2,r10
        addi    r3,r3,32
        adde    r2,r2,r11
        adde    r2,r2,r12
        bdnz+   LInnerLoop32

;       Handle leftover bytes.
;           r2 = checksum so far
;           r3 = word aligned address
;           r5 = accumulated sum parameter
;        carry = live
;       cr1_gt = "starting on odd address" flag
;      cr6,cr7 = residual length

Lleftovers:
        bf      27,Lleftover8   ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        adde    r2,r2,r4
        adde    r2,r2,r6
        adde    r2,r2,r7
        adde    r2,r2,r8
Lleftover8:
        bf      28,Lleftover4
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        adde    r2,r2,r4
        adde    r2,r2,r6
Lleftover4:
        bf      29,Lleftover2
        lwz     r4,0(r3)
        addi    r3,r3,4
        adde    r2,r2,r4
Lleftover2:
        bf      30,Lleftover1
        lhz     r4,0(r3)
        addi    r3,r3,2
        adde    r2,r2,r4
Lleftover1:
        bf      31,Lwrapup
        lbz     r4,0(r3)
        slwi    r4,r4,8         ; shift last byte into proper lane
        adde    r2,r2,r4

;       All data bytes checksummed.  Wrap up.
;           r2 = checksum so far (word parallel)
;           r5 = accumulated sum parameter
;        carry = live
;       cr1_gt = "starting on odd address" flag

Lwrapup:
        addze   r2,r2           ; add in last carry
        addze   r2,r2           ; in case the "addze" carries
Lwrapupx:                       ; here from short-operand case, with xer(ca) undefined
        srwi    r6,r2,16        ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF  ; lower half
        add     r2,r6,r7        ; add them together
        srwi    r6,r2,16        ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r2,r6,r7
        bf      cr1_gt,Lswapped ; test "starting on odd address" flag
        
;       The checksum began on an odd address, so swap bytes.

        rlwinm  r6,r2,24,0x00FF ; move top byte to bottom
        rlwinm  r7,r2,8,0xFF00  ; bottom to top
        or      r2,r6,r7        ; rejoin
        
;       Finally, add in checksum passed in as a parameter.

Lswapped:
        add     r2,r2,r5        ; add passed-in checksum
        srwi    r6,r2,16        ; top half of 32-bit checksum
        rlwinm  r7,r2,0,0xFFFF  ; lower half
        add     r2,r6,r7        ; add them together
        srwi    r6,r2,16        ; then do it again, in case first carried
        rlwinm  r7,r2,0,0xFFFF
        add     r3,r6,r7        ; steer result into r3
        blr

;       Handle short operands.  Do a halfword at a time.
;           r3 = address
;           r4 = length (<= kShort)
;           r5 = accumulated sum parameter
;           r6 = "starting on odd byte" flag

Lshort:
        cmpwi   cr6,r4,2        ; at least two bytes?
        andi.   r0,r4,1         ; odd length?
        li      r2,0            ; initialize checksum
        cmplwi  cr1,r6,0        ; set cr1_gt if "starting on odd address" flag is set
        blt     cr6,Lshort2     ; fewer than two bytes, so skip
Lshort1:
        cmpwi   cr6,r4,4        ; two more bytes (after we decrement)?
        lhz     r7,0(r3)
        subi    r4,r4,2
        addi    r3,r3,2
        add     r2,r2,r7        ; note no need for "adde"
        bge     cr6,Lshort1     ; loop for 2 more bytes
Lshort2:
        beq     Lwrapupx        ; no byte at end, proceed to checkout with carry undefined
        lbz     r7,0(r3)
        slwi    r7,r7,8         ; shift last byte into proper lane
        add     r2,r2,r7
        b       Lwrapupx
        
;       Handle 64-bit machine.  The major improvement over the 32-bit path is that we use
;       four parallel 32-bit accumulators, which carry into the upper half naturally so we
;       do not have to use "adde", which serializes on the carry bit.  Note that we cannot
;       do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
;           r2 = checksum so far (ie, the zero-filled partial first word)
;           r3 = word aligned address
;           r5 = accumulated sum parameter
;          ctr = number of 32-byte chunks of input
;        carry = unused in this code
;       cr1_gt = "starting on odd address" flag
;      cr6,cr7 = residual length

L64BitPath:
        stw     r13,-4(r1)      ; save a few nonvolatile regs in red zone so we can use them
        stw     r14,-8(r1)
        stw     r15,-12(r1)
        stw     r16,-16(r1)
        li      r0,128          ; to touch next line
        li      r13,0           ; r13-r15 are the accumulators, so initialize them
        dcbt    r3,r0           ; touch in next cache line, and keep loads away from the above stores
        lwz     r4,0(r3)        ; start pipeline by loading first 32 bytes into r4, r6-r12
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        mr      r14,r2          ; just copy incoming partial word into one of the accumulators
        li      r15,0
        lwz     r8,12(r3)
        lwz     r9,16(r3)
        li      r16,0
        li      r0,256          ; get touch offset
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        lwz     r12,28(r3)      ; load last word of previous chunk
        addi    r3,r3,32        ; skip past the chunk
        bdnz++  LInnerLoop64    ; enter loop if another chunk to go
        
        b       LAddLastChunk   ; only one chunk
        
;       Inner loop for 64-bit processors.  This loop is scheduled for the 970.
;       It is pipelined (loads are one iteration ahead of adds), and unrolled.
;       It should take 9-10 cycles per iteration, which consumes 64 bytes of input.

        .align  5
LInnerLoop64:                   ; 64 bytes/iteration
        add     r13,r13,r4      ; cycle 1
        add     r14,r14,r6
        dcbt    r3,r0           ; touch in 2 lines ahead
        lwz     r4,0(r3)
        
        add     r15,r15,r7      ; cycle 2, etc
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        add     r16,r16,r8
        
        lwz     r8,12(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,16(r3)
        
        add     r15,r15,r11
        lwz     r10,20(r3)
        lwz     r11,24(r3)
        add     r16,r16,r12
        bdz--   LEarlyExit      ; early exit if no more chunks
        
        lwz     r12,28(r3)
        add     r13,r13,r4
        add     r14,r14,r6
        lwz     r4,32(r3)
        
        add     r15,r15,r7
        lwz     r6,36(r3)
        lwz     r7,40(r3)
        add     r16,r16,r8
        
        lwz     r8,44(r3)
        add     r13,r13,r9
        add     r14,r14,r10
        lwz     r9,48(r3)
        
        add     r15,r15,r11
        lwz     r10,52(r3)
        lwz     r11,56(r3)
        add     r16,r16,r12
        
        nop                     ; position last load in 2nd dispatch slot
        lwz     r12,60(r3)
        addi    r3,r3,64
        bdnz++  LInnerLoop64
        
        b       LAddLastChunk

;       Add in the last 32-byte chunk, and any leftover bytes.
;           r3 = word aligned address of next byte of data
;           r5 = accumulated sum parameter
;      r13-r16 = the four accumulators
;       cr1_gt = "starting on odd address" flag
;      cr6,cr7 = residual length

LEarlyExit:                     ; here from middle of inner loop
        lwz     r12,28(r3)      ; load last word of last chunk
        addi    r3,r3,32
LAddLastChunk:                  ; last 32-byte chunk of input is in r4,r6-r12
        add     r13,r13,r4      ; add in last chunk
        add     r14,r14,r6      ; these are 64-bit adds
        add     r15,r15,r7
        add     r16,r16,r8
        add     r13,r13,r9
        add     r14,r14,r10
        add     r15,r15,r11
        add     r16,r16,r12

;       Handle leftover bytes, if any.

        bf      27,Lleft1       ; test 0x10 bit of residual length
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        lwz     r7,8(r3)
        lwz     r8,12(r3)
        addi    r3,r3,16
        add     r13,r13,r4
        add     r14,r14,r6
        add     r15,r15,r7
        add     r16,r16,r8
Lleft1:
        bf      28,Lleft2
        lwz     r4,0(r3)
        lwz     r6,4(r3)
        addi    r3,r3,8
        add     r13,r13,r4
        add     r14,r14,r6
Lleft2:
        bf      29,Lleft3
        lwz     r4,0(r3)
        addi    r3,r3,4
        add     r14,r14,r4
Lleft3:
        bf      30,Lleft4
        lhz     r4,0(r3)
        addi    r3,r3,2
        add     r15,r15,r4
Lleft4:
        bf      31,Lleft5
        lbz     r4,0(r3)
        slwi    r4,r4,8         ; shift last byte into proper lane
        add     r16,r16,r4

;       All data bytes have been checksummed.  Now we must add together the four
;       accumulators and restore the regs from the red zone.
;           r3 = word aligned address of next byte of data
;           r5 = accumulated sum parameter
;      r13-r16 = the four accumulators
;        carry = not used so far
;       cr1_gt = "starting on odd address" flag

Lleft5:
        add     r8,r13,r14      ; add the four accumulators together
        add     r9,r15,r16
        lwz     r13,-4(r1)      ; start to restore nonvolatiles from red zone
        lwz     r14,-8(r1)
        add     r8,r8,r9        ; now r8 is 64-bit sum of the four accumulators
        lwz     r15,-12(r1)
        lwz     r16,-16(r1)
        srdi    r7,r8,32        ; get upper half of 64-bit sum
        addc    r2,r7,r8        ; finally, do a 32-bit add of the two halves of r8 (setting carry)
        b       Lwrapup         ; merge r2, r5, and carry into a 16-bit checksum
Commit	Line	Data
1c79356b	1	/*
91447636	2	* Copyright (c) 2000-2004 Apple Computer, Inc. All rights reserved.
1c79356b A	3	*
	4	* @APPLE_LICENSE_HEADER_START@
	5	*
ff6e181a A	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. Please obtain a copy of the License at
	10	* http://www.opensource.apple.com/apsl/ and read it before using this
	11	* file.
1c79356b	12	*
ff6e181a A	13	* The Original Code and all software distributed under the License are
ff6e181a A	14	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
1c79356b A	15	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
1c79356b A	16	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
ff6e181a A	17	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	18	* Please see the License for the specific language governing rights and
	19	* limitations under the License.
1c79356b A	20	*
	21	* @APPLE_LICENSE_HEADER_END@
	22	*/
91447636 A	23
	24	#define kShort 11
	25	#define cr1_gt 5 // bit 1 of cr1
1c79356b A	26
1c79356b A	27	/*
91447636 A	28	* short xsum_assym( short *p, int len, short xsum, boolean odd);
	29	*
	30	* r3 - Pointer to data
	31	* r4 - Length of data
	32	* r5 - Accumulated sum value
	33	* r6 -"Starting on odd address" flag (relative to byte 0 of the checksumed data)
	34	*
	35	* Note: If the "odd" flag is set, the address in r3 will be even. Nonetheless, we
	36	* correctly handle the case where the flag is set and the address is odd.
	37	*
	38	* This is the internet (IP, TCP) checksum algorithm, which is the 1s-complement sum
	39	* of the data, treated as an array of 16-bit integers. 1s-complement sums are done
	40	* via "add with carry" operations on a 2s-complement machine like PPC. Note that
	41	* the adds can be done in parallel on 32-bit (or 64-bit) registers, as long as the
	42	* final sum is folded down to 16 bits. On 32-bit machines we use "adde", which is
	43	* perfect except that it serializes the adds on the carry bit. On 64-bit machines
	44	* we avoid this serialization by adding 32-bit words into 64-bit sums, then folding
	45	* all 64-bits into a 16-bit sum at the end. We cannot use "adde" on 64-bit sums,
	46	* because the kernel runs in 32-bit mode even on 64-bit machines (so the carry bit
	47	* is set on the low 32-bits of the sum.)
	48	*
	49	* Using Altivec is tempting, but the performance impact of the greatly increased
	50	* number of exceptions and register save/restore traffic probably make it impractical
	51	* for now.
	52	*/
	53	.globl _xsum_assym
	54	.globl _xsum_nop_if_32bit
	55	.text
	56	.align 5
	57	_xsum_assym:
	58	cmplwi cr0,r4,kShort ; too short to word align?
	59	rlwinm r2,r3,0,0x3 ; get byte offset in word
	60	dcbt 0,r3 ; touch in 1st cache line
	61	cmpwi cr6,r2,0 ; is address word aligned?
	62	ble cr0,Lshort ; skip if too short to bother aligning
	63
	64	subfic r0,r2,4 ; get #bytes in partial word
	65	cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set
	66	addic r0,r0,0 ; turn off carry
	67	beq cr6,Laligned ; skip if already word aligned (r2==0 if aligned)
	68
	69	; Partial word at start: zero filled on left, it becomes initial checksum.
	70
	71	rlwinm r3,r3,0,0,29 ; word align address
	72	mtcrf 0x01,r2 ; move byte offset to cr7
	73	lwz r6,0(r3) ; get partial word
	74	li r7,-1 ; start of mask for partial fill
	75	slwi r8,r2,3 ; multiply byte offset by 8
	76	sub r4,r4,r0 ; adjust length for bytes in partial word
	77	crxor cr1_gt,31,cr1_gt; set flag if byte-lane swap will be necessary
	78	srw r7,r7,r8 ; get mask for bytes to keep in partial word
	79	addi r3,r3,4 ; point to next word of input
	80	and r2,r6,r7 ; zero fill on left
	81
	82	; Address is now word aligned. Prepare for inner loop over 32-byte chunks.
	83	; r2 = initial checksum
	84	; r3 = word aligned address
	85	; r4 = length remaining
	86	; r5 = accumulated sum parameter
	87	; carry = off
	88	; cr1_gt = "starting on odd address" flag
	89
	90	Laligned:
	91	srwi. r0,r4,5 ; get count of 32-byte chunks
92	mtcrf 0x02,r4 ; move residual length to cr6 and cr7
93	mtcrf 0x01,r4
94	beq cr0,Lleftovers ; no chunks
95
96	mtctr r0 ; set up loop count
97	li r4,32 ; offset to next chunk
98	_xsum_nop_if_32bit:
99	b L64BitPath ; use the 64-bit path (patched to nop on 32-bit machine)
100	dcbt r4,r3 ; touch in 2nd cache line
101	li r0,96 ; get touch offset
102	b LInnerLoop32 ; enter 32-bit loop
103
104	; Inner loop for 32-bit machines.
105
106	.align 4
107	LInnerLoop32:
108	lwz r4,0(r3)
109	lwz r6,4(r3)
110	lwz r7,8(r3)
111	lwz r8,12(r3)
112	adde r2,r2,r4
113	lwz r9,16(r3)
114	adde r2,r2,r6
115	lwz r10,20(r3)
116	adde r2,r2,r7
117	lwz r11,24(r3)
118	adde r2,r2,r8
119	lwz r12,28(r3)
120	adde r2,r2,r9
121	dcbt r3,r0
122	adde r2,r2,r10
123	addi r3,r3,32
124	adde r2,r2,r11
125	adde r2,r2,r12
126	bdnz+ LInnerLoop32
127
128	; Handle leftover bytes.
129	; r2 = checksum so far
130	; r3 = word aligned address
131	; r5 = accumulated sum parameter
132	; carry = live
133	; cr1_gt = "starting on odd address" flag
134	; cr6,cr7 = residual length
135
136	Lleftovers:
137	bf 27,Lleftover8 ; test 0x10 bit of residual length
138	lwz r4,0(r3)
139	lwz r6,4(r3)
140	lwz r7,8(r3)
141	lwz r8,12(r3)
142	addi r3,r3,16
143	adde r2,r2,r4
144	adde r2,r2,r6
145	adde r2,r2,r7
146	adde r2,r2,r8
147	Lleftover8:
148	bf 28,Lleftover4
149	lwz r4,0(r3)
150	lwz r6,4(r3)
151	addi r3,r3,8
152	adde r2,r2,r4
153	adde r2,r2,r6
154	Lleftover4:
155	bf 29,Lleftover2
156	lwz r4,0(r3)
157	addi r3,r3,4
158	adde r2,r2,r4
159	Lleftover2:
160	bf 30,Lleftover1
161	lhz r4,0(r3)
162	addi r3,r3,2
163	adde r2,r2,r4
164	Lleftover1:
165	bf 31,Lwrapup
166	lbz r4,0(r3)
167	slwi r4,r4,8 ; shift last byte into proper lane
168	adde r2,r2,r4
169
170	; All data bytes checksummed. Wrap up.
171	; r2 = checksum so far (word parallel)
172	; r5 = accumulated sum parameter
173	; carry = live
174	; cr1_gt = "starting on odd address" flag
175
176	Lwrapup:
177	addze r2,r2 ; add in last carry
178	addze r2,r2 ; in case the "addze" carries
179	Lwrapupx: ; here from short-operand case, with xer(ca) undefined
180	srwi r6,r2,16 ; top half of 32-bit checksum
181	rlwinm r7,r2,0,0xFFFF ; lower half
182	add r2,r6,r7 ; add them together
183	srwi r6,r2,16 ; then do it again, in case first carried
184	rlwinm r7,r2,0,0xFFFF
185	add r2,r6,r7
186	bf cr1_gt,Lswapped ; test "starting on odd address" flag
187
188	; The checksum began on an odd address, so swap bytes.
189
190	rlwinm r6,r2,24,0x00FF ; move top byte to bottom
191	rlwinm r7,r2,8,0xFF00 ; bottom to top
192	or r2,r6,r7 ; rejoin
193
194	; Finally, add in checksum passed in as a parameter.
195
196	Lswapped:
197	add r2,r2,r5 ; add passed-in checksum
198	srwi r6,r2,16 ; top half of 32-bit checksum
199	rlwinm r7,r2,0,0xFFFF ; lower half
200	add r2,r6,r7 ; add them together
201	srwi r6,r2,16 ; then do it again, in case first carried
202	rlwinm r7,r2,0,0xFFFF
203	add r3,r6,r7 ; steer result into r3
204	blr
205
206	; Handle short operands. Do a halfword at a time.
207	; r3 = address
208	; r4 = length (<= kShort)
209	; r5 = accumulated sum parameter
210	; r6 = "starting on odd byte" flag
211
212	Lshort:
213	cmpwi cr6,r4,2 ; at least two bytes?
214	andi. r0,r4,1 ; odd length?
215	li r2,0 ; initialize checksum
216	cmplwi cr1,r6,0 ; set cr1_gt if "starting on odd address" flag is set
217	blt cr6,Lshort2 ; fewer than two bytes, so skip
218	Lshort1:
219	cmpwi cr6,r4,4 ; two more bytes (after we decrement)?
220	lhz r7,0(r3)
221	subi r4,r4,2
222	addi r3,r3,2
223	add r2,r2,r7 ; note no need for "adde"
224	bge cr6,Lshort1 ; loop for 2 more bytes
225	Lshort2:
226	beq Lwrapupx ; no byte at end, proceed to checkout with carry undefined
227	lbz r7,0(r3)
228	slwi r7,r7,8 ; shift last byte into proper lane
229	add r2,r2,r7
230	b Lwrapupx
231
232	; Handle 64-bit machine. The major improvement over the 32-bit path is that we use
233	; four parallel 32-bit accumulators, which carry into the upper half naturally so we
234	; do not have to use "adde", which serializes on the carry bit. Note that we cannot
235	; do 64-bit "adde"s, because we run in 32-bit mode so carry would not be set correctly.
236	; r2 = checksum so far (ie, the zero-filled partial first word)
237	; r3 = word aligned address
238	; r5 = accumulated sum parameter
239	; ctr = number of 32-byte chunks of input
240	; carry = unused in this code
241	; cr1_gt = "starting on odd address" flag
242	; cr6,cr7 = residual length
243
244	L64BitPath:
245	stw r13,-4(r1) ; save a few nonvolatile regs in red zone so we can use them
246	stw r14,-8(r1)
247	stw r15,-12(r1)
248	stw r16,-16(r1)
249	li r0,128 ; to touch next line
250	li r13,0 ; r13-r15 are the accumulators, so initialize them
251	dcbt r3,r0 ; touch in next cache line, and keep loads away from the above stores
252	lwz r4,0(r3) ; start pipeline by loading first 32 bytes into r4, r6-r12
253	lwz r6,4(r3)
254	lwz r7,8(r3)
255	mr r14,r2 ; just copy incoming partial word into one of the accumulators
256	li r15,0
257	lwz r8,12(r3)
258	lwz r9,16(r3)
259	li r16,0
260	li r0,256 ; get touch offset
261	lwz r10,20(r3)
262	lwz r11,24(r3)
263	lwz r12,28(r3) ; load last word of previous chunk
264	addi r3,r3,32 ; skip past the chunk
265	bdnz++ LInnerLoop64 ; enter loop if another chunk to go
266
267	b LAddLastChunk ; only one chunk
268
269	; Inner loop for 64-bit processors. This loop is scheduled for the 970.
270	; It is pipelined (loads are one iteration ahead of adds), and unrolled.
271	; It should take 9-10 cycles per iteration, which consumes 64 bytes of input.
272
273	.align 5
274	LInnerLoop64: ; 64 bytes/iteration
275	add r13,r13,r4 ; cycle 1
276	add r14,r14,r6
277	dcbt r3,r0 ; touch in 2 lines ahead
278	lwz r4,0(r3)
279
280	add r15,r15,r7 ; cycle 2, etc
281	lwz r6,4(r3)
282	lwz r7,8(r3)
283	add r16,r16,r8
284
285	lwz r8,12(r3)
286	add r13,r13,r9
287	add r14,r14,r10
288	lwz r9,16(r3)
289
290	add r15,r15,r11
291	lwz r10,20(r3)
292	lwz r11,24(r3)
293	add r16,r16,r12
294	bdz-- LEarlyExit ; early exit if no more chunks
295
296	lwz r12,28(r3)
297	add r13,r13,r4
298	add r14,r14,r6
299	lwz r4,32(r3)
300
301	add r15,r15,r7
302	lwz r6,36(r3)
303	lwz r7,40(r3)
304	add r16,r16,r8
305
306	lwz r8,44(r3)
307	add r13,r13,r9
308	add r14,r14,r10
309	lwz r9,48(r3)
310
311	add r15,r15,r11
312	lwz r10,52(r3)
313	lwz r11,56(r3)
314	add r16,r16,r12
315
316	nop ; position last load in 2nd dispatch slot
317	lwz r12,60(r3)
318	addi r3,r3,64
319	bdnz++ LInnerLoop64
320
321	b LAddLastChunk
322
323	; Add in the last 32-byte chunk, and any leftover bytes.
324	; r3 = word aligned address of next byte of data
325	; r5 = accumulated sum parameter
326	; r13-r16 = the four accumulators
327	; cr1_gt = "starting on odd address" flag
328	; cr6,cr7 = residual length
329
330	LEarlyExit: ; here from middle of inner loop
331	lwz r12,28(r3) ; load last word of last chunk
332	addi r3,r3,32
333	LAddLastChunk: ; last 32-byte chunk of input is in r4,r6-r12
334	add r13,r13,r4 ; add in last chunk
335	add r14,r14,r6 ; these are 64-bit adds
336	add r15,r15,r7
337	add r16,r16,r8
338	add r13,r13,r9
339	add r14,r14,r10
340	add r15,r15,r11
341	add r16,r16,r12
342
343	; Handle leftover bytes, if any.
344
345	bf 27,Lleft1 ; test 0x10 bit of residual length
346	lwz r4,0(r3)
347	lwz r6,4(r3)
348	lwz r7,8(r3)
349	lwz r8,12(r3)
350	addi r3,r3,16
351	add r13,r13,r4
352	add r14,r14,r6
353	add r15,r15,r7
354	add r16,r16,r8
355	Lleft1:
356	bf 28,Lleft2
357	lwz r4,0(r3)
358	lwz r6,4(r3)
359	addi r3,r3,8
360	add r13,r13,r4
361	add r14,r14,r6
362	Lleft2:
363	bf 29,Lleft3
364	lwz r4,0(r3)
365	addi r3,r3,4
366	add r14,r14,r4
367	Lleft3:
368	bf 30,Lleft4
369	lhz r4,0(r3)
370	addi r3,r3,2
371	add r15,r15,r4
372	Lleft4:
373	bf 31,Lleft5
374	lbz r4,0(r3)
375	slwi r4,r4,8 ; shift last byte into proper lane
376	add r16,r16,r4
377
378	; All data bytes have been checksummed. Now we must add together the four
379	; accumulators and restore the regs from the red zone.
380	; r3 = word aligned address of next byte of data
381	; r5 = accumulated sum parameter
382	; r13-r16 = the four accumulators
383	; carry = not used so far
384	; cr1_gt = "starting on odd address" flag
385
386	Lleft5:
387	add r8,r13,r14 ; add the four accumulators together
388	add r9,r15,r16
389	lwz r13,-4(r1) ; start to restore nonvolatiles from red zone
390	lwz r14,-8(r1)
391	add r8,r8,r9 ; now r8 is 64-bit sum of the four accumulators
392	lwz r15,-12(r1)
393	lwz r16,-16(r1)
394	srdi r7,r8,32 ; get upper half of 64-bit sum
395	addc r2,r7,r8 ; finally, do a 32-bit add of the two halves of r8 (setting carry)
396	b Lwrapup ; merge r2, r5, and carry into a 16-bit checksum