/*
 * Copyright (c) 2009-2017 Apple Inc. All rights reserved.
 *
 * This document is the property of Apple Inc.
 * It is considered confidential and proprietary.
 *
 * This document may not be reproduced or transmitted in any form,
 * in whole or in part, without the express written permission of
 * Apple Inc.
 */

/* $NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $ */

/*
 * Copyright 2003 Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Steve C. Woodford for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef KERNEL
#include "../../../osfmk/arm/arch.h"
#include "../../../osfmk/arm/proc_reg.h"

#if __ARM_VFP__ < 3
#error "Unsupported: __ARM_VFP__ < 3"
#endif /* __ARM_VFP__ < 3 */
#define CKSUM_ERR _kprintf
#else /* !KERNEL */
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define CKSUM_ERR _fprintf_stderr
#define __ARM_VFP__ 3
#endif /* !KERNEL */
/*
 * The following defaults the implementation to little-endian byte order.
 */
#define LITTLE_ENDIAN	1
#define BYTE_ORDER	LITTLE_ENDIAN

	.syntax unified

/*
 * XXX: adi@apple.com:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk. Note also that this
 * routine expects an "mbuf-like" argument, and it does not expect the mbuf
 * to be authentic; it only cares about 3 fields.
 */
#define M_NEXT	0
#define M_DATA	8
#define M_LEN	12
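
/*
 * For illustration only: a minimal C sketch (hypothetical type name, not a
 * real kernel struct) of the "mbuf-like" layout these offsets assume on a
 * 32-bit ARM ABI with 4-byte pointers:
 *
 *	struct mbuf_like {
 *		struct mbuf_like *m_next;	// offset 0  (M_NEXT)
 *		void		*m_nextpkt;	// offset 4  (not used here)
 *		char		*m_data;	// offset 8  (M_DATA)
 *		int		m_len;		// offset 12 (M_LEN)
 *	};
 */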

/*
 * APPLE MODIFICATION
 *
 * The use of R7 (normally the frame pointer in Apple's ARM ABI) in this
 * code as a data register prevents the use of debugging or instrumentation
 * tools, which is an acceptable tradeoff considering the potential gain in
 * performance.
 */

/*
 * Hand-optimised implementations for ARM/Xscale
 */

	.macro EnableVFP
#ifdef KERNEL
	push	{r0, r1, r2, r12}
	bl	_enable_kernel_vfp_context
	pop	{r0, r1, r2, r12}
#endif /* KERNEL */
	.endm
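
/*
 * Assumption worth noting: r0-r2 and r12 are saved because they are the
 * AAPCS caller-saved registers still live at the macro's call sites, and
 * _enable_kernel_vfp_context is presumably free to clobber them.
 */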


/*
 * uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off,
 *     uint32_t initial_sum);
 *
 * Entry:
 *	r0	m
 *	r1	len
 *	r2	off
 *	r3	initial_sum
 *
 * Function wide register usage
 *	r8	accumulated sum
 *	r9	remaining length to parse
 *	ip	pointer to next mbuf
 *
 * This function returns the partial 16-bit checksum accumulated in
 * a 32-bit variable (without 1's complement); the caller is responsible
 * for folding the 32-bit sum into 16 bits and performing the 1's
 * complement if applicable.
 */
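
/*
 * A sketch of how a caller might consume the result (hypothetical caller
 * code, not part of this file):
 *
 *	uint32_t sum = os_cpu_in_cksum_mbuf(m, len, off, 0);
 *	sum = (sum & 0xffff) + (sum >> 16);	// fold 32-bit sum into 16 bits
 *	sum += sum >> 16;			// fold any resulting carry
 *	uint16_t cksum = ~sum & 0xffff;		// 1's complement, if applicable
 */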
	.globl	_os_cpu_in_cksum_mbuf
	.text
	.align	4
_os_cpu_in_cksum_mbuf:
	stmfd	sp!, {r4-r11, lr}

	mov	r8, r3			/* Accumulate sum in r8 */
	mov	r9, r1			/* save len in r9 */
	mov	ip, r0			/* set ip to the current mbuf */

	cmp	r9, #0			/* length is 0? */
	bne	.Lin_cksum_skip_loop	/* if not, proceed further */
	mov	r0, r8			/* otherwise, return initial sum */

	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_skip_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_skip_entry:
	subs	r2, r2, r1		/* offset = offset - mbuf length */
	blt	.Lin_cksum_skip_done	/* if offset has gone negative, start with this mbuf */
	cmp	ip, #0x00
	bne	.Lin_cksum_skip_loop
	b	.Lin_cksum_whoops

.Lin_cksum_skip_done:
	add	r0, r2, r0		/* data += offset (offset is < 0) */
	add	r0, r0, r1		/* data += length of mbuf */
					/* data == start of data to cksum */
	rsb	r1, r2, #0x00		/* length = remainder of mbuf to read */
	mov	r10, #0x00
	b	.Lin_cksum_entry
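
/*
 * C analogue of the skip logic above (a sketch, using the entry-register
 * names as variables):
 *
 *	for (;;) {
 *		mlen = m->m_len; data = m->m_data; next = m->m_next;
 *		if ((off -= mlen) < 0)
 *			break;			// cksum starts in this mbuf
 *		if (next == NULL)
 *			goto whoops;		// ran out of data
 *		m = next;
 *	}
 *	data += mlen + off;			// off is negative here
 *	mlen = -off;				// bytes left in this mbuf
 */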

.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_entry:
	cmp	r9, r1			/* clamp chunk to remaining length */
	movlt	r1, r9
	sub	r9, r9, r1
	eor	r11, r10, r0		/* alignment parity of this chunk */
	add	r10, r10, r1		/* total bytes summed so far */
	adds	r2, r1, #0x00		/* set flags: anything to sum? */

	beq	.Lin_cksum_next
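
/*
 * The low bit of r11 (r10 ^ r0, computed above) records whether this chunk
 * begins at odd parity, either in the byte stream or in memory; on that
 * basis .Lin_cksum_next decides whether the chunk's partial sum must be
 * byte-rotated before it is folded into the running total in r8.
 */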

/*
 * APPLE MODIFICATION
 *
 * Replace the 'blne _ASM_LABEL(L_cksumdata)' by bringing the called function
 * inline. This results in slightly faster code, and also permits the whole
 * function to be included in kernel profiling data.
 */

/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length
 *	lr	Return address
 *
 * Returns:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7
 */
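
/*
 * In C terms, the scalar code below is roughly (a sketch, equivalent up to
 * when the carries are folded):
 *
 *	uint64_t acc = 0;
 *	for (i = 0; i < nwords; i++)
 *		acc += word[i];			// the adds/adcs chains
 *	acc = (uint32_t)acc + (acc >> 32);	// the trailing 'adc r2, r2, #0'
 *
 * i.e. a 1's-complement accumulation where every carry out of bit 31 is
 * folded back into the sum.
 */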
	mov	r2, #0

	/* We first have to word-align the buffer. */
	ands	r7, r0, #0x03
	beq	.Lcksumdata_wordaligned
	rsb	r7, r7, #0x04
	cmp	r1, r7			/* Enough bytes left to make it? */
	blt	.Lcksumdata_endgame
	cmp	r7, #0x02
	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
	movlt	r5, #0x00
	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
	movle	r6, #0x00
	/* Combine the three bytes depending on endianness and alignment */
#if BYTE_ORDER != LITTLE_ENDIAN
	orreq	r2, r5, r4, lsl #8
	orreq	r2, r2, r6, lsl #24
	orrne	r2, r4, r5, lsl #8
	orrne	r2, r2, r6, lsl #16
#else
	orreq	r2, r4, r5, lsl #8
	orreq	r2, r2, r6, lsl #16
	orrne	r2, r5, r4, lsl #8
	orrne	r2, r2, r6, lsl #24
#endif
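
/*
 * The orreq/orrne forms above reuse the flags from 'cmp r7, #0x02': eq
 * means exactly two leading bytes were fetched, ne means one or three, so
 * each byte is merged into r2 at the lane the later byte-rotate fix-up in
 * .Lin_cksum_next expects.
 */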
	subs	r1, r1, r7		/* Update length */
	beq	.Lin_cksum_next		/* All done? */

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:

#if __ARM_VFP__ >= 3

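/*
 * NEON path, taken for buffers of at least 512 bytes: q4-q7 accumulate
 * 32-bit words into 64-bit lanes via vpadal, so no per-addition carry
 * handling is needed. A rough C analogue of one vpadal step (a sketch):
 *
 *	uint64_t lane[2] = { 0, 0 };
 *	for (i = 0; i < n; i += 4) {		// one 128-bit vector
 *		lane[0] += (uint64_t)w[i]     + w[i + 1];
 *		lane[1] += (uint64_t)w[i + 2] + w[i + 3];
 *	}
 */
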
	cmp	r1, #512	// take the NEON path only if r1 is at least 512
	blt	9f

	EnableVFP

	and	r3, r1, #~0x3f

	vpush	{q0-q7}

	// move r2 to s16 (q4) for neon computation
	veor	q4, q4, q4
	vld1.32	{q0-q1}, [r0]!
	vmov	s16, r2
	vld1.32	{q2-q3}, [r0]!

	// pre-decrement size by 128: two 64-byte batches are already loaded
	subs	r3, r3, #0x80

	vpadal.u32	q4, q0
	vld1.32	{q0}, [r0]!
	vpaddl.u32	q5, q1
	vld1.32	{q1}, [r0]!
	vpaddl.u32	q6, q2
	vld1.32	{q2}, [r0]!
	vpaddl.u32	q7, q3
	vld1.32	{q3}, [r0]!
0:
	subs	r3, r3, #0x40	// decrement size by 64

	vpadal.u32	q4, q0
	vld1.32	{q0}, [r0]!
	vpadal.u32	q5, q1
	vld1.32	{q1}, [r0]!
	vpadal.u32	q6, q2
	vld1.32	{q2}, [r0]!
	vpadal.u32	q7, q3
	vld1.32	{q3}, [r0]!

	bgt	0b

	// fold in the last four vectors still in flight
	vpadal.u32	q4, q0
	vpadal.u32	q5, q1
	vpadal.u32	q6, q2
	vpadal.u32	q7, q3

	// merge the four accumulators into q4; since 2^32 == 1 (mod 0xffff),
	// 64-bit lanes may be summed as 32-bit halves without losing the cksum
	vpadal.u32	q4, q5
	vpadal.u32	q6, q7
	vpadal.u32	q4, q6
	vadd.i64	d8, d9

	// fold the 64-bit sum in d8 down to 32 bits, absorbing the carries
	vpaddl.u32	d8, d8
	vpaddl.u32	d8, d8
	vpaddl.u32	d8, d8

	vmov	r2, s16

	vpop	{q0-q7}

	ands	r1, r1, #0x3f	// residual bytes
	beq	.Lin_cksum_next

9:

#endif /* __ARM_VFP__ >= 3 */

	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

	/* Scalar loop: sum 64 bytes per iteration, folding carries via adcs */
.Lcksumdata_bigloop:
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r6}
	adcs	r2, r2, r7
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00		/* fold the final carry back in */
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
.Lcksumdata_bigloop_end:

	adds	r1, r1, #0x40
	beq	.Lin_cksum_next

	cmp	r1, #0x20

	blt	.Lcksumdata_less_than_32
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x20
	beq	.Lin_cksum_next

.Lcksumdata_less_than_32:
	/* There are fewer than 32 bytes left */
	and	r3, r1, #0x18
	rsb	r4, r3, #0x18
	sub	r1, r1, r3
	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
	addne	pc, pc, r4
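
/*
 * The 'addne pc, pc, r4' above is a computed jump into the unrolled copies
 * below: each 8-byte copy is three instructions (12 bytes), so r4 =
 * (0x18 - (r1 & 0x18)) * 3/2 is the distance to skip, and the nop pads the
 * first copy so the arithmetic lands on the right entry. The 'adds' also
 * clears the carry flag, so the first adcs starts with a clean carry.
 */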

/*
 * Note: We use ldm here, even on Xscale, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	nop
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adc	r2, r2, #0x00
	subs	r1, r1, #0x04
	blt	.Lcksumdata_lessthan4

	ldr	r4, [r0], #0x04
	sub	r1, r1, #0x04
	adds	r2, r2, r4
	adc	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04
	beq	.Lin_cksum_next

	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
	ldrb	r3, [r0]		/* Fetch first byte */
	cmp	r1, #0x02
	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
	movlt	r4, #0x00
	ldrbgt	r5, [r0, #0x02]
	movle	r5, #0x00
	/* Combine the three bytes depending on endianness and alignment */
	tst	r0, #0x01
#if BYTE_ORDER != LITTLE_ENDIAN
	orreq	r3, r4, r3, lsl #8
	orreq	r3, r3, r5, lsl #24
	orrne	r3, r3, r4, lsl #8
	orrne	r3, r3, r5, lsl #16
#else
	orreq	r3, r3, r4, lsl #8
	orreq	r3, r3, r5, lsl #16
	orrne	r3, r4, r3, lsl #8
	orrne	r3, r3, r5, lsl #24
#endif
	adds	r2, r2, r3
	adc	r2, r2, #0x00

.Lin_cksum_next:
	tst	r11, #0x01		/* did this chunk start at odd parity? */
	movne	r2, r2, ror #8		/* if so, rotate the sum to swap byte lanes */
	adds	r8, r8, r2		/* fold chunk sum into the running total */
	adc	r8, r8, #0x00
	cmp	ip, #00			/* more mbufs to process? */
	bne	.Lin_cksum_loop

	/* Fold the 32-bit accumulator into a 16-bit partial sum */
	mov	r1, #0xff
	orr	r1, r1, #0xff00		/* r1 = 0xffff */
	and	r0, r8, r1
	add	r0, r0, r8, lsr #16
	add	r0, r0, r0, lsr #16
	and	r0, r0, r1
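
/*
 * C equivalent of the fold above (a sketch):
 *
 *	r0 = (r8 & 0xffff) + (r8 >> 16);	// fold high half into low
 *	r0 += r0 >> 16;				// fold any resulting carry
 *	r0 &= 0xffff;				// 16-bit partial sum
 */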
	/*
	 * If we were to 1's complement it (XOR with 0xffff):
	 *
	 * eor	r0, r0, r1
	 */

	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_whoops:
	adr	r0, .Lin_cksum_whoops_str
	bl	#CKSUM_ERR
	mov	r0, #-1			/* return all-ones to signal the error */

	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
	.align	5