osfmk/arm64/lz4_encode_arm64.s

   1 /*
   2  * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
   3  *
   4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
   5  *
   6  * This file contains Original Code and/or Modifications of Original Code
   7  * as defined in and that are subject to the Apple Public Source License
   8  * Version 2.0 (the 'License'). You may not use this file except in
   9  * compliance with the License. The rights granted to you under the License
  10  * may not be used to create, or enable the creation or redistribution of,
  11  * unlawful or unlicensed copies of an Apple operating system, or to
  12  * circumvent, violate, or enable the circumvention or violation of, any
  13  * terms of an Apple operating system software license agreement.
  14  *
  15  * Please obtain a copy of the License at
  16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
  17  *
  18  * The Original Code and all software distributed under the License are
  19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
  20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
  21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
  22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
  23  * Please see the License for the specific language governing rights and
  24  * limitations under the License.
  25  *
  26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
  27  */
  28
  29 #include <vm/lz4_assembly_select.h>
  30 #include <vm/lz4_constants.h>
  31 #include <arm64/asm.h>
  32
  33 #if LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64
  34
  35 /* void lz4_encode_2gb(uint8_t ** dst_ptr,
  36                        size_t dst_size,
  37                        const uint8_t ** src_ptr,
  38                        const uint8_t * src_begin,
  39                        size_t src_size,
  40                        lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
  41                        int skip_final_literals)                               */
  42
  43 .globl _lz4_encode_2gb
  44
  45 #define dst_ptr             x0
  46 #define dst_size            x1
  47 #define src_ptr             x2
  48 #define src_begin           x3
  49 #define src_size            x4
  50 #define hash_table          x5
  51 #define skip_final_literals x6
  52
  53 .text
  54 .p2align 4
  55 _lz4_encode_2gb:
  56
  57     // esteblish frame
  58     ARM64_STACK_PROLOG
  59     stp     fp, lr,    [sp, #-16]!
  60     mov     fp, sp
  61
  62     stp x19, x20, [sp, #-16]!
  63     stp x21, x22, [sp, #-16]!
  64     stp x23, x24, [sp, #-16]!
  65     stp x25, x26, [sp, #-16]!
  66     stp x27, x28, [sp, #-16]!
  67
  68     // constant registers
  69     adr x7, L_constant
  70     ldr w28, [x7, #4]                        // x28 = 0x80808081 (magic number to cmopute 1/255)
  71     ldr w7, [x7]                             //  x7 = LZ4_COMPRESS_HASH_MULTIPLY
  72     mov x27, #-1                             // x27 = 0xffffffffffffffff
  73     dup.4s v1, w27                           //  q1 = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}
  74
  75
  76     //  x9 - is current dst
  77     // x10 - dst_end - safety_margin
  78     ldr x9, [x0]                             // dst
  79     add x10, x9, x1                          // dst_end
  80     sub x10, x10, #LZ4_GOFAST_SAFETY_MARGIN  // dst_end - safety_margin
  81     cmp x10, x9                              // if dst_size < safety_margin abort
  82     b.lt L_done
  83
  84     // x11 - is current src
  85     // x12 - is src_end - safety margin
  86     ldr x11, [x2]                            // src
  87     add x12, x11, x4                         // src_end
  88     sub x12, x12, #LZ4_GOFAST_SAFETY_MARGIN  // src_end - safety_margin
  89     cmp x12, x11                             // if src_size < safety_margin skip to trailing_literals
  90     b.lt L_trailing_literals
  91
  92
  93     // this block search for the next available match
  94     // set match_begin to current src (which is also where last match ended)
  95 L_search_next_available_match:
  96     mov x13, x11                            // match_begin   = src
  97     sub x14, x13, x3                        // match_postion = match_begin - src_begin
  98
  99     // compute hash value for the next 5 "quads"
 100     // hash distance need to be 0 < D < 0x10000
 101
 102 L_hash_match:
 103     ldr x15, [x13]                          // match_first_4_bytes
 104     umull x20, w7, w15                      // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
 105     lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT  // use LZ4_COMPRESS_HASH_BITS MSbits as index
 106     add x20, x5, x20, lsl #3                // hash_table_entry ptr (hash + 8*index)
 107
 108     ldp w19, w22, [x20]                     //  read entry values (w19 - pos, w22 - 4 bytes at pos)
 109     stp w14, w15, [x20]                     // write entry values (w14 - current pos, w15 - current 4 bytes)
 110
 111     add x26, x14, #1                        // next_match pos
 112     lsr x25, x15, #8                        // next_match_first_4_bytes
 113     umull x21, w7, w25                      // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
 114     lsr w21, w21, #LZ4_COMPRESS_HASH_SHIFT  // use LZ4_COMPRESS_HASH_BITS MSbits as index
 115     add x21, x5, x21, lsl #3                // hash_table_entry ptr (hash + 8*index)
 116
 117     ldp w23, w24, [x21]                     //  read entry values (w23 - pos, w24 - 4 bytes at pos)
 118     stp w26, w25, [x21]                     // write entry values (w26 - next pos, w25 - next 4 bytes)
 119
 120     cmp w15, w22
 121     b.ne L_try_next_match_0                 // compare the 4 bytes to see if there is a match
 122     sub w19, w14, w19                       // x19 - match_dist (current_pos - match_pos)
 123     cmp w19, #0x10000
 124     ccmp w19, #0, #0xf, lo
 125     b.eq L_try_next_match_0                 // verify the 0 < dist < 0x10000
 126     b L_found_valid_match
 127
 128 L_try_next_match_0:
 129     add x13, x13, #1
 130     add x14, x14, #1
 131
 132     add x26, x14, #1                        // next_match pos
 133     lsr x15, x15, #16                       // next_match_first_4_bytes
 134     umull x20, w7, w15                      // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
 135     lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT  // use LZ4_COMPRESS_HASH_BITS MSbits as index
 136     add x20, x5, x20, lsl #3                // hash_table_entry ptr (hash + 8*index)
 137
 138     ldp w21, w22, [x20]                     //  read entry values (w19 - pos, w22 - 4 bytes at pos)
 139     stp w26, w15, [x20]                     // write entry values (w14 - current pos, w15 - current 4 bytes)
 140
 141     cmp w25, w24
 142     b.ne L_try_next_match_1                 // compare the 4 bytes to see if there is a match
 143     sub w19, w14, w23                       // x19 - match_dist (current_pos - match_pos)
 144     cmp w19, #0x10000
 145     ccmp w19, #0, #0xf, lo
 146     b.eq L_try_next_match_1                 // verify the 0 < dist < 0x10000
 147     b L_found_valid_match
 148
 149 L_try_next_match_1:
 150     add x13, x13, #1
 151     add x14, x14, #1
 152
 153     add x26, x14, #1                        // next_match pos
 154     lsr x25, x15, #8                        // next_match_first_4_bytes
 155     umull x20, w7, w25                      // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
 156     lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT  // use LZ4_COMPRESS_HASH_BITS MSbits as index
 157     add x20, x5, x20, lsl #3                // hash_table_entry ptr (hash + 8*index)
 158
 159     ldp w23, w24, [x20]                     //  read entry values (w23 - pos, w24 - 4 bytes at pos)
 160     stp w26, w25, [x20]                     // write entry values (w26 - next pos, w25 - next 4 bytes)
 161
 162     cmp w15, w22
 163     b.ne L_try_next_match_2                 // compare the 4 bytes to see if there is a match
 164     sub w19, w14, w21                       // x19 - match_dist (current_pos - match_pos)
 165     cmp w19, #0x10000
 166     ccmp w19, #0, #0xf, lo
 167     b.eq L_try_next_match_2                 // verify the 0 < dist < 0x10000
 168     b L_found_valid_match
 169
 170 L_try_next_match_2:
 171     add x13, x13, #1
 172     add x14, x14, #1
 173
 174     add x26, x14, #1                        // next_match pos
 175     lsr x15, x15, #16                       // next_match_first_4_bytes
 176     umull x20, w7, w15                      // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
 177     lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT  // use LZ4_COMPRESS_HASH_BITS MSbits as index
 178     add x20, x5, x20, lsl #3                // hash_table_entry ptr (hash + 8*index)
 179
 180     ldp w21, w22, [x20]                     //  read entry values (w19 - pos, w22 - 4 bytes at pos)
 181     stp w26, w15, [x20]                     // write entry values (w14 - current pos, w15 - current 4 bytes)
 182
 183     cmp w25, w24
 184     b.ne L_try_next_match_3                 // compare the 4 bytes to see if there is a match
 185     sub w19, w14, w23                       // x19 - match_dist (current_pos - match_pos)
 186     cmp w19, #0x10000
 187     ccmp w19, #0, #0xf, lo
 188     b.eq L_try_next_match_3                 // verify the 0 < dist < 0x10000
 189     b L_found_valid_match
 190
 191 L_try_next_match_3:
 192     add x13, x13, #1
 193     add x14, x14, #1
 194
 195     cmp w15, w22
 196     b.ne L_try_next_matchs                 // compare the 4 bytes to see if there is a match
 197     sub w19, w14, w21                       // x19 - match_dist (current_pos - match_pos)
 198     cmp w19, #0x10000
 199     ccmp w19, #0, #0xf, lo
 200     b.eq L_try_next_matchs                 // verify the 0 < dist < 0x10000
 201     b L_found_valid_match
 202
 203     // this block exapnd the valid match as much as possible
 204     // first it try to expand the match forward
 205     // next  it try to expand the match backword
 206 L_found_valid_match:
 207     add x20, x13, #4                        // match_end = match_begin+4 (already confirmd the first 4 bytes)
 208     sub x21, x20, x19                       //   ref_end = match_end - dist
 209 L_found_valid_match_expand_forward_loop:
 210     ldr x22, [x20], #8                      // load match_current_8_bytes (safe to load becasue of safety margin)
 211     ldr x23, [x21], #8                      // load   ref_current_8_bytes
 212     cmp x22, x23
 213     b.ne L_found_valid_match_expand_forward_partial
 214     cmp x20, x12                            // check if match_end reached src_end
 215     b.lo L_found_valid_match_expand_forward_loop
 216     b L_found_valid_match_expand_backward
 217 L_found_valid_match_expand_forward_partial:
 218     sub  x20, x20, #8                       // revert match_end by 8 and compute actual match of current 8 bytes
 219     eor  x22, x22, x23                      // compare the bits using xor
 220     rbit x22, x22                           // revert the bits to use clz (the none equivalent bytes would have at least 1 set bit)
 221     clz  x22, x22                           // after the revrse for every equal prefix byte clz would count 8
 222     add  x20, x20, x22, lsr #3              // add the actual number of matching bytes is (clz result)>>3
 223 L_found_valid_match_expand_backward:
 224     sub  x15, x13, x19                      // ref_begin = match_begin - dist
 225 L_found_valid_match_expand_backward_loop:
 226     cmp  x13, x11                           // check if match_begin reached src (previous match end)
 227     ccmp x15, x3, #0xd, gt                  // check if   ref_begin reached src_begin
 228     b.le L_found_valid_match_emit_match
 229     ldrb w22, [x13, #-1]!                   // load match_current_8_bytes (safe to load becasue of safety margin)
 230     ldrb w23, [x15, #-1]!                   // load   ref_current_8_bytes
 231     cmp w22, w23
 232     b.eq L_found_valid_match_expand_backward_loop
 233     add x13, x13, #1                        // revert x13, last compare didn't match
 234
 235     // this block write the match into dst
 236     // it write the ML token [extra L tokens] [literals] <2byte dist> [extar M tokens]
 237     // it update src & dst positions and progress to L_search_next_available_match
 238 L_found_valid_match_emit_match:
 239     sub  x21, x20, x13                       // match_length - match_end - match_begin
 240     sub  x21, x21, #4                        // match_length - 4 (first 4 bytes are guaranteed)
 241     sub  x22, x13, x11                       // literals_length = match_begin - src    // compute
 242     sub  x26, x10, x9                        // dst_remaining_space = dst_end - dst
 243     sub  x26, x26, x22                       // dst_remaining_space -= literals_length
 244     subs x26, x26, #3                        // dst_remaining_space -= 2_dist_bytes + L/M_token
 245     b.lo L_done                              // exit if dst isn't sufficent
 246
 247     and x23, x21, #0xf                       // store M 4 LSbits
 248     add x23, x23, x22, lsl #4                // add L 4 LSbits
 249     add x15, x9, #1                          // tmp_dst = dst + 1
 250     cmp x22, #15                             // if L >= 15 need to write more L tokens
 251     b.lo L_found_valid_match_copy_literals
 252     orr x23, x23, #0xf0                      // update L/M token to be 0xfM
 253     sub x24, x22, #15                        // reduce 15 from number_of_literals
 254     sub x26, x26, #1                         // check if there is space for the extra L token
 255     b.lo L_done
 256     cmp x24, #255                            // check if need to compute number of 255 tokens
 257     b.lo L_found_valid_match_skip_L_255_tokens
 258     umull x25, w24, w28                      // x25 - (literals_to_token * 1_DIV_255_magic_number)
 259     lsr   x25, x25, #39                      // x25 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
 260     subs  x26, x26, x25                      // check if there is sufficent space for the 255_tokens
 261     b.lo L_done
 262     mov x13, #255
 263     umsubl x24, w25, w13, x24                // x24 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
 264 L_found_valid_match_L_255_tokens_loop:
 265     str q1, [x15], #16                       // store 16 255 tokens into dst_tmp. safe to store because dst has safety_margin
 266     subs x25, x25, #16                       // check if there are any 255 token left after current 16
 267     b.hi L_found_valid_match_L_255_tokens_loop
 268     add x15, x15, x25                        // revert tmp_dst if written too many 255 tokens.
 269 L_found_valid_match_skip_L_255_tokens:
 270     strb w24, [x15], #1                      // write last L token
 271 L_found_valid_match_copy_literals:
 272     ldr q0, [x11], #16                       // load  current 16 literals. (safe becasue src_end has safety margin)
 273     str q0, [x15], #16                       // store current 16 literals. (safe becasue dst_end has safety margin)
 274     subs x22, x22, #16
 275     b.gt L_found_valid_match_copy_literals
 276     add x15, x15, x22                        // revert tmp_dst if written too many literals
 277     strh w19, [x15], #2                      // store dist bytes
 278     cmp x21, #15                             // if M >= 15 need to write more M tokens
 279     b.lo L_found_valid_match_finish_writing_match
 280     orr x23, x23, #0xf                       // update L/M token to be 0xLf
 281     sub x24, x21, #15                        // reduce 15 from match_length
 282     sub x26, x26, #1                         // check if there is space for the extra M token
 283     b.lo L_done
 284     cmp x24, #255                            // check if need to compute number of 255 tokens
 285     b.lo L_found_valid_match_skip_M_255_tokens
 286     umull x25, w24, w28                      // x25 - (match_length * 1_DIV_255_magic_number)
 287     lsr   x25, x25, #39                      // x25 - number_of_255_tokens = (match_length * 1_DIV_255_magic_number)>>39
 288     subs  x26, x26, x25                      // check if there is sufficent space for the 255_tokens
 289     b.lo L_done
 290     mov x13, #255
 291     umsubl x24, w25, w13, x24                // x24 - value_of_remainder_token = literals_to_token - (match_length*255)
 292 L_found_valid_match_M_255_tokens_loop:
 293     str q1, [x15], #16                       // store 16 255 tokens into dst_tmp. safe to store because dst has safety_margin
 294     subs x25, x25, #16                       // check if there are any 255 token left after current 16
 295     b.hi L_found_valid_match_M_255_tokens_loop
 296     add x15, x15, x25                        // revert tmp_dst if written too many 255 tokens.
 297 L_found_valid_match_skip_M_255_tokens:
 298     strb w24, [x15], #1                      // write last M token
 299 L_found_valid_match_finish_writing_match:
 300     strb w23, [x9]                           // store first token of match in dst
 301     mov  x9, x15                             // update dst to last postion written
 302     mov x11, x20                             // update src to match_end (last byte that was encoded)
 303     cmp x11, x12                             // check if src reached src_end
 304     ccmp x9, x10, #9, lt                     // check if dst reached dst_end
 305     b.ge L_trailing_literals
 306     b L_search_next_available_match
 307     // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 308     // attempted to hash three quad values from the end of each emited match
 309     // this eneded up being slower and less compression (???)
 310     // this block set match_begin and pos for next hash search and
 311     // compute the hash values for the last 3 bytes of currently emited match
 312     // only need to comute these hash becasue other "quads" were hashed when the original
 313     // data was read.
 314
 315 L_try_next_matchs:
 316     add x13, x13, #1                         // move to next match
 317     add x14, x14, #1                         // update next match pos
 318     cmp x13, x12                             // check match_begin didn't reach src_end
 319     b.lo L_hash_match
 320
 321 L_trailing_literals:
 322     // unless skip_final_literals is set
 323     // write the trailing bytes as literals
 324     // traliing bytes include the whole src (with the safty margin)
 325     // need to verify whole dst (withthe safty margin) has sufficent space
 326
 327     tst x6, x6
 328     b.ne L_done                              // if skip_final_literals is set skip writing them
 329
 330     add  x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin
 331     subs x13, x12, x11                       // remaining_src
 332     b.eq L_done                              // finish if there are 0 trailing literals
 333
 334     add x10, x10, #LZ4_GOFAST_SAFETY_MARGIN  // add safety_margin
 335     sub x14, x10, x9                         // remaining dst (dst_end - dst)
 336     sub x14, x14, #1                         // 1 byte is needed at least to write literals token
 337     subs x14, x14, x13                       // finish if dst can't contain all remaining literals + 1 literals token
 338     b.le L_done                              // (need to verify that it has room for literals tokens
 339
 340     cmp  x13, #15
 341     b.lt L_trailing_literals_store_less_than_15_literals
 342     subs x14, x14, #1                        // 1-extra byte is needed for literals tokens
 343     b.mi L_done
 344     mov w15, #0xf0
 345     strb w15, [x9], #1                       // write literals first token (Important !!! if 255 tokens exist but dst isn't sufficent need to revert dst by 1)
 346     sub  x15, x13, #15
 347     cmp  x15, #255
 348     b.lo L_trailing_literals_no_255_tokens
 349     umull x19, w15, w28                      // x19 - (literals_to_token * 1_DIV_255_magic_number)
 350     lsr   x19, x19, #39                      // x19 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
 351     subs  x14, x14, x19
 352     b.mi L_revert_x9_and_done
 353     mov x26, #255
 354     umsubl x15, w26, w19, x15                // x15 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
 355 L_tariling_literals_write_16_255_tokens:
 356     str q1, [x9], #16                        // store 16 255 tokens each iteration (this is safe becasue there is space for 15 or more literals + remainder token)
 357     subs x19, x19, #16
 358     b.gt L_tariling_literals_write_16_255_tokens
 359     add x9, x9, x19                          // fixes dst to actual number of tokens (x19 might not be a mulitple of 16)
 360 L_trailing_literals_no_255_tokens:
 361     strb w15, [x9], #1                       // store remainder_token
 362     lsr  x14, x13, #4                        // check if there are more than 16 literals left to be written
 363     tst  x14, x14
 364     b.eq L_trailing_literals_copy_less_than_16_literals
 365 L_trailing_literals_copy_16_literals:
 366     ldr q0, [x11], #16                       // load current_16_literals
 367     str q0, [ x9], #16                       // *dst16++ = current_16_literals
 368     subs x14, x14, #1
 369     b.gt L_trailing_literals_copy_16_literals
 370     cmp x11, x12
 371     b.lo L_trailing_literals_copy_less_than_16_literals
 372     b L_done
 373
 374 L_trailing_literals_store_less_than_15_literals:
 375     lsl x14, x13, #4                         // literals_only_token is 0xL0 (where L is 4 bits)
 376     strb w14, [x9], #1                       // *dst++ = literals_only_token
 377 L_trailing_literals_copy_less_than_16_literals:
 378     ldrb w13, [x11], #1                      // load current_literal
 379     strb w13, [ x9], #1                      // *dst++ = current_literal
 380     cmp x11, x12
 381     b.lo L_trailing_literals_copy_less_than_16_literals
 382
 383     // this block upadte dst & src pointers and remove frame
 384 L_done:
 385     str  x9, [x0]
 386     str x11, [x2]
 387
 388     ldp x27, x28, [sp], #16
 389     ldp x25, x26, [sp], #16
 390     ldp x23, x24, [sp], #16
 391     ldp x21, x22, [sp], #16
 392     ldp x19, x20, [sp], #16
 393
 394     // clear frame
 395     ldp     fp, lr,    [sp], #16
 396     ARM64_STACK_EPILOG
 397
 398 L_revert_x9_and_done:
 399     sub x9, x9, #1
 400     b L_done
 401
 402 .p2align 2
 403 L_constant:
 404 .long LZ4_COMPRESS_HASH_MULTIPLY
 405 .long 0x80808081
 406
 407 #endif
 408