[apple/xnu.git] / osfmk / arm64 / lz4_encode_arm64.s

/*
 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 * 
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 * 
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 * 
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 * 
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <vm/lz4_assembly_select.h>
#include <vm/lz4_constants.h>

#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64

/* void lz4_encode_2gb(uint8_t ** dst_ptr,
                       size_t dst_size,
                       const uint8_t ** src_ptr,
                       const uint8_t * src_begin,
                       size_t src_size,
                       lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
                       int skip_final_literals)                               */

// Export the encoder entry point so it can be linked against.
.globl _lz4_encode_2gb

// Symbolic names for the registers that carry the seven arguments, in the
// order of the C prototype documented above (x0 = first argument ... x6 = last).
// The function body refers to these registers by their raw names, so the
// aliases serve as a reference for the argument-to-register mapping.
#define dst_ptr             x0
#define dst_size            x1
#define src_ptr             x2
#define src_begin           x3
#define src_size            x4
#define hash_table          x5
#define skip_final_literals x6

.text
.p2align 4

// ---------------------------------------------------------------------------
// _lz4_encode_2gb
//
// In:
//    x0 = dst_ptr             in/out pointer to the current output position
//    x1 = dst_size            bytes available at *dst_ptr
//    x2 = src_ptr             in/out pointer to the current input position
//    x3 = src_begin           base used to turn a source pointer into a position
//    x4 = src_size            bytes available at *src_ptr
//    x5 = hash_table          8-byte entries: {32-bit position, 32-bit data at position}
//    w6 = skip_final_literals non-zero: do not emit the trailing literals
//
// Out:
//    *dst_ptr and *src_ptr are written back on every exit path (L_done) and
//    only ever describe fully emitted sequences.
//
// Register roles:
//    w7  = LZ4_COMPRESS_HASH_MULTIPLY
//    w28 = 0x80808081, so that (n * w28) >> 39 == n / 255
//    q1  = sixteen 0xff bytes (a run of "255" length tokens)
//    x9  = dst                      x10 = dst_end - safety margin
//    x11 = src (end of last match)  x12 = src_end - safety margin
//    x13 = match_begin              x14 = match position (match_begin - src_begin);
//                                         reused as the literal read cursor while emitting
//    x15 = scratch / tmp_dst        x19 = match distance
//    x20 = match_end                x21 = match_length - 4
//    x22 = literals_length          x23 = L/M token
//    x24 = remainder length token   x25 = number of 255 tokens
//    x26 = remaining dst space
// ---------------------------------------------------------------------------
_lz4_encode_2gb:

    // establish frame
    stp     fp, lr,    [sp, #-16]!
    mov     fp, sp

    // save the callee-saved registers used below
    stp x19, x20, [sp, #-16]!
    stp x21, x22, [sp, #-16]!
    stp x23, x24, [sp, #-16]!
    stp x25, x26, [sp, #-16]!
    stp x27, x28, [sp, #-16]!

    // constant registers
    adr x7, L_constant
    ldr w28, [x7, #4]                        // w28 = 0x80808081 (magic number to compute n/255)
    ldr w7, [x7]                             //  w7 = LZ4_COMPRESS_HASH_MULTIPLY
    mov x27, #-1                             // x27 = 0xffffffffffffffff
    dup.4s v1, w27                           //  q1 = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}

    // Load both cursors before the first possible exit: L_done writes x9 and
    // x11 back to the caller, so neither may be left uninitialized.
    ldr x9, [x0]                             // dst
    ldr x11, [x2]                            // src

    //  x9 - is current dst
    // x10 - dst_end - safety_margin
    add x10, x9, x1                          // dst_end
    sub x10, x10, #LZ4_GOFAST_SAFETY_MARGIN  // dst_end - safety_margin
    cmp x10, x9                              // if dst_size < safety_margin abort
    b.lt L_done

    // x11 - is current src
    // x12 - is src_end - safety margin
    add x12, x11, x4                         // src_end
    sub x12, x12, #LZ4_GOFAST_SAFETY_MARGIN  // src_end - safety_margin
    cmp x12, x11                             // if src_size < safety_margin skip to trailing_literals
    b.lt L_trailing_literals


    // this block searches for the next available match
    // set match_begin to current src (which is also where the last match ended)
L_search_next_available_match:
    mov x13, x11                            // match_begin    = src
    sub x14, x13, x3                        // match_position = match_begin - src_begin

    // One 8-byte load provides the 4-byte "quads" starting at 5 consecutive
    // positions. Each quad is hashed, the table entry is read and replaced,
    // and a candidate is accepted when the stored 4 bytes are equal and the
    // distance D satisfies 0 < D < 0x10000.

L_hash_match:
    ldr x15, [x13]                          // 8 source bytes, low 32 bits are the quad at match_begin
    umull x20, w7, w15                      // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
    lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT  // use LZ4_COMPRESS_HASH_BITS MSbits as index
    add x20, x5, x20, lsl #3                // hash_table_entry ptr (hash + 8*index)

    ldp w19, w22, [x20]                     //  read entry values (w19 - pos, w22 - 4 bytes at pos)
    stp w14, w15, [x20]                     // write entry values (w14 - current pos, w15 - current 4 bytes)

    add x26, x14, #1                        // next_match pos
    lsr x25, x15, #8                        // next_match_first_4_bytes
    umull x21, w7, w25                      // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
    lsr w21, w21, #LZ4_COMPRESS_HASH_SHIFT  // use LZ4_COMPRESS_HASH_BITS MSbits as index
    add x21, x5, x21, lsl #3                // hash_table_entry ptr (hash + 8*index)

    ldp w23, w24, [x21]                     //  read entry values (w23 - pos, w24 - 4 bytes at pos)
    stp w26, w25, [x21]                     // write entry values (w26 - next pos, w25 - next 4 bytes)

    cmp w15, w22
    b.ne L_try_next_match_0                 // compare the 4 bytes to see if there is a match
    sub w19, w14, w19                       // w19 - match_dist (current_pos - match_pos)
    cmp w19, #0x10000
    ccmp w19, #0, #0xf, lo                  // dist >= 0x10000 forces Z=1, otherwise Z = (dist == 0)
    b.eq L_try_next_match_0                 // verify that 0 < dist < 0x10000
    b L_found_valid_match

L_try_next_match_0:
    add x13, x13, #1
    add x14, x14, #1

    add x26, x14, #1                        // next_match pos
    lsr x15, x15, #16                       // next_match_first_4_bytes
    umull x20, w7, w15                      // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
    lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT  // use LZ4_COMPRESS_HASH_BITS MSbits as index
    add x20, x5, x20, lsl #3                // hash_table_entry ptr (hash + 8*index)

    ldp w21, w22, [x20]                     //  read entry values (w21 - pos, w22 - 4 bytes at pos)
    stp w26, w15, [x20]                     // write entry values (w26 - next pos, w15 - next 4 bytes)

    cmp w25, w24
    b.ne L_try_next_match_1                 // compare the 4 bytes to see if there is a match
    sub w19, w14, w23                       // w19 - match_dist (current_pos - match_pos)
    cmp w19, #0x10000
    ccmp w19, #0, #0xf, lo
    b.eq L_try_next_match_1                 // verify that 0 < dist < 0x10000
    b L_found_valid_match

L_try_next_match_1:
    add x13, x13, #1
    add x14, x14, #1

    add x26, x14, #1                        // next_match pos
    lsr x25, x15, #8                        // next_match_first_4_bytes
    umull x20, w7, w25                      // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
    lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT  // use LZ4_COMPRESS_HASH_BITS MSbits as index
    add x20, x5, x20, lsl #3                // hash_table_entry ptr (hash + 8*index)

    ldp w23, w24, [x20]                     //  read entry values (w23 - pos, w24 - 4 bytes at pos)
    stp w26, w25, [x20]                     // write entry values (w26 - next pos, w25 - next 4 bytes)

    cmp w15, w22
    b.ne L_try_next_match_2                 // compare the 4 bytes to see if there is a match
    sub w19, w14, w21                       // w19 - match_dist (current_pos - match_pos)
    cmp w19, #0x10000
    ccmp w19, #0, #0xf, lo
    b.eq L_try_next_match_2                 // verify that 0 < dist < 0x10000
    b L_found_valid_match

L_try_next_match_2:
    add x13, x13, #1
    add x14, x14, #1

    add x26, x14, #1                        // next_match pos
    lsr x15, x15, #16                       // next_match_first_4_bytes
    umull x20, w7, w15                      // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
    lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT  // use LZ4_COMPRESS_HASH_BITS MSbits as index
    add x20, x5, x20, lsl #3                // hash_table_entry ptr (hash + 8*index)

    ldp w21, w22, [x20]                     //  read entry values (w21 - pos, w22 - 4 bytes at pos)
    stp w26, w15, [x20]                     // write entry values (w26 - next pos, w15 - next 4 bytes)

    cmp w25, w24
    b.ne L_try_next_match_3                 // compare the 4 bytes to see if there is a match
    sub w19, w14, w23                       // w19 - match_dist (current_pos - match_pos)
    cmp w19, #0x10000
    ccmp w19, #0, #0xf, lo
    b.eq L_try_next_match_3                 // verify that 0 < dist < 0x10000
    b L_found_valid_match

L_try_next_match_3:
    add x13, x13, #1
    add x14, x14, #1

    cmp w15, w22
    b.ne L_try_next_matchs                  // compare the 4 bytes to see if there is a match
    sub w19, w14, w21                       // w19 - match_dist (current_pos - match_pos)
    cmp w19, #0x10000
    ccmp w19, #0, #0xf, lo
    b.eq L_try_next_matchs                  // verify that 0 < dist < 0x10000
    b L_found_valid_match

    // this block expands the valid match as much as possible
    // first it tries to expand the match forward
    // next  it tries to expand the match backward
L_found_valid_match:
    add x20, x13, #4                        // match_end = match_begin+4 (already confirmed the first 4 bytes)
    sub x21, x20, x19                       //   ref_end = match_end - dist
L_found_valid_match_expand_forward_loop:
    ldr x22, [x20], #8                      // load match_current_8_bytes (safe to load because of safety margin)
    ldr x23, [x21], #8                      // load   ref_current_8_bytes
    cmp x22, x23
    b.ne L_found_valid_match_expand_forward_partial
    cmp x20, x12                            // check if match_end reached src_end - safety_margin
    b.lo L_found_valid_match_expand_forward_loop
    b L_found_valid_match_expand_backward
L_found_valid_match_expand_forward_partial:
    sub  x20, x20, #8                       // revert match_end by 8 and compute actual match of current 8 bytes
    eor  x22, x22, x23                      // differing bytes have at least one bit set after the xor
    rbit x22, x22                           // reverse the bits so that clz counts from the lowest address byte
    clz  x22, x22                           // every equal leading byte contributes 8 to the count
    add  x20, x20, x22, lsr #3              // number of matching bytes is (clz result)>>3
L_found_valid_match_expand_backward:
    sub  x15, x13, x19                      // ref_begin = match_begin - dist
L_found_valid_match_expand_backward_loop:
    cmp  x13, x11                           // check if match_begin reached src (previous match end)
    ccmp x15, x3, #0xd, gt                  // check if   ref_begin reached src_begin
    b.le L_found_valid_match_emit_match
    ldrb w22, [x13, #-1]!                   // load the byte preceding match_begin
    ldrb w23, [x15, #-1]!                   // load the byte preceding ref_begin
    cmp w22, w23
    b.eq L_found_valid_match_expand_backward_loop
    add x13, x13, #1                        // revert x13, the last compare did not match

    // this block writes the match into dst
    // it writes the ML token [extra L tokens] [literals] <2byte dist> [extra M tokens]
    // it updates src & dst positions and progresses to L_search_next_available_match
    // x9 and x11 are only modified once the whole sequence has been written, so
    // every branch to L_done from here reports the last completed sequence.
L_found_valid_match_emit_match:
    sub  x21, x20, x13                       // match_length = match_end - match_begin
    sub  x21, x21, #4                        // match_length - 4 (first 4 bytes are guaranteed)
    sub  x22, x13, x11                       // literals_length = match_begin - src
    sub  x26, x10, x9                        // dst_remaining_space = dst_end - dst
    sub  x26, x26, x22                       // dst_remaining_space -= literals_length (may become negative)
    subs x26, x26, #3                        // dst_remaining_space -= 2_dist_bytes + L/M_token
    b.lt L_done                              // signed test: exit if dst is not sufficient

    and x23, x21, #0xf                       // store M 4 LSbits
    add x23, x23, x22, lsl #4                // add L 4 LSbits
    add x15, x9, #1                          // tmp_dst = dst + 1
    mov x14, x11                             // literal read cursor (x14 is recomputed before its next use)
    cmp x22, #15                             // if L >= 15 need to write more L tokens
    b.lo L_found_valid_match_copy_literals
    orr x23, x23, #0xf0                      // update L/M token to be 0xfM
    sub x24, x22, #15                        // reduce 15 from number_of_literals
    subs x26, x26, #1                        // check if there is space for the extra L token
    b.lo L_done
    cmp x24, #255                            // check if need to compute number of 255 tokens
    b.lo L_found_valid_match_skip_L_255_tokens
    umull x25, w24, w28                      // x25 - (literals_to_token * 1_DIV_255_magic_number)
    lsr   x25, x25, #39                      // x25 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
    subs  x26, x26, x25                      // check if there is sufficient space for the 255_tokens
    b.lo L_done
    mov x13, #255
    umsubl x24, w25, w13, x24                // x24 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
L_found_valid_match_L_255_tokens_loop:
    str q1, [x15], #16                       // store 16 255 tokens into tmp_dst. safe to store because dst has safety_margin
    subs x25, x25, #16                       // check if there are any 255 tokens left after current 16
    b.hi L_found_valid_match_L_255_tokens_loop
    add x15, x15, x25                        // revert tmp_dst if too many 255 tokens were written
L_found_valid_match_skip_L_255_tokens:
    strb w24, [x15], #1                      // write last L token
L_found_valid_match_copy_literals:
    ldr q0, [x14], #16                       // load  current 16 literals. (safe because src_end has safety margin)
    str q0, [x15], #16                       // store current 16 literals. (safe because dst_end has safety margin)
    subs x22, x22, #16
    b.gt L_found_valid_match_copy_literals
    add x15, x15, x22                        // revert tmp_dst if too many literals were written
    strh w19, [x15], #2                      // store dist bytes
    cmp x21, #15                             // if M >= 15 need to write more M tokens
    b.lo L_found_valid_match_finish_writing_match
    orr x23, x23, #0xf                       // update L/M token to be 0xLf
    sub x24, x21, #15                        // reduce 15 from match_length
    subs x26, x26, #1                        // check if there is space for the extra M token
    b.lo L_done
    cmp x24, #255                            // check if need to compute number of 255 tokens
    b.lo L_found_valid_match_skip_M_255_tokens
    umull x25, w24, w28                      // x25 - (match_length * 1_DIV_255_magic_number)
    lsr   x25, x25, #39                      // x25 - number_of_255_tokens = (match_length * 1_DIV_255_magic_number)>>39
    subs  x26, x26, x25                      // check if there is sufficient space for the 255_tokens
    b.lo L_done
    mov x13, #255
    umsubl x24, w25, w13, x24                // x24 - value_of_remainder_token = match_length - (number_of_255_tokens*255)
L_found_valid_match_M_255_tokens_loop:
    str q1, [x15], #16                       // store 16 255 tokens into tmp_dst. safe to store because dst has safety_margin
    subs x25, x25, #16                       // check if there are any 255 tokens left after current 16
    b.hi L_found_valid_match_M_255_tokens_loop
    add x15, x15, x25                        // revert tmp_dst if too many 255 tokens were written
L_found_valid_match_skip_M_255_tokens:
    strb w24, [x15], #1                      // write last M token
L_found_valid_match_finish_writing_match:
    strb w23, [x9]                           // store first token of match in dst
    mov  x9, x15                             // commit dst: last position written
    mov x11, x20                             // commit src: match_end (end of the encoded bytes)
    cmp x11, x12                             // check if src reached src_end - safety_margin
    ccmp x9, x10, #9, lt                     // check if dst reached dst_end - safety_margin
    b.ge L_trailing_literals
    b L_search_next_available_match
    // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    // attempted to hash three quad values from the end of each emitted match
    // this ended up being slower and giving less compression (???)
    // that block would set match_begin and pos for the next hash search and
    // compute the hash values for the last 3 bytes of the currently emitted match
    // only those hashes are needed because the other "quads" were hashed when the
    // original data was read.

L_try_next_matchs:
    add x13, x13, #1                         // move to next match
    add x14, x14, #1                         // update next match pos
    cmp x13, x12                             // check match_begin did not reach src_end - safety_margin
    b.lo L_hash_match

L_trailing_literals:
    // unless skip_final_literals is set
    // write the trailing bytes as literals
    // trailing bytes include the whole src (with the safety margin)
    // need to verify that the whole dst (with the safety margin) has sufficient space

    // skip_final_literals is a 32-bit int: only w6 is defined by the caller
    cbnz w6, L_done                          // if skip_final_literals is set skip writing them

    add  x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin
    subs x13, x12, x11                       // remaining_src
    b.eq L_done                              // finish if there are 0 trailing literals

    add x10, x10, #LZ4_GOFAST_SAFETY_MARGIN  // add safety_margin
    sub x14, x10, x9                         // remaining dst (dst_end - dst)
    sub x14, x14, #1                         // 1 byte is needed at least to write literals token
    subs x14, x14, x13                       // finish if dst cannot contain all remaining literals + 1 literals token
    b.le L_done                              // (room for the extra literals tokens is verified below)

    cmp  x13, #15
    b.lt L_trailing_literals_store_less_than_15_literals
    subs x14, x14, #1                        // 1-extra byte is needed for literals tokens
    b.mi L_done
    mov w15, #0xf0
    strb w15, [x9], #1                       // write literals first token (Important !!! if 255 tokens exist but dst is not sufficient need to revert dst by 1)
    sub  x15, x13, #15
    cmp  x15, #255
    b.lo L_trailing_literals_no_255_tokens
    umull x19, w15, w28                      // x19 - (literals_to_token * 1_DIV_255_magic_number)
    lsr   x19, x19, #39                      // x19 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
    subs  x14, x14, x19
    b.mi L_revert_x9_and_done
    mov x26, #255
    umsubl x15, w26, w19, x15                // x15 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
L_trailing_literals_write_16_255_tokens:
    str q1, [x9], #16                        // store 16 255 tokens each iteration (this is safe because there is space for 15 or more literals + remainder token)
    subs x19, x19, #16
    b.gt L_trailing_literals_write_16_255_tokens
    add x9, x9, x19                          // fixes dst to actual number of tokens (x19 might not be a multiple of 16)
L_trailing_literals_no_255_tokens:
    strb w15, [x9], #1                       // store remainder_token
    lsr  x14, x13, #4                        // number of whole 16-byte groups of literals left to be written
    tst  x14, x14
    b.eq L_trailing_literals_copy_less_than_16_literals
L_trailing_literals_copy_16_literals:
    ldr q0, [x11], #16                       // load current_16_literals
    str q0, [ x9], #16                       // *dst16++ = current_16_literals
    subs x14, x14, #1
    b.gt L_trailing_literals_copy_16_literals
    cmp x11, x12
    b.lo L_trailing_literals_copy_less_than_16_literals
    b L_done

L_trailing_literals_store_less_than_15_literals:
    lsl x14, x13, #4                         // literals_only_token is 0xL0 (where L is 4 bits)
    strb w14, [x9], #1                       // *dst++ = literals_only_token
L_trailing_literals_copy_less_than_16_literals:
    ldrb w13, [x11], #1                      // load current_literal
    strb w13, [ x9], #1                      // *dst++ = current_literal
    cmp x11, x12
    b.lo L_trailing_literals_copy_less_than_16_literals

    // this block updates the caller dst & src pointers and removes the frame
L_done:
    str  x9, [x0]
    str x11, [x2]

    ldp x27, x28, [sp], #16
    ldp x25, x26, [sp], #16
    ldp x23, x24, [sp], #16
    ldp x21, x22, [sp], #16
    ldp x19, x20, [sp], #16

    // clear frame
    ldp     fp, lr,    [sp], #16
    ret     lr

    // the first trailing-literals token was already stored: take it back
L_revert_x9_and_done:
    sub x9, x9, #1
    b L_done

    // literal pool read through adr at function entry
.p2align 2
L_constant:
.long LZ4_COMPRESS_HASH_MULTIPLY
.long 0x80808081
#endif
Commit	Line	Data
5ba3f43e A	1	/*
	2	* Copyright (c) 2016-2016 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	#include <vm/lz4_assembly_select.h>
	30	#include <vm/lz4_constants.h>
	31
	32	#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64
	33
	34	/* void lz4_encode_2gb(uint8_t ** dst_ptr,
	35	size_t dst_size,
	36	const uint8_t ** src_ptr,
	37	const uint8_t * src_begin,
	38	size_t src_size,
	39	lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
	40	int skip_final_literals) */
	41
	42	.globl _lz4_encode_2gb
	43
	44	#define dst_ptr x0
	45	#define dst_size x1
	46	#define src_ptr x2
	47	#define src_begin x3
	48	#define src_size x4
	49	#define hash_table x5
	50	#define skip_final_literals x6
	51
	52	.text
	53	.p2align 4
	54	_lz4_encode_2gb:
	55
	56	// esteblish frame
	57	stp fp, lr, [sp, #-16]!
	58	mov fp, sp
	59
	60	stp x19, x20, [sp, #-16]!
	61	stp x21, x22, [sp, #-16]!
	62	stp x23, x24, [sp, #-16]!
	63	stp x25, x26, [sp, #-16]!
	64	stp x27, x28, [sp, #-16]!
65
66	// constant registers
67	adr x7, L_constant
68	ldr w28, [x7, #4] // x28 = 0x80808081 (magic number to cmopute 1/255)
69	ldr w7, [x7] // x7 = LZ4_COMPRESS_HASH_MULTIPLY
70	mov x27, #-1 // x27 = 0xffffffffffffffff
71	dup.4s v1, w27 // q1 = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}
72
73
74	// x9 - is current dst
75	// x10 - dst_end - safety_margin
76	ldr x9, [x0] // dst
77	add x10, x9, x1 // dst_end
78	sub x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // dst_end - safety_margin
79	cmp x10, x9 // if dst_size < safety_margin abort
80	b.lt L_done
81
82	// x11 - is current src
83	// x12 - is src_end - safety margin
84	ldr x11, [x2] // src
85	add x12, x11, x4 // src_end
86	sub x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // src_end - safety_margin
87	cmp x12, x11 // if src_size < safety_margin skip to trailing_literals
88	b.lt L_trailing_literals
89
90
91	// this block search for the next available match
92	// set match_begin to current src (which is also where last match ended)
93	L_search_next_available_match:
94	mov x13, x11 // match_begin = src
95	sub x14, x13, x3 // match_postion = match_begin - src_begin
96
97	// compute hash value for the next 5 "quads"
98	// hash distance need to be 0 < D < 0x10000
99
100	L_hash_match:
101	ldr x15, [x13] // match_first_4_bytes
102	umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
103	lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
104	add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)
105
106	ldp w19, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos)
107	stp w14, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes)
108
109	add x26, x14, #1 // next_match pos
110	lsr x25, x15, #8 // next_match_first_4_bytes
111	umull x21, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
112	lsr w21, w21, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
113	add x21, x5, x21, lsl #3 // hash_table_entry ptr (hash + 8*index)
114
115	ldp w23, w24, [x21] // read entry values (w23 - pos, w24 - 4 bytes at pos)
116	stp w26, w25, [x21] // write entry values (w26 - next pos, w25 - next 4 bytes)
117
118	cmp w15, w22
119	b.ne L_try_next_match_0 // compare the 4 bytes to see if there is a match
120	sub w19, w14, w19 // x19 - match_dist (current_pos - match_pos)
121	cmp w19, #0x10000
122	ccmp w19, #0, #0xf, lo
123	b.eq L_try_next_match_0 // verify the 0 < dist < 0x10000
124	b L_found_valid_match
125
126	L_try_next_match_0:
127	add x13, x13, #1
128	add x14, x14, #1
129
130	add x26, x14, #1 // next_match pos
131	lsr x15, x15, #16 // next_match_first_4_bytes
132	umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
133	lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
134	add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)
135
136	ldp w21, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos)
137	stp w26, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes)
138
139	cmp w25, w24
140	b.ne L_try_next_match_1 // compare the 4 bytes to see if there is a match
141	sub w19, w14, w23 // x19 - match_dist (current_pos - match_pos)
142	cmp w19, #0x10000
143	ccmp w19, #0, #0xf, lo
144	b.eq L_try_next_match_1 // verify the 0 < dist < 0x10000
145	b L_found_valid_match
146
147	L_try_next_match_1:
148	add x13, x13, #1
149	add x14, x14, #1
150
151	add x26, x14, #1 // next_match pos
152	lsr x25, x15, #8 // next_match_first_4_bytes
153	umull x20, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
154	lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
155	add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)
156
157	ldp w23, w24, [x20] // read entry values (w23 - pos, w24 - 4 bytes at pos)
158	stp w26, w25, [x20] // write entry values (w26 - next pos, w25 - next 4 bytes)
159
160	cmp w15, w22
161	b.ne L_try_next_match_2 // compare the 4 bytes to see if there is a match
162	sub w19, w14, w21 // x19 - match_dist (current_pos - match_pos)
163	cmp w19, #0x10000
164	ccmp w19, #0, #0xf, lo
165	b.eq L_try_next_match_2 // verify the 0 < dist < 0x10000
166	b L_found_valid_match
167
168	L_try_next_match_2:
169	add x13, x13, #1
170	add x14, x14, #1
171
172	add x26, x14, #1 // next_match pos
173	lsr x15, x15, #16 // next_match_first_4_bytes
174	umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
175	lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
176	add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)
177
178	ldp w21, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos)
179	stp w26, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes)
180
181	cmp w25, w24
182	b.ne L_try_next_match_3 // compare the 4 bytes to see if there is a match
183	sub w19, w14, w23 // x19 - match_dist (current_pos - match_pos)
184	cmp w19, #0x10000
185	ccmp w19, #0, #0xf, lo
186	b.eq L_try_next_match_3 // verify the 0 < dist < 0x10000
187	b L_found_valid_match
188
189	L_try_next_match_3:
190	add x13, x13, #1
191	add x14, x14, #1
192
193	cmp w15, w22
194	b.ne L_try_next_matchs // compare the 4 bytes to see if there is a match
195	sub w19, w14, w21 // x19 - match_dist (current_pos - match_pos)
196	cmp w19, #0x10000
197	ccmp w19, #0, #0xf, lo
198	b.eq L_try_next_matchs // verify the 0 < dist < 0x10000
199	b L_found_valid_match
200
201	// this block exapnd the valid match as much as possible
202	// first it try to expand the match forward
203	// next it try to expand the match backword
204	L_found_valid_match:
205	add x20, x13, #4 // match_end = match_begin+4 (already confirmd the first 4 bytes)
206	sub x21, x20, x19 // ref_end = match_end - dist
207	L_found_valid_match_expand_forward_loop:
208	ldr x22, [x20], #8 // load match_current_8_bytes (safe to load becasue of safety margin)
209	ldr x23, [x21], #8 // load ref_current_8_bytes
210	cmp x22, x23
211	b.ne L_found_valid_match_expand_forward_partial
212	cmp x20, x12 // check if match_end reached src_end
213	b.lo L_found_valid_match_expand_forward_loop
214	b L_found_valid_match_expand_backward
215	L_found_valid_match_expand_forward_partial:
216	sub x20, x20, #8 // revert match_end by 8 and compute actual match of current 8 bytes
217	eor x22, x22, x23 // compare the bits using xor
218	rbit x22, x22 // revert the bits to use clz (the none equivalent bytes would have at least 1 set bit)
219	clz x22, x22 // after the revrse for every equal prefix byte clz would count 8
220	add x20, x20, x22, lsr #3 // add the actual number of matching bytes is (clz result)>>3
221	L_found_valid_match_expand_backward:
222	sub x15, x13, x19 // ref_begin = match_begin - dist
223	L_found_valid_match_expand_backward_loop:
224	cmp x13, x11 // check if match_begin reached src (previous match end)
225	ccmp x15, x3, #0xd, gt // check if ref_begin reached src_begin
226	b.le L_found_valid_match_emit_match
227	ldrb w22, [x13, #-1]! // load match_current_8_bytes (safe to load becasue of safety margin)
228	ldrb w23, [x15, #-1]! // load ref_current_8_bytes
229	cmp w22, w23
230	b.eq L_found_valid_match_expand_backward_loop
231	add x13, x13, #1 // revert x13, last compare didn't match
232
233	// This block writes the match into dst.
234	// It writes the L/M token, [extra L tokens], [literals], <2-byte dist>, [extra M tokens].
235	// It then updates the src & dst positions and proceeds to L_search_next_available_match. x26 tracks the remaining dst space as a signed value.
236	L_found_valid_match_emit_match:
237	sub x21, x20, x13 // match_length = match_end - match_begin
238	sub x21, x21, #4 // match_length - 4 (first 4 bytes are guaranteed)
239	sub x22, x13, x11 // literals_length = match_begin - src
240	sub x26, x10, x9 // dst_remaining_space = dst_end - dst
241	sub x26, x26, x22 // dst_remaining_space -= literals_length (negative when the literals alone do not fit)
242	subs x26, x26, #3 // dst_remaining_space -= 2_dist_bytes + L/M_token
243	b.lt L_done // exit if dst isn't sufficient (signed test, so a negative remainder is caught as well)
244
245	and x23, x21, #0xf // store M 4 LSbits
246	add x23, x23, x22, lsl #4 // add L 4 LSbits (only the low byte of x23 is stored as the token)
247	add x15, x9, #1 // tmp_dst = dst + 1 (the token byte at dst is written last)
248	cmp x22, #15 // if L >= 15 need to write more L tokens
249	b.lo L_found_valid_match_copy_literals
250	orr x23, x23, #0xf0 // update L/M token to be 0xfM
251	sub x24, x22, #15 // reduce 15 from number_of_literals
252	subs x26, x26, #1 // check if there is space for the extra L token (subs: the branch below needs fresh flags)
253	b.lt L_done
254	cmp x24, #255 // check if need to compute number of 255 tokens
255	b.lo L_found_valid_match_skip_L_255_tokens
256	umull x25, w24, w28 // x25 - (literals_to_token * 1_DIV_255_magic_number)
257	lsr x25, x25, #39 // x25 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
258	subs x26, x26, x25 // check if there is sufficient space for the 255_tokens
259	b.lt L_done
260	mov x13, #255 // x13 (match_begin) is no longer needed, reuse it as the constant 255
261	umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
262	L_found_valid_match_L_255_tokens_loop:
263	str q1, [x15], #16 // store 16 255 tokens into dst_tmp. safe to store because dst has safety_margin
264	subs x25, x25, #16 // check if there are any 255 tokens left after the current 16
265	b.hi L_found_valid_match_L_255_tokens_loop
266	add x15, x15, x25 // revert tmp_dst if too many 255 tokens were written (x25 <= 0 here)
267	L_found_valid_match_skip_L_255_tokens:
268	strb w24, [x15], #1 // write last L token
269	L_found_valid_match_copy_literals:
270	ldr q0, [x11], #16 // load current 16 literals. (safe because src_end has safety margin)
271	str q0, [x15], #16 // store current 16 literals. (safe because dst_end has safety margin)
272	subs x22, x22, #16
273	b.gt L_found_valid_match_copy_literals
274	add x15, x15, x22 // revert tmp_dst if too many literals were written (x22 <= 0 here)
275	strh w19, [x15], #2 // store dist bytes
276	cmp x21, #15 // if M >= 15 need to write more M tokens
277	b.lo L_found_valid_match_finish_writing_match
278	orr x23, x23, #0xf // update L/M token to be 0xLf
279	sub x24, x21, #15 // reduce 15 from match_length
280	subs x26, x26, #1 // check if there is space for the extra M token (subs: the branch below needs fresh flags)
281	b.lt L_done // NOTE(review): x11 was already advanced by the literal copy above when the M-token exits are taken - confirm callers tolerate that
282	cmp x24, #255 // check if need to compute number of 255 tokens
283	b.lo L_found_valid_match_skip_M_255_tokens
284	umull x25, w24, w28 // x25 - (match_length * 1_DIV_255_magic_number)
285	lsr x25, x25, #39 // x25 - number_of_255_tokens = (match_length * 1_DIV_255_magic_number)>>39
286	subs x26, x26, x25 // check if there is sufficient space for the 255_tokens
287	b.lt L_done
288	mov x13, #255
289	umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = match_length_to_token - (number_of_255_tokens*255)
290	L_found_valid_match_M_255_tokens_loop:
291	str q1, [x15], #16 // store 16 255 tokens into dst_tmp. safe to store because dst has safety_margin
292	subs x25, x25, #16 // check if there are any 255 tokens left after the current 16
293	b.hi L_found_valid_match_M_255_tokens_loop
294	add x15, x15, x25 // revert tmp_dst if too many 255 tokens were written (x25 <= 0 here)
295	L_found_valid_match_skip_M_255_tokens:
296	strb w24, [x15], #1 // write last M token
297	L_found_valid_match_finish_writing_match:
298	strb w23, [x9] // store first token of match in dst
299	mov x9, x15 // update dst to last position written
300	mov x11, x20 // update src to match_end (last byte that was encoded)
301	cmp x11, x12 // check if src reached src_end
302	ccmp x9, x10, #9, lt // if src < src_end compare dst with dst_end, otherwise force N=V=1 so that b.ge is taken
303	b.ge L_trailing_literals
304	b L_search_next_available_match
305	// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
306	// attempted to hash three quad values from the end of each emitted match
307	// this ended up being slower and giving less compression (???)
308	// this block sets match_begin and pos for the next hash search and
309	// computes the hash values for the last 3 bytes of the currently emitted match
310	// only need to compute these hashes because the other "quads" were hashed when the original
311	// data was read.
312
313	L_try_next_matchs:
314	add x13, x13, #1 // move match_begin to the next candidate byte
315	add x14, x14, #1 // update next match pos
316	cmp x13, x12 // check match_begin didn't reach src_end
317	b.lo L_hash_match // still below src_end: hash this position; otherwise fall through to L_trailing_literals
318
319	L_trailing_literals:
320	// unless skip_final_literals is set,
321	// write the trailing bytes as literals.
322	// trailing bytes include the whole remaining src (with the safety margin);
323	// need to verify that the whole dst (with the safety margin) has sufficient space
324
325	tst x6, x6
326	b.ne L_done // if skip_final_literals is set skip writing them
327
328	add x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin back to the src limit
329	subs x13, x12, x11 // x13 = remaining_src
330	b.eq L_done // finish if there are 0 trailing literals
331
332	add x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin back to the dst limit
333	sub x14, x10, x9 // remaining dst (dst_end - dst)
334	sub x14, x14, #1 // 1 byte is needed at least to write literals token
335	subs x14, x14, x13 // x14 = dst space left after the token and all remaining literals
336	b.le L_done // finish if dst can't contain all remaining literals + 1 literals token (signed test; also exits when exactly 0 bytes would be left)
337
338	cmp x13, #15
339	b.lt L_trailing_literals_store_less_than_15_literals
340	subs x14, x14, #1 // 1 extra byte is needed for the literals length token
341	b.mi L_done
342	mov w15, #0xf0
343	strb w15, [x9], #1 // write literals first token (Important !!! if 255 tokens exist but dst isn't sufficient dst is reverted by 1 in L_revert_x9_and_done)
344	sub x15, x13, #15 // literals_to_token = remaining_src - 15
345	cmp x15, #255
346	b.lo L_trailing_literals_no_255_tokens
347	umull x19, w15, w28 // x19 - (literals_to_token * 1_DIV_255_magic_number)
348	lsr x19, x19, #39 // x19 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
349	subs x14, x14, x19 // check there is room for the 255 tokens
350	b.mi L_revert_x9_and_done // not enough room: undo the 0xf0 token write and finish
351	mov x26, #255
352	umsubl x15, w26, w19, x15 // x15 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
353	L_tariling_literals_write_16_255_tokens:
354	str q1, [x9], #16 // store 16 255 tokens each iteration (this is safe because there is space for 15 or more literals + remainder token)
355	subs x19, x19, #16
356	b.gt L_tariling_literals_write_16_255_tokens
357	add x9, x9, x19 // fixes dst to actual number of tokens (x19 might not be a multiple of 16)
358	L_trailing_literals_no_255_tokens:
359	strb w15, [x9], #1 // store remainder_token
360	lsr x14, x13, #4 // x14 = number of whole 16-byte groups of literals left to be written
361	tst x14, x14
362	b.eq L_trailing_literals_copy_less_than_16_literals // no whole group: copy byte by byte
363	L_trailing_literals_copy_16_literals:
364	ldr q0, [x11], #16 // load current_16_literals
365	str q0, [ x9], #16 // *dst16++ = current_16_literals
366	subs x14, x14, #1
367	b.gt L_trailing_literals_copy_16_literals
368	cmp x11, x12 // any literals left after the whole 16-byte groups?
369	b.lo L_trailing_literals_copy_less_than_16_literals // yes: copy the rest byte by byte
370	b L_done
371
372	L_trailing_literals_store_less_than_15_literals:
373	lsl x14, x13, #4 // literals_only_token is 0xL0 (where L is 4 bits)
374	strb w14, [x9], #1 // *dst++ = literals_only_token
375	L_trailing_literals_copy_less_than_16_literals:
376	ldrb w13, [x11], #1 // load current_literal
377	strb w13, [ x9], #1 // *dst++ = current_literal
378	cmp x11, x12
379	b.lo L_trailing_literals_copy_less_than_16_literals // loop until src reaches x12, then fall through to L_done
380
381	// this block updates the caller's dst & src pointers and removes the frame
382	L_done:
383	str x9, [x0] // *dst_ptr = current dst
384	str x11, [x2] // *src_ptr = current src
385
386	ldp x27, x28, [sp], #16 // pop the saved register pairs x19-x28
387	ldp x25, x26, [sp], #16
388	ldp x23, x24, [sp], #16
389	ldp x21, x22, [sp], #16
390	ldp x19, x20, [sp], #16
391
392	// clear frame
393	ldp fp, lr, [sp], #16 // restore frame pointer and return address
394	ret lr
395
396	L_revert_x9_and_done:
397	sub x9, x9, #1 // undo the single token byte already written to dst before the space check failed
398	b L_done
399
400	.p2align 2 // align the constant table to 4 bytes
401	L_constant:
402	.long LZ4_COMPRESS_HASH_MULTIPLY // hash multiplier (macro defined outside this file; presumably vm/lz4_constants.h)
403	.long 0x80808081 // 1_DIV_255 magic number: (x * 0x80808081) >> 39 == x / 255; presumably the value held in w28 - the load is outside this section
404
405	#endif
406