git.saurik.com Git - apple/xnu.git/blame_incremental

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2016-2016 Apple Inc. All rights reserved.
	3	*
	4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
	5	*
	6	* This file contains Original Code and/or Modifications of Original Code
	7	* as defined in and that are subject to the Apple Public Source License
	8	* Version 2.0 (the 'License'). You may not use this file except in
	9	* compliance with the License. The rights granted to you under the License
	10	* may not be used to create, or enable the creation or redistribution of,
	11	* unlawful or unlicensed copies of an Apple operating system, or to
	12	* circumvent, violate, or enable the circumvention or violation of, any
	13	* terms of an Apple operating system software license agreement.
	14	*
	15	* Please obtain a copy of the License at
	16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
	17	*
	18	* The Original Code and all software distributed under the License are
	19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
	20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
	21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
	22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
	23	* Please see the License for the specific language governing rights and
	24	* limitations under the License.
	25	*
	26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
	27	*/
	28
	29	#include <vm/lz4_assembly_select.h>
	30	#include <vm/lz4_constants.h>
	31	#include <arm64/asm.h>
	32
	33	#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64
	34
	35	/* void lz4_encode_2gb(uint8_t ** dst_ptr,
	36	size_t dst_size,
	37	const uint8_t ** src_ptr,
	38	const uint8_t * src_begin,
	39	size_t src_size,
	40	lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
	41	int skip_final_literals) */
	42
	43	.globl _lz4_encode_2gb
	44
	45	#define dst_ptr x0
	46	#define dst_size x1
	47	#define src_ptr x2
	48	#define src_begin x3
	49	#define src_size x4
	50	#define hash_table x5
	51	#define skip_final_literals x6
	52
	53	.text
	54	.p2align 4
	// ---------------------------------------------------------------------
	// void lz4_encode_2gb(uint8_t ** dst_ptr, size_t dst_size,
	//                     const uint8_t ** src_ptr, const uint8_t * src_begin,
	//                     size_t src_size,
	//                     lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
	//                     int skip_final_literals)
	//
	// In:   x0 = dst_ptr, x1 = dst_size, x2 = src_ptr, x3 = src_begin,
	//       x4 = src_size, x5 = hash_table, w6 = skip_final_literals
	// Out:  *dst_ptr / *src_ptr are rewritten with the dst / src cursors
	//       reached when encoding stopped (L_done).
	//
	// Register roles:
	//   x7        LZ4_COMPRESS_HASH_MULTIPLY
	//   x9 / x10  dst cursor / dst_end - LZ4_GOFAST_SAFETY_MARGIN
	//   x11 / x12 src cursor / src_end - LZ4_GOFAST_SAFETY_MARGIN
	//   x13 / x14 match_begin / match position relative to src_begin
	//             (x14 doubles as the literal copy cursor while emitting)
	//   x19 / x20 match distance / match_end
	//   x28       0x80808081, so that (n * x28) >> 39 == n / 255 for 32-bit n
	//   q1        sixteen 0xff bytes (a run of 255 length tokens)
	// Saved and restored: x19-x28, fp, lr.  Scratch: x7, x9-x15, q0, q1, flags.
	// ---------------------------------------------------------------------
	_lz4_encode_2gb:

		// establish frame
		ARM64_STACK_PROLOG
		stp fp, lr, [sp, #-16]!
		mov fp, sp

		stp x19, x20, [sp, #-16]!
		stp x21, x22, [sp, #-16]!
		stp x23, x24, [sp, #-16]!
		stp x25, x26, [sp, #-16]!
		stp x27, x28, [sp, #-16]!

		// constant registers
		adr x7, L_constant
		ldr w28, [x7, #4] // x28 = 0x80808081 (magic number to compute 1/255)
		ldr w7, [x7] // x7 = LZ4_COMPRESS_HASH_MULTIPLY
		mov x27, #-1 // x27 = 0xffffffffffffffff
		dup.4s v1, w27 // q1 = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}

		// Load both cursors before the first possible exit: L_done writes
		// x9 and x11 back to the caller, so both must be valid by then.
		ldr x9, [x0] // x9 = dst
		ldr x11, [x2] // x11 = src

		// x10 = dst_end - safety_margin
		add x10, x9, x1 // dst_end
		sub x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // dst_end - safety_margin
		cmp x10, x9 // if dst_size < safety_margin abort
		b.lt L_done

		// x12 = src_end - safety_margin
		add x12, x11, x4 // src_end
		sub x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // src_end - safety_margin
		cmp x12, x11 // if src_size < safety_margin skip to trailing_literals
		b.lt L_trailing_literals


	// This block searches for the next available match.
	// match_begin starts at the current src (which is also where the last match ended).
	L_search_next_available_match:
		mov x13, x11 // match_begin = src
		sub x14, x13, x3 // match_position = match_begin - src_begin

	// One 8-byte load provides the five overlapping 4-byte "quads" at
	// offsets 0..4; each is hashed, looked up and recorded in the table.
	// A candidate is accepted only when its distance D satisfies 0 < D < 0x10000.

	L_hash_match:
		ldr x15, [x13] // w15 = quad at offset 0 (upper bytes feed the next quads)
		umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
		lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
		add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)

		ldp w19, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos)
		stp w14, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes)

		add x26, x14, #1 // next_match pos
		lsr x25, x15, #8 // w25 = quad at offset 1
		umull x21, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
		lsr w21, w21, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
		add x21, x5, x21, lsl #3 // hash_table_entry ptr (hash + 8*index)

		ldp w23, w24, [x21] // read entry values (w23 - pos, w24 - 4 bytes at pos)
		stp w26, w25, [x21] // write entry values (w26 - next pos, w25 - next 4 bytes)

		cmp w15, w22
		b.ne L_try_next_match_0 // compare the 4 bytes to see if there is a match
		sub w19, w14, w19 // w19 - match_dist (current_pos - match_pos)
		cmp w19, #0x10000
		ccmp w19, #0, #0xf, lo // dist >= 0x10000 forces Z=1, otherwise Z = (dist == 0)
		b.eq L_try_next_match_0 // verify that 0 < dist < 0x10000
		b L_found_valid_match

	L_try_next_match_0:
		add x13, x13, #1
		add x14, x14, #1

		add x26, x14, #1 // next_match pos
		lsr x15, x15, #16 // w15 = quad at offset 2
		umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
		lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
		add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)

		ldp w21, w22, [x20] // read entry values (w21 - pos, w22 - 4 bytes at pos)
		stp w26, w15, [x20] // write entry values (w26 - next pos, w15 - next 4 bytes)

		cmp w25, w24
		b.ne L_try_next_match_1 // compare the 4 bytes to see if there is a match
		sub w19, w14, w23 // w19 - match_dist (current_pos - match_pos)
		cmp w19, #0x10000
		ccmp w19, #0, #0xf, lo
		b.eq L_try_next_match_1 // verify that 0 < dist < 0x10000
		b L_found_valid_match

	L_try_next_match_1:
		add x13, x13, #1
		add x14, x14, #1

		add x26, x14, #1 // next_match pos
		lsr x25, x15, #8 // w25 = quad at offset 3
		umull x20, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
		lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
		add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)

		ldp w23, w24, [x20] // read entry values (w23 - pos, w24 - 4 bytes at pos)
		stp w26, w25, [x20] // write entry values (w26 - next pos, w25 - next 4 bytes)

		cmp w15, w22
		b.ne L_try_next_match_2 // compare the 4 bytes to see if there is a match
		sub w19, w14, w21 // w19 - match_dist (current_pos - match_pos)
		cmp w19, #0x10000
		ccmp w19, #0, #0xf, lo
		b.eq L_try_next_match_2 // verify that 0 < dist < 0x10000
		b L_found_valid_match

	L_try_next_match_2:
		add x13, x13, #1
		add x14, x14, #1

		add x26, x14, #1 // next_match pos
		lsr x15, x15, #16 // w15 = quad at offset 4
		umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
		lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index
		add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)

		ldp w21, w22, [x20] // read entry values (w21 - pos, w22 - 4 bytes at pos)
		stp w26, w15, [x20] // write entry values (w26 - next pos, w15 - next 4 bytes)

		cmp w25, w24
		b.ne L_try_next_match_3 // compare the 4 bytes to see if there is a match
		sub w19, w14, w23 // w19 - match_dist (current_pos - match_pos)
		cmp w19, #0x10000
		ccmp w19, #0, #0xf, lo
		b.eq L_try_next_match_3 // verify that 0 < dist < 0x10000
		b L_found_valid_match

	L_try_next_match_3:
		add x13, x13, #1
		add x14, x14, #1

		cmp w15, w22
		b.ne L_try_next_matchs // compare the 4 bytes to see if there is a match
		sub w19, w14, w21 // w19 - match_dist (current_pos - match_pos)
		cmp w19, #0x10000
		ccmp w19, #0, #0xf, lo
		b.eq L_try_next_matchs // verify that 0 < dist < 0x10000
		b L_found_valid_match

	// This block expands the valid match as much as possible:
	// first forward, 8 bytes at a time, then backward, 1 byte at a time.
	L_found_valid_match:
		add x20, x13, #4 // match_end = match_begin+4 (already confirmed the first 4 bytes)
		sub x21, x20, x19 // ref_end = match_end - dist
	L_found_valid_match_expand_forward_loop:
		ldr x22, [x20], #8 // load match_current_8_bytes (relies on the src safety margin)
		ldr x23, [x21], #8 // load ref_current_8_bytes
		cmp x22, x23
		b.ne L_found_valid_match_expand_forward_partial
		cmp x20, x12 // check if match_end reached src_end - safety_margin
		b.lo L_found_valid_match_expand_forward_loop
		b L_found_valid_match_expand_backward
	L_found_valid_match_expand_forward_partial:
		sub x20, x20, #8 // revert match_end by 8 and compute actual match of current 8 bytes
		eor x22, x22, x23 // differing bytes become non-zero
		rbit x22, x22 // reverse the bits so clz counts from the lowest-addressed byte
		clz x22, x22 // every equal leading byte contributes 8 to the count
		add x20, x20, x22, lsr #3 // matching bytes = (clz result) >> 3
	L_found_valid_match_expand_backward:
		sub x15, x13, x19 // ref_begin = match_begin - dist
	L_found_valid_match_expand_backward_loop:
		cmp x13, x11 // check if match_begin reached src (previous match end)
		ccmp x15, x3, #0xd, gt // check if ref_begin reached src_begin
		b.le L_found_valid_match_emit_match
		ldrb w22, [x13, #-1]! // load the byte preceding match_begin
		ldrb w23, [x15, #-1]! // load the byte preceding ref_begin
		cmp w22, w23
		b.eq L_found_valid_match_expand_backward_loop
		add x13, x13, #1 // revert x13, last compare didn't match

	// This block writes the match into dst as:
	//   ML token [extra L tokens] [literals] <2 byte dist> [extra M tokens]
	// Only on success does it update src & dst and loop back to
	// L_search_next_available_match.  Every abort path leaves x9 and x11
	// untouched, so L_done reports the state before this match.
	L_found_valid_match_emit_match:
		sub x21, x20, x13 // match_length = match_end - match_begin
		sub x21, x21, #4 // match_length - 4 (first 4 bytes are guaranteed)
		sub x22, x13, x11 // literals_length = match_begin - src
		sub x26, x10, x9 // dst_remaining_space = dst_end - dst
		sub x26, x26, x22 // dst_remaining_space -= literals_length
		subs x26, x26, #3 // dst_remaining_space -= 2_dist_bytes + L/M_token
		b.lt L_done // signed test: the space may already be negative after the literals

		and x23, x21, #0xf // store M 4 LSbits
		add x23, x23, x22, lsl #4 // add L 4 LSbits
		add x15, x9, #1 // tmp_dst = dst + 1
		mov x14, x11 // tmp_src: copy literals through a scratch cursor so x11 survives an abort
		cmp x22, #15 // if L >= 15 need to write more L tokens
		b.lo L_found_valid_match_copy_literals
		orr x23, x23, #0xf0 // update L/M token to be 0xfM
		sub x24, x22, #15 // reduce 15 from number_of_literals
		subs x26, x26, #1 // check if there is space for the extra L token (must set flags)
		b.lo L_done
		cmp x24, #255 // check if need to compute number of 255 tokens
		b.lo L_found_valid_match_skip_L_255_tokens
		umull x25, w24, w28 // x25 - (literals_to_token * 1_DIV_255_magic_number)
		lsr x25, x25, #39 // x25 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
		subs x26, x26, x25 // check if there is sufficient space for the 255_tokens
		b.lo L_done
		mov x13, #255 // x13 (match_begin) is no longer needed; reuse as scratch
		umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
	L_found_valid_match_L_255_tokens_loop:
		str q1, [x15], #16 // store 16 255 tokens into tmp_dst (overshoot relies on the dst safety margin)
		subs x25, x25, #16 // check if there are any 255 tokens left after current 16
		b.hi L_found_valid_match_L_255_tokens_loop
		add x15, x15, x25 // revert tmp_dst if too many 255 tokens were written
	L_found_valid_match_skip_L_255_tokens:
		strb w24, [x15], #1 // write last L token
	L_found_valid_match_copy_literals:
		ldr q0, [x14], #16 // load current 16 literals (overshoot relies on the src safety margin)
		str q0, [x15], #16 // store current 16 literals (overshoot relies on the dst safety margin)
		subs x22, x22, #16
		b.gt L_found_valid_match_copy_literals
		add x15, x15, x22 // revert tmp_dst if too many literals were written
		strh w19, [x15], #2 // store dist bytes
		cmp x21, #15 // if M >= 15 need to write more M tokens
		b.lo L_found_valid_match_finish_writing_match
		orr x23, x23, #0xf // update L/M token to be 0xLf
		sub x24, x21, #15 // reduce 15 from match_length
		subs x26, x26, #1 // check if there is space for the extra M token (must set flags)
		b.lo L_done
		cmp x24, #255 // check if need to compute number of 255 tokens
		b.lo L_found_valid_match_skip_M_255_tokens
		umull x25, w24, w28 // x25 - (match_length * 1_DIV_255_magic_number)
		lsr x25, x25, #39 // x25 - number_of_255_tokens = (match_length * 1_DIV_255_magic_number)>>39
		subs x26, x26, x25 // check if there is sufficient space for the 255_tokens
		b.lo L_done
		mov x13, #255
		umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = match_length - (number_of_255_tokens*255)
	L_found_valid_match_M_255_tokens_loop:
		str q1, [x15], #16 // store 16 255 tokens into tmp_dst (overshoot relies on the dst safety margin)
		subs x25, x25, #16 // check if there are any 255 tokens left after current 16
		b.hi L_found_valid_match_M_255_tokens_loop
		add x15, x15, x25 // revert tmp_dst if too many 255 tokens were written
	L_found_valid_match_skip_M_255_tokens:
		strb w24, [x15], #1 // write last M token
	L_found_valid_match_finish_writing_match:
		strb w23, [x9] // store first token of match in dst
		mov x9, x15 // update dst to last position written
		mov x11, x20 // update src to match_end (everything before it is now encoded)
		cmp x11, x12 // check if src reached src_end - safety_margin
		ccmp x9, x10, #9, lt // check if dst reached dst_end - safety_margin
		b.ge L_trailing_literals
		b L_search_next_available_match
	// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
	// Historical note: hashing three more quad values from the end of each
	// emitted match was attempted here; it ended up being slower and gave
	// less compression (???), so it is not done.

	// None of the five quads matched: advance to the next position and
	// hash again, unless match_begin reached src_end - safety_margin.
	L_try_next_matchs:
		add x13, x13, #1 // move to next match
		add x14, x14, #1 // update next match pos
		cmp x13, x12 // check match_begin didn't reach src_end - safety_margin
		b.lo L_hash_match

	L_trailing_literals:
		// Unless skip_final_literals is set, write the trailing bytes as literals.
		// Trailing bytes include the whole remaining src (with the safety margin);
		// need to verify the whole dst (with the safety margin) has sufficient space.

		// skip_final_literals is an int: test only w6, since the upper half
		// of x6 is not guaranteed to be zero.
		cbnz w6, L_done // if skip_final_literals is set skip writing them

		add x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // x12 = src_end
		subs x13, x12, x11 // remaining_src
		b.eq L_done // finish if there are 0 trailing literals

		add x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // x10 = dst_end
		sub x14, x10, x9 // remaining dst (dst_end - dst)
		sub x14, x14, #1 // 1 byte is needed at least to write literals token
		subs x14, x14, x13 // finish if dst can't contain all remaining literals + 1 literals token
		b.le L_done // (the extra length tokens are checked below)

		cmp x13, #15
		b.lt L_trailing_literals_store_less_than_15_literals
		subs x14, x14, #1 // 1 extra byte is needed for the remainder length token
		b.mi L_done
		mov w15, #0xf0
		strb w15, [x9], #1 // write literals first token (Important: if the 255 tokens don't fit, dst is reverted by 1)
		sub x15, x13, #15
		cmp x15, #255
		b.lo L_trailing_literals_no_255_tokens
		umull x19, w15, w28 // x19 - (literals_to_token * 1_DIV_255_magic_number)
		lsr x19, x19, #39 // x19 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
		subs x14, x14, x19
		b.mi L_revert_x9_and_done
		mov x26, #255
		umsubl x15, w26, w19, x15 // x15 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
	L_tariling_literals_write_16_255_tokens:
		str q1, [x9], #16 // store 16 255 tokens each iteration (any overshoot lands in space reserved for the literals that follow)
		subs x19, x19, #16
		b.gt L_tariling_literals_write_16_255_tokens
		add x9, x9, x19 // fix dst to the actual number of tokens (x19 might not be a multiple of 16)
	L_trailing_literals_no_255_tokens:
		strb w15, [x9], #1 // store remainder_token
		lsr x14, x13, #4 // x14 = number of whole 16-byte groups of literals
		cbz x14, L_trailing_literals_copy_less_than_16_literals
	L_trailing_literals_copy_16_literals:
		ldr q0, [x11], #16 // load current_16_literals
		str q0, [x9], #16 // *dst16++ = current_16_literals
		subs x14, x14, #1
		b.gt L_trailing_literals_copy_16_literals
		cmp x11, x12
		b.lo L_trailing_literals_copy_less_than_16_literals
		b L_done

	L_trailing_literals_store_less_than_15_literals:
		lsl x14, x13, #4 // literals_only_token is 0xL0 (where L is 4 bits)
		strb w14, [x9], #1 // *dst++ = literals_only_token
	L_trailing_literals_copy_less_than_16_literals:
		ldrb w13, [x11], #1 // load current_literal
		strb w13, [x9], #1 // *dst++ = current_literal
		cmp x11, x12
		b.lo L_trailing_literals_copy_less_than_16_literals

	// This block writes the dst & src cursors back to the caller and removes the frame.
	L_done:
		str x9, [x0]
		str x11, [x2]

		ldp x27, x28, [sp], #16
		ldp x25, x26, [sp], #16
		ldp x23, x24, [sp], #16
		ldp x21, x22, [sp], #16
		ldp x19, x20, [sp], #16

		// clear frame
		ldp fp, lr, [sp], #16
		ARM64_STACK_EPILOG

	// Undo the 0xf0 token already stored by the trailing-literals path
	// when its 255 tokens turn out not to fit, then exit.
	L_revert_x9_and_done:
		sub x9, x9, #1
		b L_done
	401
	402	.p2align 2
	403	L_constant:
	404	.long LZ4_COMPRESS_HASH_MULTIPLY
	405	.long 0x80808081
	406
	407	#endif
	408

1

/*

2

3

*

4

* @APPLE_OSREFERENCE_LICENSE_HEADER_START@

5

*

6

* This file contains Original Code and/or Modifications of Original Code

7

* as defined in and that are subject to the Apple Public Source License

8

* Version 2.0 (the 'License'). You may not use this file except in

9

* compliance with the License. The rights granted to you under the License

10

* may not be used to create, or enable the creation or redistribution of,

11

* unlawful or unlicensed copies of an Apple operating system, or to

12

* circumvent, violate, or enable the circumvention or violation of, any

13

* terms of an Apple operating system software license agreement.

14

*

15

* Please obtain a copy of the License at

16

* http://www.opensource.apple.com/apsl/ and read it before using this file.

17

*

18

* The Original Code and all software distributed under the License are

19

* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER

20

* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,

21

* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,

22

* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.

23

* Please see the License for the specific language governing rights and

24

* limitations under the License.

25

*

26

* @APPLE_OSREFERENCE_LICENSE_HEADER_END@

27

*/

28

29

#include <vm/lz4_assembly_select.h>

30

#include <vm/lz4_constants.h>

31

#include <arm64/asm.h>

32

33

#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64

34

35

/* void lz4_encode_2gb(uint8_t ** dst_ptr,

36

size_t dst_size,

37

const uint8_t ** src_ptr,

38

const uint8_t * src_begin,

39

size_t src_size,

40

lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],

41

int skip_final_literals) */

42

43

.globl _lz4_encode_2gb

#define dst_ptr x0

#define dst_size x1

#define src_ptr x2

#define src_begin x3

#define src_size x4

#define hash_table x5

51

#define skip_final_literals x6

.text

.p2align 4

_lz4_encode_2gb:

// esteblish frame

ARM64_STACK_PROLOG

stp fp, lr, [sp, #-16]!

60

mov fp, sp

61

62

stp x19, x20, [sp, #-16]!

63

stp x21, x22, [sp, #-16]!

64

stp x23, x24, [sp, #-16]!

65

stp x25, x26, [sp, #-16]!

66

stp x27, x28, [sp, #-16]!

67

68

// constant registers

69

adr x7, L_constant

70

ldr w28, [x7, #4] // x28 = 0x80808081 (magic number to cmopute 1/255)

71

ldr w7, [x7] // x7 = LZ4_COMPRESS_HASH_MULTIPLY

72

mov x27, #-1 // x27 = 0xffffffffffffffff

73

dup.4s v1, w27 // q1 = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}

74

75

76

// x9 - is current dst

77

// x10 - dst_end - safety_margin

78

ldr x9, [x0] // dst

79

add x10, x9, x1 // dst_end

80

sub x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // dst_end - safety_margin

81

cmp x10, x9 // if dst_size < safety_margin abort

82

b.lt L_done

83

84

// x11 - is current src

85

// x12 - is src_end - safety margin

86

ldr x11, [x2] // src

87

add x12, x11, x4 // src_end

88

sub x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // src_end - safety_margin

89

cmp x12, x11 // if src_size < safety_margin skip to trailing_literals

90

b.lt L_trailing_literals

91

92

93

// this block search for the next available match

94

// set match_begin to current src (which is also where last match ended)

95

L_search_next_available_match:

96

mov x13, x11 // match_begin = src

97

sub x14, x13, x3 // match_postion = match_begin - src_begin

98

99

// compute hash value for the next 5 "quads"

100

// hash distance need to be 0 < D < 0x10000

101

102

L_hash_match:

103

ldr x15, [x13] // match_first_4_bytes

104

umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY

105

lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index

106

add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)

107

108

ldp w19, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos)

109

stp w14, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes)

110

111

add x26, x14, #1 // next_match pos

112

lsr x25, x15, #8 // next_match_first_4_bytes

113

umull x21, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY

114

lsr w21, w21, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index

115

add x21, x5, x21, lsl #3 // hash_table_entry ptr (hash + 8*index)

116

117

ldp w23, w24, [x21] // read entry values (w23 - pos, w24 - 4 bytes at pos)

118

stp w26, w25, [x21] // write entry values (w26 - next pos, w25 - next 4 bytes)

119

120

cmp w15, w22

121

b.ne L_try_next_match_0 // compare the 4 bytes to see if there is a match

122

sub w19, w14, w19 // x19 - match_dist (current_pos - match_pos)

123

cmp w19, #0x10000

124

ccmp w19, #0, #0xf, lo

125

b.eq L_try_next_match_0 // verify the 0 < dist < 0x10000

126

b L_found_valid_match

L_try_next_match_0:

add x13, x13, #1

add x14, x14, #1

add x26, x14, #1 // next_match pos

133

lsr x15, x15, #16 // next_match_first_4_bytes

134

umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY

135

lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index

136

add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)

137

138

ldp w21, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos)

139

stp w26, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes)

140

141

cmp w25, w24

142

b.ne L_try_next_match_1 // compare the 4 bytes to see if there is a match

143

sub w19, w14, w23 // x19 - match_dist (current_pos - match_pos)

144

cmp w19, #0x10000

145

ccmp w19, #0, #0xf, lo

146

b.eq L_try_next_match_1 // verify the 0 < dist < 0x10000

147

b L_found_valid_match

L_try_next_match_1:

add x13, x13, #1

add x14, x14, #1

add x26, x14, #1 // next_match pos

154

lsr x25, x15, #8 // next_match_first_4_bytes

155

umull x20, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY

156

lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index

157

add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)

158

159

ldp w23, w24, [x20] // read entry values (w23 - pos, w24 - 4 bytes at pos)

160

stp w26, w25, [x20] // write entry values (w26 - next pos, w25 - next 4 bytes)

161

162

cmp w15, w22

163

b.ne L_try_next_match_2 // compare the 4 bytes to see if there is a match

164

sub w19, w14, w21 // x19 - match_dist (current_pos - match_pos)

165

cmp w19, #0x10000

166

ccmp w19, #0, #0xf, lo

167

b.eq L_try_next_match_2 // verify the 0 < dist < 0x10000

168

b L_found_valid_match

L_try_next_match_2:

add x13, x13, #1

add x14, x14, #1

add x26, x14, #1 // next_match pos

175

lsr x15, x15, #16 // next_match_first_4_bytes

176

umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY

177

lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index

178

add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index)

179

180

ldp w21, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos)

181

stp w26, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes)

182

183

cmp w25, w24

184

b.ne L_try_next_match_3 // compare the 4 bytes to see if there is a match

185

sub w19, w14, w23 // x19 - match_dist (current_pos - match_pos)

186

cmp w19, #0x10000

187

ccmp w19, #0, #0xf, lo

188

b.eq L_try_next_match_3 // verify the 0 < dist < 0x10000

189

b L_found_valid_match

L_try_next_match_3:

add x13, x13, #1

add x14, x14, #1

cmp w15, w22

b.ne L_try_next_matchs // compare the 4 bytes to see if there is a match

197

sub w19, w14, w21 // x19 - match_dist (current_pos - match_pos)

198

cmp w19, #0x10000

199

ccmp w19, #0, #0xf, lo

200

b.eq L_try_next_matchs // verify the 0 < dist < 0x10000

201

b L_found_valid_match

202

203

// this block exapnd the valid match as much as possible

204

// first it try to expand the match forward

205

// next it try to expand the match backword

206

L_found_valid_match:

207

add x20, x13, #4 // match_end = match_begin+4 (already confirmd the first 4 bytes)

208

sub x21, x20, x19 // ref_end = match_end - dist

209

L_found_valid_match_expand_forward_loop:

210

ldr x22, [x20], #8 // load match_current_8_bytes (safe to load becasue of safety margin)

211

ldr x23, [x21], #8 // load ref_current_8_bytes

212

cmp x22, x23

213

b.ne L_found_valid_match_expand_forward_partial

214

cmp x20, x12 // check if match_end reached src_end

215

b.lo L_found_valid_match_expand_forward_loop

216

b L_found_valid_match_expand_backward

217

L_found_valid_match_expand_forward_partial:

218

sub x20, x20, #8 // revert match_end by 8 and compute actual match of current 8 bytes

219

eor x22, x22, x23 // compare the bits using xor

220

rbit x22, x22 // revert the bits to use clz (the none equivalent bytes would have at least 1 set bit)

221

clz x22, x22 // after the revrse for every equal prefix byte clz would count 8

222

add x20, x20, x22, lsr #3 // add the actual number of matching bytes is (clz result)>>3

223

L_found_valid_match_expand_backward:

224

sub x15, x13, x19 // ref_begin = match_begin - dist

225

L_found_valid_match_expand_backward_loop:

226

cmp x13, x11 // check if match_begin reached src (previous match end)

227

ccmp x15, x3, #0xd, gt // check if ref_begin reached src_begin

228

b.le L_found_valid_match_emit_match

229

ldrb w22, [x13, #-1]! // load match_current_8_bytes (safe to load becasue of safety margin)

230

ldrb w23, [x15, #-1]! // load ref_current_8_bytes

231

cmp w22, w23

232

b.eq L_found_valid_match_expand_backward_loop

233

add x13, x13, #1 // revert x13, last compare didn't match

234

235

// this block write the match into dst

236

// it write the ML token [extra L tokens] [literals] <2byte dist> [extar M tokens]

237

// it update src & dst positions and progress to L_search_next_available_match

238

L_found_valid_match_emit_match:

239

sub x21, x20, x13 // match_length - match_end - match_begin

240

sub x21, x21, #4 // match_length - 4 (first 4 bytes are guaranteed)

241

sub x22, x13, x11 // literals_length = match_begin - src // compute

242

sub x26, x10, x9 // dst_remaining_space = dst_end - dst

243

sub x26, x26, x22 // dst_remaining_space -= literals_length

244

subs x26, x26, #3 // dst_remaining_space -= 2_dist_bytes + L/M_token

245

b.lo L_done // exit if dst isn't sufficent

246

247

and x23, x21, #0xf // store M 4 LSbits

248

add x23, x23, x22, lsl #4 // add L 4 LSbits

249

add x15, x9, #1 // tmp_dst = dst + 1

250

cmp x22, #15 // if L >= 15 need to write more L tokens

251

b.lo L_found_valid_match_copy_literals

252

orr x23, x23, #0xf0 // update L/M token to be 0xfM

253

sub x24, x22, #15 // reduce 15 from number_of_literals

254

sub x26, x26, #1 // check if there is space for the extra L token

255

b.lo L_done

256

cmp x24, #255 // check if need to compute number of 255 tokens

257

b.lo L_found_valid_match_skip_L_255_tokens

258

umull x25, w24, w28 // x25 - (literals_to_token * 1_DIV_255_magic_number)

259

lsr x25, x25, #39 // x25 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39

260

subs x26, x26, x25 // check if there is sufficent space for the 255_tokens

261

b.lo L_done

262

mov x13, #255

263

umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)

264

L_found_valid_match_L_255_tokens_loop:

265

str q1, [x15], #16 // store 16 255 tokens into dst_tmp. safe to store because dst has safety_margin

266

subs x25, x25, #16 // check if there are any 255 token left after current 16

267

b.hi L_found_valid_match_L_255_tokens_loop

268

add x15, x15, x25 // revert tmp_dst if written too many 255 tokens.

269

L_found_valid_match_skip_L_255_tokens:

270

strb w24, [x15], #1 // write last L token

271

L_found_valid_match_copy_literals:

272

ldr q0, [x11], #16 // load current 16 literals. (safe becasue src_end has safety margin)

273

str q0, [x15], #16 // store current 16 literals. (safe becasue dst_end has safety margin)

274

subs x22, x22, #16

275

b.gt L_found_valid_match_copy_literals

276

add x15, x15, x22 // revert tmp_dst if written too many literals

277

strh w19, [x15], #2 // store dist bytes

278

cmp x21, #15 // if M >= 15 need to write more M tokens

279

b.lo L_found_valid_match_finish_writing_match

280

orr x23, x23, #0xf // update L/M token to be 0xLf

281

sub x24, x21, #15 // reduce 15 from match_length

282

sub x26, x26, #1 // check if there is space for the extra M token

283

b.lo L_done

284

cmp x24, #255 // check if need to compute number of 255 tokens

285

b.lo L_found_valid_match_skip_M_255_tokens

286

umull x25, w24, w28 // x25 - (match_length * 1_DIV_255_magic_number)

287

lsr x25, x25, #39 // x25 - number_of_255_tokens = (match_length * 1_DIV_255_magic_number)>>39

288

subs x26, x26, x25 // check if there is sufficent space for the 255_tokens

289

b.lo L_done

290

mov x13, #255

291

umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = literals_to_token - (match_length*255)

292

L_found_valid_match_M_255_tokens_loop:

293

str q1, [x15], #16 // store 16 255 tokens into dst_tmp. safe to store because dst has safety_margin

294

subs x25, x25, #16 // check if there are any 255 token left after current 16

295

b.hi L_found_valid_match_M_255_tokens_loop

296

add x15, x15, x25 // revert tmp_dst if written too many 255 tokens.

297

L_found_valid_match_skip_M_255_tokens:

298

strb w24, [x15], #1 // write last M token

299

L_found_valid_match_finish_writing_match:

300

strb w23, [x9] // store first token of match in dst

301

mov x9, x15 // update dst to last postion written

302

mov x11, x20 // update src to match_end (last byte that was encoded)

303

cmp x11, x12 // check if src reached src_end

304

ccmp x9, x10, #9, lt // check if dst reached dst_end

305

b.ge L_trailing_literals

306

b L_search_next_available_match

307

// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

308

// Attempted to hash three quad values from the end of each emitted match;
// this ended up being slower and giving less compression (???).
// This block sets match_begin and pos for the next hash search and
// computes the hash values for the last 3 bytes of the currently emitted match.
// Only these hashes need to be computed, because the other "quads" were hashed
// when the original data was read.

// Advance the candidate position by one byte and retry the hash lookup.
//   x13 - match_begin (candidate position in src)
//   x14 - position value that accompanies x13 ("next match pos")
//   x12 - src_end
// Falls through into L_trailing_literals once x13 reaches src_end.
L_try_next_matchs:

add x13, x13, #1 // move to next match

317

add x14, x14, #1 // update next match pos

318

cmp x13, x12 // check match_begin didn't reach src_end

b.lo L_hash_match // unsigned: x13 < src_end, keep searching

// ---------------------------------------------------------------------
// L_trailing_literals
//
// Unless skip_final_literals is set, write all remaining source bytes as
// one literals-only sequence. The remaining bytes include the source
// safety margin, and the space check is done against the whole
// destination (with its safety margin added back).
//
//   w6  - skip_final_literals (declared int: only the low 32 bits carry
//         the argument, so the test must use w6, not x6)
//   x9  - dst          x10 - dst_end (safety margin added back here)
//   x11 - src          x12 - src_end (safety margin added back here)
//   x13 - remaining_src (number of trailing literals)
//   x14 - dst space left after the token and the literals
//   x15 - literal-length value still to be encoded after the 0xf0 token
//   x19 - number of 255-valued length bytes
//   w28 - 1/255 magic multiplier (0x80808081 with >> 39)
//         NOTE(review): loaded outside this section - confirm
//   x26 - scratch
//
// Falls through into L_tariling_literals_write_16_255_tokens.
// ---------------------------------------------------------------------
L_trailing_literals:
    tst     w6, w6
    b.ne    L_done                              // skip_final_literals is set: do not write them

    add     x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin back to src_end
    subs    x13, x12, x11                       // remaining_src
    b.eq    L_done                              // finish if there are 0 trailing literals

    add     x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin back to dst_end
    sub     x14, x10, x9                        // remaining dst (dst_end - dst)
    sub     x14, x14, #1                        // at least 1 byte is needed for the literals token
    subs    x14, x14, x13                       // space left after the token and all literals
    b.le    L_done                              // finish if dst can't hold the literals + token
                                                // (room for extra length bytes is checked below)

    cmp     x13, #15
    b.lt    L_trailing_literals_store_less_than_15_literals
    subs    x14, x14, #1                        // 1 extra byte is needed for the remainder length byte
    b.mi    L_done
    mov     w15, #0xf0
    strb    w15, [x9], #1                       // write literals first token. Important: if 255 tokens
                                                // are needed but dst is too small, dst must be moved
                                                // back by 1 (see L_revert_x9_and_done)
    sub     x15, x13, #15                       // x15 - literals_to_token = remaining_src - 15
    cmp     x15, #255
    b.lo    L_trailing_literals_no_255_tokens
    umull   x19, w15, w28                       // x19 - literals_to_token * 1_DIV_255_magic_number
    lsr     x19, x19, #39                       // x19 - number_of_255_tokens = product >> 39
    subs    x14, x14, x19                       // check there is space for the 255 tokens
    b.mi    L_revert_x9_and_done                // not enough space: undo the 0xf0 token and finish
    mov     x26, #255
    umsubl  x15, w26, w19, x15                  // x15 - value_of_remainder_token =
                                                //       literals_to_token - number_of_255_tokens * 255

355

// Emit the 255-valued literal-length bytes of the trailing sequence,
// 16 per iteration, directly at dst (x9).
//   x19 - number of 255-valued bytes still owed (>= 1 on entry)
//   q1  - source of the 16 stored bytes; per the original comment these are
//         sixteen 255 tokens (NOTE(review): q1 is set up outside this view)
// Falls through into L_trailing_literals_no_255_tokens.
L_tariling_literals_write_16_255_tokens:

356

str q1, [x9], #16 // store 16 255-tokens each iteration (this is safe because there is space for 15 or more literals + remainder token)

357

subs x19, x19, #16 // account for the 16 bytes just written

358

b.gt L_tariling_literals_write_16_255_tokens // loop while 255-tokens are still owed

359

add x9, x9, x19 // fix dst to the actual number of tokens (x19 <= 0 here; the count might not be a multiple of 16)

360

// Store the last literal-length byte, then pick the copy strategy:
// whole 16-byte chunks first when remaining_src (x13) >= 16, otherwise go
// straight to the byte-by-byte copy.
L_trailing_literals_no_255_tokens:

361

strb w15, [x9], #1 // store remainder_token

362

lsr x14, x13, #4 // x14 = remaining_src / 16 = number of whole 16-byte chunks to copy

363

tst x14, x14

364

b.eq L_trailing_literals_copy_less_than_16_literals // no whole chunk: copy byte by byte

365

// Copy the trailing literals in 16-byte chunks.
//   x14 - number of whole 16-byte chunks (>= 1 on entry)
//   x11 - src, x9 - dst (both post-incremented by 16 per chunk)
// When chunks are exhausted, any leftover bytes (src < src_end) are copied
// one at a time; otherwise the function finishes.
L_trailing_literals_copy_16_literals:

366

ldr q0, [x11], #16 // load current_16_literals

367

str q0, [ x9], #16 // *dst16++ = current_16_literals

368

subs x14, x14, #1 // one chunk done

369

b.gt L_trailing_literals_copy_16_literals

370

cmp x11, x12 // any bytes left before src_end?

371

b.lo L_trailing_literals_copy_less_than_16_literals

372

b L_done

373

374

// Fewer than 15 trailing literals: the count fits in the high nibble of the
// token, so a single token byte 0xL0 is written and no extra length bytes
// are needed. Falls through into the byte-by-byte copy loop.
L_trailing_literals_store_less_than_15_literals:

375

lsl x14, x13, #4 // literals_only_token is 0xL0 (where L is the 4-bit literal count)

376

strb w14, [x9], #1 // *dst++ = literals_only_token

377

// Copy the remaining literals one byte at a time until src (x11) reaches
// src_end (x12). The loop body runs before the test, so it must only be
// entered with at least one byte left; every branch into it in this section
// is taken with src < src_end. Falls through into L_done.
L_trailing_literals_copy_less_than_16_literals:

378

ldrb w13, [x11], #1 // load current_literal

379

strb w13, [ x9], #1 // *dst++ = current_literal

380

cmp x11, x12

381

b.lo L_trailing_literals_copy_less_than_16_literals // unsigned: more bytes before src_end

382

383

// Common exit path: this block updates the caller's dst & src pointers and
// removes the frame.
//   x0 is dst_ptr and x2 is src_ptr (see the #defines at the top of the file);
//   the final dst (x9) and src (x11) positions are written back through them.
// Register pairs x27/x28 .. x19/x20 are then popped, followed by fp/lr.
// NOTE(review): the matching pushes are in the prologue, outside this view -
// confirm the order mirrors it.

L_done:

str x9, [x0] // *dst_ptr = dst

str x11, [x2] // *src_ptr = src

ldp x27, x28, [sp], #16 // restore saved registers, 16 bytes per pair

389

ldp x25, x26, [sp], #16

390

ldp x23, x24, [sp], #16

391

ldp x21, x22, [sp], #16

392

ldp x19, x20, [sp], #16

393

394

// clear frame

395

ldp fp, lr, [sp], #16 // restore frame pointer and link register

396

ARM64_STACK_EPILOG // macro from <arm64/asm.h>; presumably performs the function return - definition not visible here

397

398

// Reached from the trailing-literals path when the 0xf0 token has already
// been written at dst but there is not enough space for the 255-valued
// length bytes: step dst (x9) back over that token so it is not reported
// as output, then take the common exit.
L_revert_x9_and_done:

sub x9, x9, #1 // undo the single token byte

b L_done

// Constant pool: two 32-bit words, 4-byte aligned.
//   +0: LZ4_COMPRESS_HASH_MULTIPLY (multiplier for the hash; defined outside this file view)
//   +4: 0x80808081 = ceil(2^39 / 255); together with the ">> 39" used above it
//       yields n / 255 for a 32-bit n (the "1_DIV_255_magic_number")
// NOTE(review): the code that loads these words is outside this view.
.p2align 2

L_constant:

.long LZ4_COMPRESS_HASH_MULTIPLY

.long 0x80808081

#endif