#include <vm/lz4_assembly_select.h>
#if LZ4_ENABLE_ASSEMBLY_DECODE_X86_64

/*

int64_t lz4_decode_asm(
  uint8_t ** dst_ptr,       *dst_ptr points to next output byte to write
  uint8_t * dst_begin,      points to first valid output byte we can access, dst_begin <= dst
  uint8_t * dst_end,        "relaxed" end of output buffer (see below)
  const uint8_t ** src_ptr, *src_ptr points to next input byte to read
  const uint8_t * src_end)  "relaxed" end of input buffer (see below)

We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant.
We never read before dst_begin.

Return 0 on success, -1 on failure
On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of an LZ4 instruction.

*/
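
/*
 Illustrative only: a minimal C caller sketch, assuming the prototype above.
 The buffer names and sizes (out, out_size, in, in_size) are hypothetical; per the
 "relaxed" end semantics above, the caller must tolerate accesses a small fixed
 distance past dst_end/src_end.

   uint8_t *dst = out;                  // next output byte to write
   const uint8_t *src = in;             // next input byte to read
   int64_t status = lz4_decode_asm(&dst, out, out + out_size,
                                   &src, in + in_size);
   // status == 0: success, status == -1: failure.
   // dst and src now point at the beginning of the last LZ4 command reached
   // in each buffer.
*/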

#if MSVC_CALLING_CONVENTIONS
#error TODO implement MSVC calling conventions for LZ4 x86_64 assembly
#endif

// %rax and %rbx are free to use

#define dst %rdi            // arg0
#define dst_begin %rsi      // arg1
#define dst_end %rdx        // arg2
#define src %rcx            // arg3
#define src_end %r8         // arg4

#define n_literals %r9
#define n_matches %r10

#define copy_src %r11       // match/literal copy source
#define copy_dst %r12       // match/literal copy destination
#define match_distance %r13 // match distance

#define src_good %r14
#define dst_good %r15

.globl _lz4_decode_asm

.macro establish_frame
push %rbp
mov %rsp,%rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
.endm

.macro clear_frame_and_return
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbx
pop %rbp
#ifdef __AVX2__
vzeroupper
#endif
ret
.endm

// copy_1x16 SOURCE_ADDR DESTINATION_ADDR
// Copy 16 bytes, clobber: xmm0
.macro copy_1x16
#ifdef __AVX2__
vmovdqu ($0),%xmm0
vmovdqu %xmm0,($1)
#else
movdqu ($0),%xmm0
movdqu %xmm0,($1)
#endif
.endm

// copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR
// Copy 16 bytes, and increment both addresses by 16, clobber: xmm0
.macro copy_1x16_and_increment
#ifdef __AVX2__
vmovdqu ($0),%xmm0
vmovdqu %xmm0,($1)
#else
movdqu ($0),%xmm0
movdqu %xmm0,($1)
#endif
add $$16,$0
add $$16,$1
.endm

// copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR
// Copy 2 times 16 bytes, and increment both addresses by 32, clobber: xmm0
.macro copy_2x16_and_increment
#ifdef __AVX2__
vmovdqu ($0),%xmm0
vmovdqu %xmm0,($1)
vmovdqu 16($0),%xmm0
vmovdqu %xmm0,16($1)
#else
movdqu ($0),%xmm0
movdqu %xmm0,($1)
movdqu 16($0),%xmm0
movdqu %xmm0,16($1)
#endif
add $$32,$0
add $$32,$1
.endm

// copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR
// Copy 32 bytes, and increment both addresses by 32, clobber: xmm0,xmm1
.macro copy_1x32_and_increment
#ifdef __AVX2__
vmovdqu ($0),%ymm0
vmovdqu %ymm0,($1)
#else
movdqu ($0),%xmm0
movdqu 16($0),%xmm1
movdqu %xmm0,($1)
movdqu %xmm1,16($1)
#endif
add $$32,$0
add $$32,$1
.endm

.macro check_src_end
cmp src,src_end
jbe L_done // done if src >= src_end
.endm

.macro check_dst_end
cmp dst,dst_end
jbe L_done // done if dst >= dst_end
.endm

.text
.p2align 6
_lz4_decode_asm:
establish_frame
push dst // keep uint8_t ** dst on stack
mov (dst),dst // load current dst from *dst
push src // keep const uint8_t ** src on stack
mov (src),src // load current src from *src

L_decode_command:
// Keep last known good command
mov dst,dst_good
mov src,src_good

// Check limits
check_src_end
check_dst_end

// Decode command
movzb (src),%rax // read command byte LLLLMMMM
add $1,src
mov %rax,n_literals
shr $4,n_literals // n_literals in 0..15
mov %rax,n_matches
and $0xf,n_matches
add $4,n_matches // n_matches in 4..19
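// Example (illustrative): command byte 0x52 gives n_literals = 0x5 = 5
// and n_matches = 0x2 + 4 = 6.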

// Short literal?
cmp $15,n_literals
je L_decode_long_literal

// Copy literals, n_literals <= 14: copy 16 bytes
L_copy_short_literal:
copy_1x16 src,dst
add n_literals,src // src += n_literals
add n_literals,dst // dst += n_literals
jmp L_expand_match // continue to match

// the number of literals is encoded on more bytes, we need to decode them
L_decode_long_literal:
check_src_end // required here, since we may loop an arbitrarily high number of times
movzb (src),%rax
add $1,src
add %rax,n_literals
cmp $255,%rax
je L_decode_long_literal
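// Example (illustrative): a literal nibble of 15 followed by extra bytes 255, 255, 10
// gives n_literals = 15 + 255 + 255 + 10 = 535; decoding stops at the first extra byte below 255.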

// Copy literals, n_literals >= 15
L_copy_long_literal:
mov src,copy_src // literal copy source
mov dst,copy_dst // literal copy destination
add n_literals,src // update src,dst for next step
add n_literals,dst
check_src_end // required here, since n_literals can be arbitrarily high
check_dst_end

// fixed + loop
copy_1x32_and_increment copy_src,copy_dst
copy_1x32_and_increment copy_src,copy_dst
L_copy_long_literal_loop:
copy_1x32_and_increment copy_src,copy_dst
cmp copy_dst,dst
ja L_copy_long_literal_loop
// continue to match

L_expand_match:
// Load match distance, and get match copy source
movzw (src),match_distance
add $2,src
test match_distance,match_distance
jz L_fail // match_distance == 0: FAIL
mov dst,copy_src
sub match_distance,copy_src // copy_src = match copy source
cmp copy_src,dst_begin
ja L_fail // dst_begin > copy_src: FAIL

// Long n_matches encoding?
cmp $19,n_matches
je L_decode_long_match // unlikely
// Long n_matches with short encoding (17 or 18)?
cmp $16,n_matches
ja L_long_match // unlikely

// Copy match, n_matches <= 16
L_copy_short_match:
cmp $16,match_distance
jb L_copy_short_match_overlap

// Copy match, n_matches <= 16 and match_distance >= 16: copy 16 bytes
copy_1x16 copy_src,dst
add n_matches,dst // update dst
jmp L_decode_command // to next command

// Copy match, n_matches <= 16 and match_distance < 16: replicate pattern
L_copy_short_match_overlap:
lea L_match_permtable(%rip),%rax
shl $5,match_distance
#ifdef __AVX2__
vmovdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0
vmovdqu %xmm0,(dst) // and store the result
#else
movdqa (%rax,match_distance),%xmm2 // pattern address is match_permtable + 32 * match_distance
movdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
pshufb %xmm2,%xmm0 // replicate the pattern in xmm0
movdqu %xmm0,(dst) // and store the result
#endif
add n_matches,dst // update dst
jmp L_decode_command // to next command
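// Example (illustrative): match_distance = 3 selects permtable row 3 (0,1,2,0,1,2,...),
// so the pshufb above expands the 3 bytes at copy_src into a 16-byte repetition of that pattern.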

// n_matches == 19: the number of matches is encoded on more bytes, we need to decode them
L_decode_long_match:
mov $255,%rbx
L_decode_long_match_loop:
check_src_end // required here, since we may loop an arbitrarily high number of times
mov (src),%rax
add $1,src
and %rbx,%rax
add %rax,n_matches
cmp %rbx,%rax
je L_decode_long_match_loop
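// Example (illustrative): with extra bytes 255, 3 this gives n_matches = 19 + 255 + 3 = 277.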

// n_matches > 16
L_long_match:
mov dst,copy_dst // copy_dst = match copy destination
add n_matches,dst // update dst
check_dst_end // n_matches may be arbitrarily high

cmp $16,match_distance
jb L_copy_long_match_overlap // match_distance < 16: overlapping copy

// Copy match, n_matches >= 16, match_distance >= 16
// fixed + loop
copy_1x16_and_increment copy_src,copy_dst
L_copy_long_match_loop:
copy_2x16_and_increment copy_src,copy_dst
cmp copy_dst,dst
ja L_copy_long_match_loop
jmp L_decode_command // to next command

// Copy match, n_matches >= 16, match_distance < 16: replicate pattern
L_copy_long_match_overlap:
lea L_match_permtable(%rip),%rax
mov match_distance,%rbx
shl $5,%rbx
#ifdef __AVX2__
vmovdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
vmovdqa %xmm0,%xmm1 // keep a copy for the high bytes
vmovdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes
vpshufb %xmm2,%xmm0,%xmm0 // replicate the pattern in xmm0
vmovdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
vpshufb %xmm2,%xmm1,%xmm1 // replicate the pattern in xmm1
vinserti128 $1,%xmm1,%ymm0,%ymm0 // store all 32 bytes into a single register
#else
movdqu (copy_src),%xmm0 // read the bytes to replicate. exactly match_distance bytes are needed, but we load 16
movdqa %xmm0,%xmm1 // keep a copy for the high bytes
movdqa (%rax,%rbx),%xmm2 // pattern for low 16 bytes
pshufb %xmm2,%xmm0 // replicate the pattern in xmm0
movdqa 16(%rax,%rbx),%xmm2 // pattern for high 16 bytes
pshufb %xmm2,%xmm1 // replicate the pattern in xmm1
#endif
// Here, %xmm0:%xmm1 (or %ymm0 for AVX2) is a 32-byte pattern replicating the first match_distance bytes up to 32 bytes
lea L_match_disttable(%rip),%rax
movzb (%rax,match_distance),%rax // and %rax is now the usable length of this pattern: a multiple of match_distance no larger than 32, taken from L_match_disttable.
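// Example (illustrative): match_distance = 5 gives a usable pattern length of 30
// (L_match_disttable[5]), so each 32-byte store below advances copy_dst by 30 bytes,
// keeping the stores phase-aligned with the repeating pattern.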

// fixed
#ifdef __AVX2__
vmovdqu %ymm0,(copy_dst)
#else
movdqu %xmm0,(copy_dst)
movdqu %xmm1,16(copy_dst)
#endif
add %rax,copy_dst
L_copy_long_match_overlap_loop:
// loop
#ifdef __AVX2__
vmovdqu %ymm0,(copy_dst)
#else
movdqu %xmm0,(copy_dst)
movdqu %xmm1,16(copy_dst)
#endif
add %rax,copy_dst
cmp copy_dst,dst
ja L_copy_long_match_overlap_loop
jmp L_decode_command // to next command

L_fail:
xor %rax,%rax
dec %rax // -1
jmp L_exit

L_done:
xor %rax,%rax
// continue to exit

L_exit:
pop src
mov src_good,(src)
pop dst
mov dst_good,(dst)
clear_frame_and_return

// permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15
// value(d)[i] = i%d for i = 0..31
.p2align 6
L_match_permtable:
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0
.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 1
.byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 // 2
.byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 // 3
.byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 // 4
.byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 // 5
.byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 // 6
.byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 // 7
.byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 // 8
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 // 9
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14
.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15

// valid repeating pattern size, for each match_distance = 0 to 15
// value(d) = 32 - (32%d), the largest multiple of d <= 32 (except d = 4 and d = 8, which use 16, a smaller multiple of d that also keeps the pattern aligned)
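// e.g. value(5) = 32 - (32 % 5) = 30, value(7) = 32 - (32 % 7) = 28, value(12) = 32 - (32 % 12) = 24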
.p2align 6
L_match_disttable:
.byte 32,32,32,30 // 0 .. 3
.byte 16,30,30,28 // 4 .. 7
.byte 16,27,30,22 // 8 .. 11
.byte 24,26,28,30 // 12 .. 15

#endif // LZ4_ENABLE_ASSEMBLY_DECODE_X86_64